In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.io as pio
import re
import dateutil.parser
from datetime import datetime
!pip install dateparser
import dateparser
import plotly


pic_csv = "/kaggle/input/challenge-1-datasets/challenge-1-Pictures-Creators-2024-11-29.csv"
mss_csv = "/kaggle/input/challenge-1-datasets/challenge-1-Manuscripts-Creators-2024-11-29.csv"

pic_df = pd.read_csv(pic_csv)
mss_df = pd.read_csv(mss_csv)


Collecting dateparser
  Downloading dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting tzlocal (from dateparser)
  Downloading tzlocal-5.2-py3-none-any.whl.metadata (7.8 kB)
Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tzlocal-5.2-py3-none-any.whl (17 kB)
Installing collected packages: tzlocal, dateparser
Successfully installed dateparser-1.2.0 tzlocal-5.2


## Data cleaning

Make the columns match between the two datasets

In [2]:
pic_col = pic_df.columns
mss_col = mss_df.columns

# rename the pictues columns to match the manuscripts data.
pic_df = pic_df.rename(columns={"Author":"creator", 
                                "Author (contributor)":"contributor",
                               "MMS Id": "mms_id",
                               "037 - Local Param 02":"037",
                               "Title (Complete)":"title",
                                "Publication Date":"date",
                               "300 - Local Param 05":"extent",
                               "Location Code":"852 MARC"})
pic_df['copyright'] = pic_df['540 - Local Param 09'] + pic_df['542 - Local Param 10']
pic_df.loc[:,'copyright'] = pic_df.copyright.str.replace("$$a ","").str.replace("item","work").str.replace("co*pyri?ght","copyright", regex=True).str.replace("$$l","").str.replace(".*(This work is in copyright).*","\\1",regex=True)
pic_df.loc[:,'copyright'] = pic_df.copyright.str.replace(".*(No copyright restrictions apply).*", "\\1",regex=True).str.replace(".*(Copyright undetermined).*","\\1",regex=True)
pic_df.loc[:,'copyright'] = pic_df.copyright.fillna("Unknown status")

mss_df.loc[:,'copyright'] = mss_df.copyright.str.replace("[cC]o*py?ri?ght","copyright", regex=True).str.replace("\\.$","",regex=True).str.replace("to the State","to State").str.replace("State Library of Victoria","State Library Victoria").str.replace(".*(This work is in copyright).*","\\1",regex=True).str.replace(".*(No copyright restrictions apply).*", "\\1",regex=True).str.replace(".*No part.*may be reproduced.*","This work is in copyright",regex=True).str.replace(".*N?n?ot to be reproduced.*","This work is in copyright",regex=True).str.replace(".*copyright.* assigned to State Library Victoria.*","Copyright assigned to State Library Victoria",regex=True).str.replace(".*copyright restrictions may apply.*","Copyright undetermined",regex=True)
mss_df.loc[:,'copyright'] = mss_df.copyright.str.replace("^copyright restrictions apply.*","This work is in copyright", regex=True).str.replace("This work is out of copyright","No copyright restrictions apply").str.replace("Restrictions on copying","This work is in copyright")
mss_df.loc[:,'copyright'] = mss_df.copyright.str.replace("No part to be reproduced in any way without the permission of the holding institution*","This work is in copyright", regex=True).str.replace(".*copyright .*with .*author.*","This work is in copyright", regex=True).str.replace("copyright owned by the State Library Victoria","Copyright assigned to State Library Victoria").str.replace("copyright held by the Australian War Memorial, Canberra","This work is in copyright")
mss_df.loc[:,'copyright'] = mss_df.copyright.str.replace(".*Tisdall.*", "This work is in copyright",regex=True).str.replace("Permission is granted for copying both for study and research and for publication","copyright undetermined")
mss_df.loc[:,'copyright'] = mss_df.copyright.fillna("Unknown status")

count_copyright_statuses = pic_df.copyright.unique()
pic_df.loc[:,'count_copyright']= pic_df.groupby('copyright')['copyright'].transform('count')
mss_df.loc[:,'count_copyright'] = mss_df.groupby('copyright')['copyright'].transform('count')

pic_copy_count_df = pic_df[['copyright','count_copyright']]
pic_copy_count_df = pic_copy_count_df.drop_duplicates()
pic_copy_count_df.loc[:,'collection'] = "Pictures"

mss_copy_count_df = mss_df[['copyright','count_copyright']]
mss_copy_count_df.loc[:,'collection'] = "Manuscripts"
mss_copy_count_df = mss_copy_count_df.drop_duplicates()

copyright_statuses = pd.concat([pic_copy_count_df,mss_copy_count_df])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mss_copy_count_df.loc[:,'collection'] = "Manuscripts"


In [None]:
fig = px.pie(copyright_statuses,values='count_copyright', color='copyright',facet_col='collection')
fig.show()

In [None]:
def parse_008_date(input):
    """Parse 008 from input date string. Converts all inclusive dates to questionable."""
    # Regex for stripping or determining a year value
    circa = "(?<!e)c\.|ca\.|circa|approx\.|approximately"
    year = "\d\d\d\d"

    ## Get dates from String
    years = re.findall(year, input)
    stripped_date = re.sub("\D", "", input)

    ## Try for detailed date
    detailed_date = dateparser.parse(
        re.sub(circa, "", input), languages=["en"], settings={"STRICT_PARSING": True}
    )
    if detailed_date is not None:
        return "e" + detailed_date.strftime("%Y%m%d")

    ## Try for other date types
    if len(stripped_date) == 4 and len(years) == 1:
        if len(re.sub("\W", "", input)) == 4:
            return "s" + stripped_date + "    "
        try:  ## get month.
            detailed_month = dateutil.parser.parse(re.sub("[\[\]\?]", "", input))
            return "e" + detailed_month.strftime("%Y%m  ")
        except Exception as e:
            try:
                detailed_month = datetime.strptime(input, "%b %Y").date()
            except Exception as e:
                return "s" + stripped_date + "    "
    elif len(stripped_date) == 3 and len(re.sub("\D", "", input)) == 3:
        return "s" + stripped_date + "u    "
    elif len(years) == 1 and len(stripped_date) <= 8:
        return "s" + years[0] + "    "
    elif len(stripped_date) == 8 or "between" in input or "or" in input:
        matches = re.findall(year, input)
        return "q" + "".join(matches)
    elif len(years) == 2 and len(stripped_date) > 8:
        return "q" + years[0] + years[1]
    else:
        return None

def parse_life_date(input):
    """Parse life date from input date string. Converts all inclusive dates to questionable."""
    # Regex for stripping or determining a year value
    circa = "(?<!e)c\.|ca\.|circa|approx\.|approximately"
    year = "\d\d\d\d"

    ## Get dates from String
    years = re.findall(year, input)
    stripped_date = re.sub("\D", "", input)

    ## Try for detailed date
    detailed_date = dateparser.parse(
        re.sub(circa, "", input), languages=["en"], settings={"STRICT_PARSING": True}
    )
    if detailed_date is not None:
        return "e" + detailed_date.strftime("%Y%m%d")

    ## Try for other date types
    if len(stripped_date) == 4 and len(years) == 1:
        if len(re.findall("\d\d\d\d\-", input)) == 1:
            return stripped_date +"-"
        elif len(re.findall("\-\d\d\d\d", input)) == 1:
            return "-" + stripped_date
        else:
            return None
            # return "s" + stripped_date + "    "
    elif len(stripped_date) == 3 and len(re.sub("\D", "", input)) == 3:
        return None
        # return "s" + stripped_date + "u    "
    elif len(years) == 1 and len(stripped_date) <= 8:
        return None
        # return "s" + years[0] + "    "
    elif len(stripped_date) == 8 or "between" in input or "or" in input:
        matches = re.findall(year, input)
        return "-".join(matches)
    elif len(years) == 2 and len(stripped_date) > 8:
        return years[0] + "-" + years[1]
    else:
        return None

In [None]:
print(parse_life_date('some name, 1921-1903'))

In [None]:
mss_df.loc[:,['life_dates']] = mss_df.creator.astype(str).apply(parse_life_date)
mss_df[['start', 'end']] = mss_df.life_dates.astype(str).str.split('-',n=1,expand=True)
mss_df.loc[:,['start']] = mss_df.start.replace("None",None)
mss_df.loc[:,['start']] = mss_df.start.str.strip()
mss_df.loc[:,['end']] = mss_df.end.str.strip()
mss_df.loc[:,['end']] = mss_df.end.replace("",None)
mss_df.loc[:,['end']] = mss_df.end.replace(".*\\-.*",None, regex=True)
mss_df.loc[:,['start']] = mss_df.start.replace("",None)

#df.mycol.fillna(value=np.nan, inplace=True)
mss_df.loc[:,['start']] = mss_df.start.fillna(value=-1)
mss_df.loc[:,['end']] = mss_df.end.fillna(value=-1)
mss_df.loc[:,['start']] = mss_df.start.astype(int)
mss_df.loc[:,['end']] = mss_df.end.astype(int)

mss_df.loc[:,['creator']]=mss_df.creator.str.replace("\\.$","",regex=True)
mss_df.loc[:,['creator_count']]=mss_df.groupby('creator')['creator'].transform('count')

mss_df = mss_df.sort_values(by=['start','end'])

In [None]:
fig = px.scatter(mss_df, x='end', y='start', color='copyright', hover_name='creator', range_x=[1800,2024], range_y=[1800,2024])

fig.show()

In [None]:
pic_df.loc[:,['life_dates']] = pic_df.creator.astype(str).apply(parse_life_date)
pic_df[['start', 'end']] = pic_df.life_dates.astype(str).str.split('-',n=1,expand=True)
pic_df.loc[:,['start']] = pic_df.start.replace("None",None)
pic_df.loc[:,['start']] = pic_df.start.str.strip()
pic_df.loc[:,['end']] = pic_df.end.str.strip()
pic_df.loc[:,['end']] = pic_df.end.replace("",None)
pic_df.loc[:,['end']] = pic_df.end.replace(".*\\-.*",None, regex=True)
pic_df.loc[:,['start']] = pic_df.start.replace("",None)

#df.mycol.fillna(value=np.nan, inplace=True)
pic_df.loc[:,['start']] = pic_df.start.fillna(value=-1)
pic_df.loc[:,['end']] = pic_df.end.fillna(value=-1)
pic_df.loc[:,['start']] = pic_df.start.astype(int)
pic_df.loc[:,['end']] = pic_df.end.astype(int)

pic_df = pic_df.sort_values(by=['start','end'])

In [None]:
fig = px.scatter(pic_df, x='end', y='start', color='copyright', hover_name='creator', range_x=[1800,2024], range_y=[1800,2024])

fig.show()

In [None]:
fig = px.scatter(pic_df, x='start', y='end', color='copyright', hover_name='creator', range_x=[1800,2024], range_y=[-2,2])

fig.show()

In [None]:
fig = px.scatter(mss_df, x='end', y='start', color='start', hover_name='creator', range_x=[1800,2024], range_y=[1800,2024])

fig.show()