In [1]:
import pandas
import rdflib
import glob, os, time
from tqdm import tqdm

In [None]:
def construct_nmvw_2D_table(directory, output_dir="../nmvw_data"):
    """Collect person names and birth/death time-span bounds from Turtle files.

    Walks ``directory`` (sub-directories included), parses every ``*.ttl`` file
    with rdflib and runs one SPARQL query per file. The query selects each
    ``crm:E21_Person`` together with the name whose identifier node is typed
    ``<http://vocab.getty.edu/aat/300404670>`` and, when present, the begin/end
    bounds of its birth and death time-spans. One table row is produced per
    query result.

    The table is always written to ``person_names.csv`` and ``person_names.pkl``
    inside ``output_dir`` -- also when processing aborts with an exception, in
    which case the rows gathered so far are saved and the exception propagates.

    Parameters
    ----------
    directory : str
        Root folder to scan for ``.ttl`` files. Within each folder, files are
        processed in numeric order of the integer found after the last ``-``
        in the file name, in the dot-separated piece just before the extension
        (e.g. ``dump-12.ttl`` sorts as 12).
    output_dir : str, optional
        Existing folder that receives the CSV and pickle files. Defaults to
        ``"../nmvw_data"``, the location that was previously hard-coded.

    Returns
    -------
    None
        The result is only written to disk; its shape is printed.

    Raises
    ------
    ValueError
        If a ``.ttl`` file name does not contain the integer described above.
    Exception
        Any parse or query error raised by rdflib is re-raised after the
        partial table has been saved.
    """
    columns = ["nmvw_uri", "pref_label", "birth_begin_time", "birth_end_time", "death_begin_time", "death_end_time"]
    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end; concatenating a one-row frame per result is quadratic in the row count.
    records = []

    # The query is the same for every file, so it is built once up front.
    q = """
                        prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> 

                        SELECT ?s ?name ?birth_begin_time ?birth_end_time ?death_begin_time ?death_end_time WHERE{
                            ?s a crm:E21_Person .
                            ?s crm:P1_is_identified_by ?name_bnode .
                            ?name_bnode crm:P2_has_type <http://vocab.getty.edu/aat/300404670> .
                            ?name_bnode crm:P190_has_symbolic_content ?name .
                        OPTIONAL{
                            ?s crm:P98i_was_born ?birth_bNode .
                            ?birth_bNode a crm:E67_Birth .
                            ?birth_bNode crm:P4_has_time-span ?b_time.
                            ?b_time a crm:E52_Time-Span .
                            ?b_time crm:P82a_begin_of_the_begin ?birth_begin_time .
                            ?b_time crm:P82b_end_of_the_end ?birth_end_time .
                        }
                        OPTIONAL{
                            ?s crm:P100i_died_in ?death_bNode .
                            ?death_bNode a crm:E69_Death .
                            ?death_bNode crm:P4_has_time-span ?d_time.
                            ?d_time a crm:E52_Time-Span .
                            ?d_time crm:P82a_begin_of_the_begin ?death_begin_time .
                            ?d_time crm:P82b_end_of_the_end ?death_end_time .
                        }
                        }
                    """

    try:
        for subdir, _dirs, files in os.walk(directory):
            ttl_files = [f for f in files if f.endswith(".ttl")]
            if not ttl_files:
                # Nothing to parse here; avoids an empty progress bar per folder.
                continue
            ttl_files.sort(key=lambda s: int(s.split("-")[-1].split(".")[-2]))

            for file in tqdm(ttl_files):
                # Join with the folder os.walk is currently visiting, not the
                # root, so files in nested folders are opened at their real path.
                g = rdflib.Graph().parse(os.path.join(subdir, file))

                if len(g) == 0:
                    continue

                for row in g.query(q):
                    records.append([row['s'], row['name'], row['birth_begin_time'], row['birth_end_time'], row['death_begin_time'], row['death_end_time']])

    finally:
        # Runs on success and on failure, so partial progress is never lost.
        nmvw_2D_table = pandas.DataFrame(records, columns=columns)
        print(nmvw_2D_table.shape)
        nmvw_2D_table.to_csv(os.path.join(output_dir, 'person_names.csv'))
        nmvw_2D_table.to_pickle(os.path.join(output_dir, 'person_names.pkl'))
        

In [None]:
def count_person(file):
    """Print the number of data rows in the CSV file at ``file``.

    The file is loaded with ``pandas.read_csv``; each row counts as one
    instance. Nothing is returned.
    """
    person_table = pandas.read_csv(file)
    row_total = person_table.shape[0]
    print(f"Total human instance found: {row_total}")

In [None]:
# Build the person table from the folder of Turtle files.
# NOTE(review): this is an absolute, machine-specific path -- consider a
# relative or configurable location so the notebook runs on other machines.
ttl_source_dir = "/Users/sarah_shoilee/PycharmProjects/entity_linking/nmvw_data/ccrdfconst"
construct_nmvw_2D_table(ttl_source_dir)

In [28]:
# Report how many rows the exported person CSV contains.
person_csv_path = "../nmvw_data/person_names.csv"
count_person(person_csv_path)

Total human instance found: 39567


In [29]:
# Load the pickled person table; later cells refer to it as `df`.
person_pickle_path = "../nmvw_data/person_names.pkl"
df = pandas.read_pickle(person_pickle_path)

In [30]:

# Compare the total row count with the number of distinct person URIs;
# equal numbers mean no URI occurs on more than one row.
row_count = df.shape[0]
unique_person_count = df['nmvw_uri'].nunique()
print(f"Number of rows:{row_count} and number of unique person: {unique_person_count}")

Number of rows:39567 and number of unique person: 39567


In [31]:
# Keep only rows where all four birth/death bounds are present.
# The frame is modified in place, so `df` itself shrinks from here on.
date_columns = [
    'birth_begin_time',
    'birth_end_time',
    'death_begin_time',
    'death_end_time',
]
df.dropna(subset=date_columns, inplace=True)

In [32]:
# Show (rows, columns) of the table after the incomplete rows were removed.
table_shape = df.shape
print(f"Shape after dropiping none values: {table_shape}")

Shape after dropiping none values: (3671, 6)
