In [None]:
import pandas as pd

# Read CSV files
try:
    df_person_log = pd.read_csv('LLMdata/Person_Log.csv')
    df_person_changelog = pd.read_csv('LLMdata/Person_changelog.csv')
    df_person_noisy = pd.read_csv('LLMdata/Person_noisy.csv')
    df_person_s = pd.read_csv('LLMdata/Person_s.csv')
    df_hc_s = pd.read_csv('LLMdata/HealthcarePersonnel_s.csv')
    df_hc_log = pd.read_csv('LLMdata/HealthcarePersonnel_changelog.csv')
    print("Successfully read all CSV files.")
except FileNotFoundError as e:
    print(f"Error reading CSV file: {e}. Please ensure the file exists in the correct path.")
except Exception as e:
    print(f"An error occurred while reading CSV files: {e}")



Successfully read all CSV files.


In [44]:
# Install rdflib if you don't have it already
# !pip install rdflib

import rdflib
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS, XSD

# Create an RDF graph
g = Graph()

# Parse the Turtle file
g.parse('src/KGs/persons_identifier_based.ttl', format='turtle')
print(f"Successfully parsed TTL file. Graph contains {len(g)} triples.")
    
# Get a list of all namespaces used in the file
print("\nNamespaces in the file:")
for ns_prefix, namespace in g.namespaces():
    print(f"{ns_prefix}: {namespace}")
    
# Define namespaces that are commonly used
SCHEMA = Namespace("http://schema.org/")
EX = Namespace("http://example.org/")
    
    


Successfully parsed TTL file. Graph contains 3555 triples.

Namespaces in the file:
owl: http://www.w3.org/2002/07/owl#
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs: http://www.w3.org/2000/01/rdf-schema#
xsd: http://www.w3.org/2001/XMLSchema#
xml: http://www.w3.org/XML/1998/namespace
schema: https://schema.org/


In [45]:
from rdflib import Graph, Namespace, RDF, URIRef

# Load the TTL graph
g = Graph()
g.parse("src/KGs/persons_identifier_based.ttl", format="ttl")

# Define schema.org namespace
SCHEMA = Namespace("https://schema.org/")

# Your subject URI
subject = URIRef("http://example.org/Person/Person1")

# Check if it's a schema:Person
if (subject, RDF.type, SCHEMA.Person) in g:
    print("Yes! This subject is a schema:Person.")
else:
    print("No, this subject is not explicitly typed as schema:Person.")

Yes! This subject is a schema:Person.


In [46]:
df_hc_log

Unnamed: 0,row,identifier,column,original,new,variation
0,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,email,ashleywilliams@healthcare.org,ashledwilliams@healthcare.org,Email Typo
1,1,c5b86ef6-50ae-4aad-9624-ad65684ba126,email,alexbaldwin@healthcare.org,alefbaldwin@healthcare.org,Email Typo
2,1,c5b86ef6-50ae-4aad-9624-ad65684ba126,jobTitle,Midwife,Midwief,Job Title Typo
3,4,2d5b4522-d0c3-40f1-86bf-2c63179414b6,email,reneejohnson@healthcare.org,rendejohnson@healthcare.org,Email Typo
4,4,2d5b4522-d0c3-40f1-86bf-2c63179414b6,jobTitle,Obstetrician,Obstterician,Job Title Typo
...,...,...,...,...,...,...
394,391,0968c472-d01d-485f-b550-034047988cbb,jobTitle,Otolaryngologist,Otolaryngologsit,Job Title Typo
395,392,caabf1a2-e31e-4d64-9f64-d38fbeeab4a5,jobTitle,Otolaryngologist,Otolaryngoligist,Job Title Typo
396,393,c0dff1c7-e523-4839-a25a-60ebaaf8d073,jobTitle,Gastroenterologist,Gastroneterologist,Job Title Typo
397,394,829aaeb0-9c85-412e-9d92-ce4d9fad50a7,email,gregoryjackson@healthcare.org,gregoryjaakson@healthcare.org,Email Typo


In [47]:
# Rename a single column


# Rename multiple columns
df_person_changelog = df_person_changelog.rename(columns={
    'identifier': 'original_id', 
    'column': 'field_name',
    'new': 'varied_value',
    'original': 'original_value',
    'variation': 'variation_type'
})

df_hc_log = df_hc_log.rename(columns={
    'identifier': 'original_id', 
    'column': 'field_name',
    'new': 'varied_value',
    'original': 'original_value',
    'variation': 'variation_type'
})

In [48]:
print(df_person_changelog['field_name'].unique())
print(df_hc_log['field_name'].unique())
print(df_person_log['omitted'].unique())

['knowsLanguage' 'birthDate' 'personName']
['email' 'jobTitle']
['birthDate' 'knowsLanguage' 'email' 'jobTitle']


In [49]:
df_person_changelog['entity_type'] = 'Person'
df_hc_log['entity_type'] = 'HealthcarePersonnel'

In [50]:
df_person_changelog

Unnamed: 0,row,original_id,field_name,original_value,varied_value,variation_type,entity_type
0,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,knowsLanguage,nl,Dutch,Language Representation Change,Person
1,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,birthDate,1965-05-23,23-05-1965,Birth Date Format Change,Person
2,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,personName,Ashley Williams,A. Williams,Name Abbreviation,Person
3,2,93515f5e-cfd3-42bf-8c31-47ab1f829322,personName,Christine Lewis,C. Lewis,Name Abbreviation,Person
4,2,93515f5e-cfd3-42bf-8c31-47ab1f829322,knowsLanguage,de,German,Language Representation Change,Person
...,...,...,...,...,...,...,...
524,392,caabf1a2-e31e-4d64-9f64-d38fbeeab4a5,knowsLanguage,nl,Dutch,Language Representation Change,Person
525,392,caabf1a2-e31e-4d64-9f64-d38fbeeab4a5,birthDate,1960-07-21,21-07-1960,Birth Date Format Change,Person
526,392,caabf1a2-e31e-4d64-9f64-d38fbeeab4a5,personName,Brandon Jackson,Jackson Brandon,Name Order Change,Person
527,393,c0dff1c7-e523-4839-a25a-60ebaaf8d073,personName,Carolyn Smith,C. Smith,Name Abbreviations,Person


In [51]:
df_hc_log

Unnamed: 0,row,original_id,field_name,original_value,varied_value,variation_type,entity_type
0,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,email,ashleywilliams@healthcare.org,ashledwilliams@healthcare.org,Email Typo,HealthcarePersonnel
1,1,c5b86ef6-50ae-4aad-9624-ad65684ba126,email,alexbaldwin@healthcare.org,alefbaldwin@healthcare.org,Email Typo,HealthcarePersonnel
2,1,c5b86ef6-50ae-4aad-9624-ad65684ba126,jobTitle,Midwife,Midwief,Job Title Typo,HealthcarePersonnel
3,4,2d5b4522-d0c3-40f1-86bf-2c63179414b6,email,reneejohnson@healthcare.org,rendejohnson@healthcare.org,Email Typo,HealthcarePersonnel
4,4,2d5b4522-d0c3-40f1-86bf-2c63179414b6,jobTitle,Obstetrician,Obstterician,Job Title Typo,HealthcarePersonnel
...,...,...,...,...,...,...,...
394,391,0968c472-d01d-485f-b550-034047988cbb,jobTitle,Otolaryngologist,Otolaryngologsit,Job Title Typo,HealthcarePersonnel
395,392,caabf1a2-e31e-4d64-9f64-d38fbeeab4a5,jobTitle,Otolaryngologist,Otolaryngoligist,Job Title Typo,HealthcarePersonnel
396,393,c0dff1c7-e523-4839-a25a-60ebaaf8d073,jobTitle,Gastroenterologist,Gastroneterologist,Job Title Typo,HealthcarePersonnel
397,394,829aaeb0-9c85-412e-9d92-ce4d9fad50a7,email,gregoryjackson@healthcare.org,gregoryjaakson@healthcare.org,Email Typo,HealthcarePersonnel


In [52]:

# Concatenate the two changelog dataframes
# df_person_changelog and df_hc_log already have the 'entity_type' column from previous cells
df_all_changelogs = pd.concat([df_person_changelog, df_hc_log], ignore_index=True)

In [53]:
df_all_changelogs

Unnamed: 0,row,original_id,field_name,original_value,varied_value,variation_type,entity_type
0,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,knowsLanguage,nl,Dutch,Language Representation Change,Person
1,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,birthDate,1965-05-23,23-05-1965,Birth Date Format Change,Person
2,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,personName,Ashley Williams,A. Williams,Name Abbreviation,Person
3,2,93515f5e-cfd3-42bf-8c31-47ab1f829322,personName,Christine Lewis,C. Lewis,Name Abbreviation,Person
4,2,93515f5e-cfd3-42bf-8c31-47ab1f829322,knowsLanguage,de,German,Language Representation Change,Person
...,...,...,...,...,...,...,...
923,391,0968c472-d01d-485f-b550-034047988cbb,jobTitle,Otolaryngologist,Otolaryngologsit,Job Title Typo,HealthcarePersonnel
924,392,caabf1a2-e31e-4d64-9f64-d38fbeeab4a5,jobTitle,Otolaryngologist,Otolaryngoligist,Job Title Typo,HealthcarePersonnel
925,393,c0dff1c7-e523-4839-a25a-60ebaaf8d073,jobTitle,Gastroenterologist,Gastroneterologist,Job Title Typo,HealthcarePersonnel
926,394,829aaeb0-9c85-412e-9d92-ce4d9fad50a7,email,gregoryjackson@healthcare.org,gregoryjaakson@healthcare.org,Email Typo,HealthcarePersonnel


In [54]:
import numpy as np # Import numpy for np.nan if pd.NA is not preferred for older pandas versions

# List to store the new rows to be added
new_rows_list = []

# Iterate through df_person_log to create new rows for omissions
for log_index, log_row in df_person_log.iterrows():
    original_id = log_row['person_uuid']
    field_name = log_row['omitted']  # This is the field that was omitted
    variation_type = 'omission'
    varied_value = ''  # As per requirement, varied_value should be empty

    entity_type = None
    original_value = pd.NA  # Use pandas' NA for missing original values

    if field_name in ['birthDate', 'knowsLanguage', 'personName']:
        entity_type = 'Person'
        # Get original value from df_person_s
        person_s_match = df_person_s[df_person_s['identifier'] == original_id]
        if not person_s_match.empty:
            if field_name in person_s_match.columns:
                original_value = person_s_match.iloc[0][field_name]
            else:
                # This case should ideally not happen if field_name is valid
                print(f"Warning: Field '{field_name}' not found in df_person_s columns for id {original_id}. Original value set to NA.")
        else:
            print(f"Warning: ID {original_id} not found in df_person_s for omitted field '{field_name}'. Original value set to NA.")

    elif field_name in ['email', 'jobTitle']:
        entity_type = 'HealthcarePersonnel'
        # Get original value from df_hc_s
        hc_s_match = df_hc_s[df_hc_s['identifier'] == original_id]
        if not hc_s_match.empty:
            if field_name in hc_s_match.columns:
                original_value = hc_s_match.iloc[0][field_name]
            else:
                # This case should ideally not happen if field_name is valid
                print(f"Warning: Field '{field_name}' not found in df_hc_s columns for id {original_id}. Original value set to NA.")
        else:
            print(f"Warning: ID {original_id} not found in df_hc_s for omitted field '{field_name}'. Original value set to NA.")
    else:
        # Handle cases where the omitted field is not one of the expected ones
        print(f"Warning: Unknown omitted field_name '{field_name}' for id {original_id}. Skipping this omission.")
        continue  # Skip this row

    # Only add the row if an entity_type was determined
    if entity_type is not None:
        new_row_dict = {
            'row': log_index,  # Using df_person_log index as the 'row' identifier
            'original_id': original_id,
            'field_name': field_name,
            'original_value': original_value,
            'varied_value': varied_value,
            'variation_type': variation_type,
            'entity_type': entity_type
        }
        new_rows_list.append(new_row_dict)

# Create a DataFrame from the list of new_rows
if new_rows_list:
    df_omissions = pd.DataFrame(new_rows_list)
    # Concatenate with the existing df_all_changelogs
    df_all_changelogs_v2 = pd.concat([df_all_changelogs, df_omissions], ignore_index=True)
    print(f"Added {len(df_omissions)} omission rows to df_all_changelogs.")
else:
    print("No omission rows to add or all omissions were skipped due to warnings.")

# Display the updated df_all_changelogs (optional)
# print(df_all_changelogs.tail())

Added 395 omission rows to df_all_changelogs.


In [59]:
df_Person_HcP = df_all_changelogs_v2.sort_values(by='row').reset_index(drop=True)
df_Person_HcP

Unnamed: 0,row,original_id,field_name,original_value,varied_value,variation_type,entity_type
0,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,knowsLanguage,nl,Dutch,Language Representation Change,Person
1,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,birthDate,1965-05-23,23-05-1965,Birth Date Format Change,Person
2,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,personName,Ashley Williams,A. Williams,Name Abbreviation,Person
3,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,email,ashleywilliams@healthcare.org,ashledwilliams@healthcare.org,Email Typo,HealthcarePersonnel
4,0,f0df3c2c-4e1a-4d07-97da-4ef05765c8e6,birthDate,1965-05-23,,omission,Person
...,...,...,...,...,...,...,...
1318,393,c0dff1c7-e523-4839-a25a-60ebaaf8d073,birthDate,1998-10-28,,omission,Person
1319,394,829aaeb0-9c85-412e-9d92-ce4d9fad50a7,jobTitle,Sleep Technician,Sleep Tehcnician,Job Title Typo,HealthcarePersonnel
1320,394,829aaeb0-9c85-412e-9d92-ce4d9fad50a7,email,gregoryjackson@healthcare.org,gregoryjaakson@healthcare.org,Email Typo,HealthcarePersonnel
1321,394,829aaeb0-9c85-412e-9d92-ce4d9fad50a7,personName,Gregory Jackson,G. Jackson,Name Abbreviations,Person


In [60]:
# Add the 'duplicate_id' column
# The 'row' column indicates the original index from df_person_log, which corresponds to Person1, Person2, etc.
# We add 1 to 'row' because the Person identifiers are 1-based (Person1, Person2, ...)
df_Person_HcP['duplicate_id'] = 'Person' + (df_Person_HcP['row'] + 1).astype(str)

# Display the DataFrame with the new column
print(df_Person_HcP.head())
print(df_Person_HcP.tail())

   row                           original_id     field_name  \
0    0  f0df3c2c-4e1a-4d07-97da-4ef05765c8e6  knowsLanguage   
1    0  f0df3c2c-4e1a-4d07-97da-4ef05765c8e6      birthDate   
2    0  f0df3c2c-4e1a-4d07-97da-4ef05765c8e6     personName   
3    0  f0df3c2c-4e1a-4d07-97da-4ef05765c8e6          email   
4    0  f0df3c2c-4e1a-4d07-97da-4ef05765c8e6      birthDate   

                  original_value                   varied_value  \
0                             nl                          Dutch   
1                     1965-05-23                     23-05-1965   
2                Ashley Williams                    A. Williams   
3  ashleywilliams@healthcare.org  ashledwilliams@healthcare.org   
4                     1965-05-23                                  

                   variation_type          entity_type duplicate_id  
0  Language Representation Change               Person      Person1  
1        Birth Date Format Change               Person      Person1  
2       

In [61]:
# Save the DataFrame to a CSV file
# The 'index=False' argument prevents pandas from writing the DataFrame index as a column in the CSV.
df_Person_HcP.to_csv('ground_truth_person.csv', index=False)

print("DataFrame saved to ground_truth_person.csv")

DataFrame saved to ground_truth_person.csv
