In [1]:
import os
from lxml import etree
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, CSV
import sparql_dataframe
import requests
import uuid
import time
from tqdm import tqdm
import os
import pickle
from collections import Counter

In [8]:
def extract_elements(xml_file):
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    with open(xml_file, 'rb') as file:
        tree = etree.parse(file)
    
    elements = {
        'placeName': tree.xpath('/tei:TEI/tei:text/tei:body/tei:sp/tei:ab/tei:seg/tei:reg/tei:placeName', namespaces=namespaces),
        'persName': tree.xpath('/tei:TEI/tei:text/tei:body/tei:sp/tei:ab/tei:seg/tei:reg/tei:persName', namespaces=namespaces),
        'target': tree.xpath('//tei:ptr/@target', namespaces=namespaces),
        'author': tree.xpath('//tei:author/tei:persName/text()', namespaces=namespaces),
        'title': tree.xpath('//tei:title/text()', namespaces=namespaces),
        'pubPlace': tree.xpath('//tei:pubPlace/text()', namespaces=namespaces),
        'date': tree.xpath('//tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:bibl/tei:date/@when', namespaces=namespaces)
    }
    
    return tree, elements

In [9]:
def process_xml_folder(folder_path):
    data_place = []
    data_pers = []
    
    for filename in tqdm(os.listdir(folder_path), desc="Processing XML files"):
        if filename.endswith(".xml"):
            file_path = os.path.join(folder_path, filename)
            tree, elements = extract_elements(file_path)
            
            placeNames = [el.text for el in elements['placeName']]
            persNames = [el.text for el in elements['persName']]
            target = elements['target'][0] if elements['target'] else None
            author = elements['author'][0] if elements['author'] else None
            title = elements['title'][0] if elements['title'] else None
            pubPlace = elements['pubPlace'][0] if elements['pubPlace'] else None
            date = elements['date'][0] if elements['date'] else None
            
            placeName_counts = Counter(placeNames)
            persName_counts = Counter(persNames)
            
            for placeName in set(placeNames):
                data_place.append({
                    'target': target,
                    'title': title,
                    'pubPlace': pubPlace,
                    'date': date,
                    'author': author,
                    'place': placeName,
                    'placeName_num': placeName_counts[placeName]
                })
                
            for persName in set(persNames):
                data_pers.append({
                    'target': target,
                    'title': title,
                    'pubPlace': pubPlace,
                    'date': date,
                    'author': author,
                    'person': persName,
                    'persName_num': persName_counts[persName]
                })
    
    return data_place, data_pers

In [10]:
def create_dataframes(data_place, data_pers):
    df_place = pd.DataFrame(data_place)
    df_pers = pd.DataFrame(data_pers)
    
    df_place = df_place.drop_duplicates().sort_values(by='placeName_num', ascending=False)
    df_pers = df_pers.drop_duplicates().sort_values(by='persName_num', ascending=False)
    
    return df_place, df_pers

In [11]:
folder_path = '/Users/nicola/Documents/Academia/Projects/TextEnt/Processing/NER_test'

data_place, data_pers = process_xml_folder(folder_path)
df_place, df_pers = create_dataframes(data_place, data_pers)

Processing XML files: 100%|██████████| 5/5 [00:00<00:00, 229.09it/s]


In [12]:
df_place["place"] = df_place["place"].str.lower()
df_place.head()

Unnamed: 0,target,title,pubPlace,date,author,place,placeName_num
22,http://catalogue.bnf.fr/ark:/12148/cb45675147s,"La mort de Caton, ou L'illustre desespéré trag...",Grenoble,1648,"Auger, Jacques",rome,61
34,http://catalogue.bnf.fr/ark:/12148/cb313084529,"L'escolier de Salamanque, ou Les généreux enne...",Grenoble,1655,"Scarron, Paul",tolède,9
51,http://catalogue.bnf.fr/ark:/12148/cb393250582,"Faramond ou le Triomphe des Héros, tragicomédie",Grenoble,1672,Lapoujade,cologne,6
21,http://catalogue.bnf.fr/ark:/12148/cb45675147s,"La mort de Caton, ou L'illustre desespéré trag...",Grenoble,1648,"Auger, Jacques",pharsale,6
30,http://catalogue.bnf.fr/ark:/12148/cb313084529,"L'escolier de Salamanque, ou Les généreux enne...",Grenoble,1655,"Scarron, Paul",salamanque,4


In [13]:
df_pers["person"] = df_pers["person"].str.lower()
df_pers.head()

Unnamed: 0,target,title,pubPlace,date,author,person,persName_num
157,http://catalogue.bnf.fr/ark:/12148/cb45675147s,"La mort de Caton, ou L'illustre desespéré trag...",Grenoble,1648,"Auger, Jacques",césar,85
300,http://catalogue.bnf.fr/ark:/12148/cb313084529,"L'escolier de Salamanque, ou Les généreux enne...",Grenoble,1655,"Scarron, Paul",crispin,77
502,http://catalogue.bnf.fr/ark:/12148/cb393250582,"Faramond ou le Triomphe des Héros, tragicomédie",Grenoble,1672,Lapoujade,balamir,68
5,http://catalogue.bnf.fr/ark:/12148/cb38760411p,L'essay des filles nouvelle comédie en trois a...,Grenoble,1699,,pascariel,66
475,http://catalogue.bnf.fr/ark:/12148/cb393250582,"Faramond ou le Triomphe des Héros, tragicomédie",Grenoble,1672,Lapoujade,faramond,63


In [14]:
top_two_values = df_place['placeName_num'].nlargest(2).unique()
df_place_filter = df_place[df_place['placeName_num'].isin(top_two_values)]

In [15]:
df_place_filter['uuid'] = df_place_filter['place'].apply(lambda x: str(uuid.uuid5(uuid.NAMESPACE_DNS, x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_place_filter['uuid'] = df_place_filter['place'].apply(lambda x: str(uuid.uuid5(uuid.NAMESPACE_DNS, x)))


In [16]:
df_place_filter

Unnamed: 0,target,title,pubPlace,date,author,place,placeName_num,uuid
22,http://catalogue.bnf.fr/ark:/12148/cb45675147s,"La mort de Caton, ou L'illustre desespéré trag...",Grenoble,1648,"Auger, Jacques",rome,61,cbef6bfb-ce53-50a5-a154-fb3e13aa618e
34,http://catalogue.bnf.fr/ark:/12148/cb313084529,"L'escolier de Salamanque, ou Les généreux enne...",Grenoble,1655,"Scarron, Paul",tolède,9,962852b2-97e0-558b-b021-2148ee1823c1


In [35]:
#df_place.to_csv('/Users/carboni/Documents/Academia/Projects/TextEnt/output/df_place.csv', index=False)
#df_pers.to_csv('/Users/carboni/Documents/Academia/Projects/TextEnt/output/df_pers.csv', index=False)

In [36]:
def save_progress(results, filename='sparql_results_temp.csv'):
    temp_df = pd.DataFrame(results)
    temp_df.to_csv(filename, index=False)

def load_progress(filename='sparql_results_temp.csv'):
    if os.path.exists(filename):
        temp_df = pd.read_csv(filename)
        return temp_df.to_dict('records')
    return []

def load_query_cache(filename='query_cache.pkl'):
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
    return {}

def save_query_cache(cache, filename='query_cache.pkl'):
    with open(filename, 'wb') as f:
        pickle.dump(cache, f)

In [37]:
def query_wikidata(place):
    if place in query_cache:
        return query_cache[place]

    query_template = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX schema: <http://schema.org/>

    SELECT ?a ?name ?coordinates ?typeLabel ?culture ?start_time ?end_time ?partOf ?sitelinks ?wikipedia_id ?abstract ?roman_atlas_id ?pleiades_id ?topostext_id ?myths_id ?poleis_id ?manto_id   
    WHERE {{
      {{
        SELECT ?a (MAX(?sitelinks) AS ?maxSitelinks)
        WHERE {{
          VALUES ?category {{ wd:Q6256 wd:Q82794 }}
          ?a (wdt:P31)/((wdt:P279)*) ?category.
          ?a rdfs:label ?name .
          ?a ^schema:about/wikibase:sitelinks ?sitelinks .

          FILTER (LANG(?name) = "fr") .
          FILTER REGEX(STR(?name), "^{place}$", "i") .
        }}
        GROUP BY ?a
        ORDER BY DESC(?maxSitelinks)
        LIMIT 1
      }}

      ?a rdfs:label ?name .
      ?a wdt:P31 ?type .
      ?a wdt:P625 ?coordinates . 
      ?a ^schema:about/wikibase:sitelinks ?sitelinks .
     
      ?type rdfs:label ?typeLabel .
      ?wikipedia_id schema:about ?a.
      ?wikipedia_id schema:inLanguage "en" .
      ?wikipedia_id schema:description ?abstract .
    OPTIONAL {{
         ?a wdt:P2596 ?culture . 
         ?culture wdt:P580 ?start_time .
         ?culture wdt:P582 ?end_time .
      }} 
      
    OPTIONAL {{
        ?a wdt:P1584 ?pleiades_id
      }}
    OPTIONAL {{
        ?a wdt:P8068 ?topostext_id . 
      }}
    OPTIONAL {{
        ?a wdt:P361 ?partOf
      }}
    OPTIONAL {{
        ?a wdt:P1936 ?roman_atlas_id . 
      }}  
    OPTIONAL {{
        ?a wdt:P12402 ?myths_id . 
      }}  
    OPTIONAL {{
        ?a wdt:P8137 ?poleis_id .
      }}
    OPTIONAL {{
        ?a wdt:P9736 ?manto_id .
      }} 

      FILTER (LANG(?name) = "en") .
      FILTER (LANG(?typeLabel) = "en") .
    }}
    ORDER BY DESC(?sitelinks)
    """
    
    query = query_template.format(place=place)
    url = 'https://qlever.cs.uni-freiburg.de/api/wikidata'
    #url = 'http://10.194.68.72:7001' #internal unige
    response = requests.get(url, params={'query': query, 'output': 'json'})
    
    if response.status_code == 200:
        raw_results = response.json().get('results', {}).get('bindings', [])
        # Process results to ensure all values are strings
        processed_results = []
        for result in raw_results:
            processed_result = {key: str(value.get('value', '')) for key, value in result.items()}
            processed_results.append(processed_result)
        query_cache[place] = processed_results
        return processed_results
    else:
        query_cache[place] = None  # Cache failed queries as well
        return None

In [40]:
# Initialize or load query cache
query_cache = load_query_cache()

# Load existing results if any
results = load_progress()

# Get the set of already processed places
processed_places = set([entry['place'] for entry in results])

# Extract unique places from your DataFrame
unique_places_df = df_place_filter.drop_duplicates(subset='place')

# Initialize a counter for saving progress periodically
counter = 0
save_every_n = 10  # Save progress every N iterations

# Iterate over unique places
for index, row in tqdm(unique_places_df.iterrows(), total=len(unique_places_df), desc="Querying QLever"):
    place = row['place']
    target = row['target']
    uuid = row['uuid']
    
    if place in processed_places:
        continue
    
    sparql_results = query_wikidata(place)
    
    if sparql_results:
        for result in sparql_results:
            results.append({
                'place': place,
                'target': target,
                'uuid': uuid,
                'wikidata_id': result.get('a', ''),
                'name': result.get('name', ''),
                'coordinates': result.get('coordinates', ''),
                'typeLabel': result.get('typeLabel', ''),
                'culture': result.get('culture', ''),
                'start_time': result.get('start_time', ''),
                'end_time': result.get('end_time', ''),
                'partOf': result.get('partOf', ''),
                'sitelinks': result.get('sitelinks', ''),
                'wikipedia_id': result.get('wikipedia_id', ''),
                'abstract': result.get('abstract', ''),
                'roman_atlas_id': result.get('roman_atlas_id', ''),
                'pleiades_id': result.get('pleiades_id', ''),
                'topostext_id': result.get('topostext_id', ''),
                'myths_id': result.get('myths_id', ''),
                'poleis_id': result.get('poleis_id', ''),
                'manto_id': result.get('manto_id', '')
            })
    else:
        # Handle cases where there is no result
        results.append({
            'place': place,
            'uuid': uuid,
            'target': target,
            'wikidata_id': '',
            'name': '',
            'coordinates': '',
            'typeLabel': '',
            'culture': '',
            'start_time': '',
            'end_time': '',
            'partOf': '',
            'sitelinks': '',
            'wikipedia_id': '',
            'abstract': '',
            'roman_atlas_id': '',
            'pleiades_id': '',
            'topostext_id': '',
            'myths_id': '',
            'poleis_id': '',
            'manto_id': ''
        })
    
    processed_places.add(place)
    counter += 1
    
    # Save progress and cache every N iterations
    if counter % save_every_n == 0:
        save_progress(results)
        save_query_cache(query_cache)
        # Optionally, print a message
        # print(f"Saved progress after processing {counter} places.")
    
    # Optional: Remove or adjust sleep time if necessary
     time.sleep(1)

# After processing all places, save final progress and cache
save_progress(results)
save_query_cache(query_cache)

# Convert the results to a new DataFrame
new_df = pd.DataFrame(results)

Querying QLever: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]


In [41]:
new_df

Unnamed: 0,place,target,uuid,wikidata_id,name,coordinates,typeLabel,culture,start_time,end_time,partOf,sitelinks,wikipedia_id,abstract,roman_atlas_id,pleiades_id,topostext_id,myths_id,poleis_id,manto_id
0,tolède,http://catalogue.bnf.fr/ark:/12148/cb313084529,962852b2-97e0-558b-b021-2148ee1823c1,http://www.wikidata.org/entity/Q5836,Toledo,POINT(-4.033333 39.866667),municipality of Spain,,,,http://www.wikidata.org/entity/Q19942150,126,"https://en.wikipedia.org/wiki/Toledo,_Spain","Toledo is a city and municipality of Spain, th...",,266066,,,,
1,salamanque,http://catalogue.bnf.fr/ark:/12148/cb313084529,7e1fedfe-f21f-52bf-a07f-23a331c32c13,http://www.wikidata.org/entity/Q15695,Salamanca,POINT(-5.664167 40.965000),municipality of Spain,,,,,113,https://en.wikipedia.org/wiki/Salamanca,"Salamanca is a municipality and city in Spain,...",16865.0,236642,410000USal,,,


In [None]:
new_df

In [None]:
new_df.to_csv('xml_sparql_results.csv', index=False)

# Filter out continent

In [None]:
continent_entries = new_df[new_df['typeLabel'].str.contains('continent', case=False, na=False)]['entry'].unique()
filtered_df = new_df[~new_df['entry'].isin(continent_entries)]
filtered_df.head(10)

# External Authorities

In [None]:
def convert_to_iso(date):
    if date is not None:
        if date < 0:
            return f"{abs(date):04d}-01-01 BCE"
        else:
            return f"{date:04d}-01-01"
    return None


## Roman Atlas Authority

Extraction of dates linked with records in Roman Atlas

In [None]:
unique_roman_atlas_ids = new_df['roman_atlas_id'].dropna().unique()

In [None]:
# Function to fetch information from Roman Atlas API
def fetch_roman_atlas_info(roman_atlas_id):
    url = f"https://imperium.ahlfeldt.se/api/geojson.php?id={roman_atlas_id}"
    for attempt in range(3):  # Retry up to 3 times
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            data = response.json()
            features = data.get('features', [])
            if features:
                properties = features[0].get('properties', {})
                maxdate = properties.get('maxdate')
                mindate = properties.get('mindate')
                maxdate_iso = convert_to_iso(maxdate)
                mindate_iso = convert_to_iso(mindate)
                return {
                    'id': roman_atlas_id,
                    'maxdate': maxdate_iso,
                    'mindate': mindate_iso
                }
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching Roman Atlas info for ID {roman_atlas_id}: {e}")
            time.sleep(1)  # Wait 1 second before retrying
    return None

In [None]:
roman_atlas_results = []

In [None]:
# Fetch Roman Atlas info with a progress bar
for roman_atlas_id in tqdm(unique_roman_atlas_ids, desc="Fetching Roman Atlas Info"):
    info = fetch_roman_atlas_info(roman_atlas_id)
    if info:
        roman_atlas_results.append(info)
    time.sleep(1)  # Adding delay to avoid overwhelming the API

In [None]:
roman_atlas_df = pd.DataFrame(roman_atlas_results)

In [None]:
# Save the new dataframe to a CSV file
roman_atlas_df.to_csv('roman_atlas_authority.csv', index=False)

# Display the new dataframe
roman_atlas_df.head()

## Pleiadis Authority

In [None]:
unique_pleiades_ids = new_df['pleiades_id'].dropna().unique()

In [None]:
def fetch_pleiades_info(pleiades_id):
    url = f"https://pleiades.stoa.org/places/{pleiades_id}/json"
    for attempt in range(3):  # Retry up to 3 times
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            data = response.json()
            locations = data.get('locations', [])
            if locations:
                location = locations[0]
                start = location.get('start')
                end = location.get('end')
                start_iso = convert_to_iso(start)
                end_iso = convert_to_iso(end)
                return {
                    'id': pleiades_id,
                    'start': start_iso,
                    'end': end_iso
                }
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching Pleiades info for ID {pleiades_id}: {e}")
            time.sleep(1)  # Wait 1 second before retrying
    return None

In [None]:
pleiades_results = []

In [None]:
# Fetch Pleiades info with a progress bar
for pleiades_id in tqdm(unique_pleiades_ids, desc="Fetching Pleiades Info"):
    info = fetch_pleiades_info(pleiades_id)
    if info:
        pleiades_results.append(info)
    time.sleep(1)  # Adding delay to avoid overwhelming the API

In [None]:
# Convert results to a DataFrame
pleiades_df = pd.DataFrame(pleiades_results)

In [None]:
# Save the new dataframe to a CSV file
pleiades_df.to_csv('pleiades_info.csv', index=False)

# Display the new dataframe
pleiades_df.head()