# Data conversion from JSON to Graph

Install libraries

In [None]:
!pip install wikidata
!pip install nameparser
!pip install rdflib
!pip install tqdm

Import libraries

In [None]:
import json
import csv
from nameparser import HumanName
import requests
from wikidata.client import Client
from tqdm import tqdm
import unicodedata
import spacy
import requests
import re
from rdflib import URIRef, ConjunctiveGraph, Literal, Namespace
from rdflib.namespace import RDF, XSD, RDFS, OWL

# Data cleaning functions

Define a function to remove appellatives (titles) from people's names


In [None]:
def remove_appellatives(name):
    """Return `name` with its title component removed.

    The string is parsed with nameparser's ``HumanName``, the parsed
    ``title`` attribute is blanked, and the result is rendered back to a
    string.
    """
    person = HumanName(name)
    person.title = ''  # drop the title part of the parsed name
    cleaned_name = str(person)
    return cleaned_name

In [None]:
def from_string_to_URI(input_string):
    """Collapse a free-text label into a compact token used as a URI local name.

    Steps: strip diacritics (``Š`` -> ``S``), remove every whitespace
    character, then remove the punctuation characters listed below.

    Args:
        input_string: the label to convert.

    Returns:
        The converted string (possibly empty).
    """
    # NFD splits accented characters into base letter + combining mark,
    # so the marks can be filtered out on their own.
    decomposed = unicodedata.normalize('NFD', input_string)
    without_accents = ''.join(c for c in decomposed if not unicodedata.combining(c))

    # split() with no argument splits on any whitespace run, so tabs and
    # newlines are dropped as well as plain spaces.
    without_spaces = ''.join(without_accents.split())

    # Raw literal: the set contains a backslash followed by a comma, which
    # is not a valid escape sequence in a normal string literal.
    punctuation_chars = r'''£$!()-[]{};:'"\,<>./?@#$%^&*_~'''
    without_punctuation = ''.join(c for c in without_spaces if c not in punctuation_chars)

    return without_punctuation

Named-entity recognition (NER): extract people and organisations from a text

In [None]:
def extract_people_org(text, nlp):
    """Run `nlp` on `text` and collect the person and organisation entities.

    Args:
        text: the string to analyse.
        nlp: a callable returning an object whose ``ents`` are items with
            ``text`` and ``label_`` attributes (e.g. a loaded spaCy pipeline).

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in document order,
        restricted to the labels ``ORG`` and ``PERSON``.
    """
    wanted_labels = ['ORG', 'PERSON']
    return [
        (span.text, span.label_)
        for span in nlp(text).ents
        if span.label_ in wanted_labels
    ]

In [None]:
def convert_name_format(full_name):
    """Rewrite "Given Names Surname" as "Surname, Given Names".

    The last whitespace-separated token is treated as the surname. Inputs
    with fewer than two tokens are returned unchanged.
    """
    tokens = full_name.split()

    # Nothing to reorder for an empty or single-token name.
    if len(tokens) < 2:
        return full_name

    *given_names, surname = tokens
    return surname + ", " + " ".join(given_names)

# Entity linking functions

In [None]:
def get_viaf_id_from_label(label, timeout=10):
    """Look up `label` with the VIAF AutoSuggest service and return the first VIAF ID.

    Args:
        label: the name to search for.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The ``viafid`` of the first suggestion, or ``None`` when the request
        fails, the response is not valid JSON, or no suggestion is returned.
        Failures are reported with ``print`` rather than raised, so a long
        batch of lookups is not interrupted by a single bad request.
    """
    # VIAF endpoint for searching by label
    viaf_endpoint = "http://www.viaf.org/viaf/AutoSuggest"

    # Parameters for the request
    params = {
        'query': label,
        'sortKeys': 'holdingscount',
        'maximumRecords': 1
    }

    try:
        response = requests.get(viaf_endpoint, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        # Body was not JSON (e.g. an HTML error page served with status 200).
        print(f"Error: {e}")
        return None

    results = data.get('result') if isinstance(data, dict) else None
    if not results:
        print("No results found.")
        return None

    # Extract VIAF ID from the first result
    return results[0].get('viafid')

# Quick check: look up one label through get_viaf_id_from_label and print the returned ID.
print(get_viaf_id_from_label('Milan Šufflay'))

37725281


Get controlled label from VIAF

In [None]:
def is_latin_string(input_str):
    """Tell whether `input_str` contains at least one basic Latin letter.

    Only the unaccented letters A-Z and a-z count. The check is "any
    character", not "all characters", so a mixed-script string is accepted.

    Returns:
        ``True`` if such a letter is present, ``False`` otherwise (including
        for the empty string).
    """
    latin_alphabet = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
    return any(char in latin_alphabet for char in input_str)

def get_controlled_labels(viaf_id, timeout=10):
    """Fetch the main heading (controlled label) of a VIAF record.

    Args:
        viaf_id: the VIAF identifier of the record.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first main heading that contains a Latin letter when
        the record has several headings, the heading text when it has a
        single one, or ``None`` when nothing suitable is found or the request
        fails. Failures are printed, not raised.
    """
    # VIAF API endpoint for personal authority data
    viaf_api_url = f'https://www.viaf.org/viaf/{viaf_id}/viaf.json'

    try:
        response = requests.get(viaf_api_url, timeout=timeout)

        if response.status_code != 200:
            print(f"{viaf_id}, Error: Unable to retrieve labels from VIAF. Status Code: {response.status_code}")
            return None

        data = response.json()
        controlled_labels = data['mainHeadings']['data']

        # The headings arrive either as a list of entries or as one entry.
        if isinstance(controlled_labels, list):
            for el in controlled_labels:
                if is_latin_string(el['text']):
                    return el['text']
        elif isinstance(controlled_labels, dict):
            return controlled_labels['text']
    except Exception as e:
        # Best effort: report the problem and let the caller continue.
        print(f"{viaf_id}, Error: {e}")

    return None

Define a function to retrieve wikidata IDs from labels

In [None]:
def get_wikidata_id_from_label(label, timeout=10):
    """Search Wikidata for `label` and return the ID of the first match.

    Uses the ``wbsearchentities`` action with English as the search language.

    Args:
        label: the text to search for.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The entity ID of the first search hit, or ``None`` when there is no
        hit or the request fails. Failures are printed, not raised, so a
        batch of lookups is not aborted by one bad request.
    """
    base_url = "https://www.wikidata.org/w/api.php"

    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "search": label
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error: {e}")
        return None

    # Check if there are any results
    hits = data.get("search")
    if not hits:
        return None

    # Return the ID of the first result
    return hits[0].get("id")

In [None]:
def get_wikidata_label_from_id(wikidata_id, timeout=10):
    """Return the English label of a Wikidata entity.

    Uses the ``wbgetentities`` action restricted to English.

    Args:
        wikidata_id: the entity ID to look up.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The English label, or ``None`` when the entity is not in the
        response, has no English label, or the request fails. Failures are
        printed, not raised.
    """
    base_url = "https://www.wikidata.org/w/api.php"

    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidata_id,
        "languages": "en"
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return None
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error: {e}")
        return None

    # Check if the ID is valid and has labels
    entity = data.get("entities", {}).get(wikidata_id)
    if not entity:
        return None

    # Chained .get() so a missing "labels" or "en" entry yields None
    # instead of raising KeyError.
    return (entity.get("labels") or {}).get("en", {}).get("value")

Define a function to reconcile textual entities to Wikidata entities


In [None]:
def get_entity_metadata(entity_id, prop):
    """Return the value of the first statement of `prop` on a Wikidata entity.

    Args:
        entity_id: the Wikidata entity ID to load.
        prop: the property ID whose first statement is read.

    Returns:
        The ``value`` of the first statement's main snak, or ``None`` when
        the entity has no statement for `prop` or the statement carries no
        data value.
    """
    client = Client()
    entity = client.get(entity_id, load=True)

    statements = entity.data.get('claims', {}).get(prop)
    if not statements:
        return None

    # A main snak may come without a 'datavalue' (e.g. "no value" /
    # "unknown value" statements); treat that as "no value available".
    datavalue = statements[0].get('mainsnak', {}).get('datavalue')
    if datavalue is None:
        return None
    return datavalue.get('value')

# Graph generation

In [None]:
diz = {}

nlp = spacy.load("en_core_web_sm")


def _lookup_ids(label):
    """Search Wikidata and VIAF for `label`; return the IDs found (Wikidata first, possibly empty)."""
    wiki_id = get_wikidata_id_from_label(label)
    viaf_id = get_viaf_id_from_label(label)
    return [found for found in (wiki_id, viaf_id) if found is not None]


# NOTE(review): `llm_results` is not defined in any visible cell; it must be
# loaded (presumably from the LLM output JSON) before this cell runs.

# i  = claims that ended up with 'claimer_data'
# ni = claims left without it (handled by the manual step later)
i, ni = 0, 0
for el in tqdm(llm_results):
    if 'claims_best_response' not in el:
        continue

    for claim in el['claims_best_response']:
        claimer = remove_appellatives(claim['claimer_entity']).strip()
        ner_output = extract_people_org(claimer, nlp)

        if len(ner_output) == 1:
            # FIRST RECONCILIATION: try the cleaned string as it is.
            label = claimer
            ids = _lookup_ids(label)

            # SECOND RECONCILIATION: nothing found, retry with the entity
            # text extracted by NER.
            if not ids:
                label = remove_appellatives(ner_output[0][0]).strip()
                ids = _lookup_ids(label)

            if ids:
                claim.update({'claimer_data': {'label': label, 'ids': ids}})

        elif len(ner_output) > 1:
            # Several entities in one claimer string: reconcile each one.
            authors = []
            for entity_text, _entity_label in ner_output:
                label = remove_appellatives(entity_text).strip()
                ids = _lookup_ids(label)
                # One identifier is enough, as in the single-entity case.
                if ids:
                    authors.append({'label': label, 'ids': ids})
            # Only store a result when something was found, so unresolved
            # claims remain visible to the manual reconciliation step.
            if authors:
                claim.update({'claimer_data': authors})

        if 'claimer_data' in claim:
            i += 1
        else:
            ni += 1

print(i,ni)

with open('reconciled_3_new_forgery_claims_full.json', 'w') as json_file:
    json.dump(llm_results, json_file, indent=4)

In [None]:
# Manual reconciliation: for every claim that still has no 'claimer_data',
# show the claimer string and let the user type a label and identifiers.
for el in llm_results:
    if 'claims_best_response' not in el:
        continue

    for claim in el['claims_best_response']:
        if 'claimer_data' in claim:
            continue

        print(claim["claimer_entity"])
        var = input('Do you recognise this person?')
        if var.strip().lower() != 'yes':
            continue

        label = input('insert label')
        wiki_id = input('wiki id')
        viaf_id = input('viaf id')
        # Keep only the identifiers actually typed in: an empty string
        # would otherwise be stored and later treated as a VIAF id
        # (anything not starting with "Q").
        ids = [value.strip() for value in (wiki_id, viaf_id) if value.strip()]
        claim.update({'claimer_data': {"label": label, "ids": ids}})

In [None]:
# Write the (now manually completed) results back to the same JSON file.
with open('reconciled_3_new_forgery_claims_full.json', mode='w') as out_file:
    json.dump(llm_results, out_file, indent=4)

In [None]:
def parse_century_string(century_str):
    """Convert a century expression such as "19th century" into a date range.

    Args:
        century_str: text containing exactly one number, the century.

    Returns:
        ``(start_date, end_date)`` as ``datetime`` objects: 1 January of the
        first year and 31 December of the last year of that century
        (e.g. 1801-01-01 and 1900-12-31 for the 19th).

    Raises:
        ValueError: if the text contains no number, more than one number
            (ambiguous, e.g. "1st-2nd century"), or a century outside 1-99
            (the range ``datetime`` can represent).
    """
    # Local import so the function does not depend on another cell having
    # imported datetime first.
    from datetime import datetime

    numbers = re.findall(r'\d+', century_str)
    # Refuse to guess when several numbers are present rather than gluing
    # their digits together.
    if len(numbers) != 1:
        raise ValueError(f"Expected exactly one number in century string: {century_str!r}")

    century_number = int(numbers[0])
    if not 1 <= century_number <= 99:
        raise ValueError(f"Century out of supported range (1-99): {century_str!r}")

    start_year = (century_number - 1) * 100 + 1
    end_year = century_number * 100

    start_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)

    return start_date, end_date

In [None]:
from dateutil import parser
from datetime import datetime, timedelta

def timespan_handling(date_string, nlp):
    """Interactively turn a free-text date into a parsed date or timespan.

    The text is run through `nlp`; each entity labelled ``DATE`` is parsed
    with ``dateutil`` and shown to the user for confirmation. If no DATE
    entity can be parsed, the whole string is tried as a century expression.
    When nothing is confirmed, the user is asked to type the timespan.

    Args:
        date_string: the textual date.
        nlp: a callable returning an object whose ``ents`` have ``text`` and
            ``label_`` attributes (e.g. a loaded spaCy pipeline).

    Returns:
        One of:
        - the ``datetime`` produced by ``dateutil`` for a confirmed mention;
        - the ``(start, end)`` tuple from ``parse_century_string`` when the
          century reading is confirmed;
        - the raw string typed by the user as a last resort.
    """
    doc = nlp(date_string)
    date_mentions = [ent.text for ent in doc.ents if ent.label_ == 'DATE']

    parsed_any = False
    for mention in date_mentions:
        try:
            parsed_date = parser.parse(mention)
        except (ValueError, OverflowError):
            print(f"Unable to parse: {mention}")
            continue

        parsed_any = True
        print(f"Original: {mention}, Parsed: {parsed_date}")
        # The answer is kept in its own name: assigning to `input` would
        # shadow the builtin inside this function.
        answer = input('Is the date correct, Y or N')
        if answer == 'Y':
            return parsed_date

    # handle centuries (only when no DATE mention could be parsed)
    if not parsed_any:
        try:
            century_span = parse_century_string(date_string)
        except (ValueError, OverflowError):
            print(f"Unable to parse century: {date_string}")
        else:
            print(f"Original: {date_string}, Parsed: {century_span}")
            answer = input('Is the date correct, Y or N')
            if answer == 'Y':
                return century_span

    # Nothing was confirmed: fall back to what the user types.
    return input('Print timespan begin and end')

In [None]:
import requests

def geonames_api_request(place_name, username='demo', timeout=10):
    """Search GeoNames for `place_name` and return the decoded JSON response.

    Args:
        place_name: the place to search for.
        username: GeoNames account name sent with the request. The default
            ``'demo'`` is a placeholder; pass your own account name.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON. Failures are
        printed, not raised.
    """
    base_url = "http://api.geonames.org/search?"
    params = {
        'q': place_name,
        'username': username,
        'type': 'json',  # You can change the response format if needed
        'fuzzy': 0.8
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as e:
        # Body was not valid JSON.
        print(f"Error: {e}")
        return None


In [None]:
g = ConjunctiveGraph()

FORGONT = Namespace("http://www.example.org/")
g.bind("forgont", FORGONT)

VIAF = Namespace("https://viaf.org/viaf/")
g.bind("viaf", VIAF)

WD = Namespace("http://www.wikidata.org/entity/")
g.bind("wd", WD)

# NOTE(review): this is the Dublin Core "elements/1.1" namespace bound to the
# "dct" prefix — confirm this is the intended vocabulary.
DCT = Namespace("http://purl.org/dc/elements/1.1/")
g.bind("dct", DCT)

WIKIPEDIA_PREFIX = 'https://en.wikipedia.org/wiki/'


def _page_name(page_url):
    """Return the Wikipedia page name from its URL, without newlines or surrounding whitespace."""
    return page_url.replace(WIKIPEDIA_PREFIX, '').replace('\n', '').strip()


def _add_claim_for_author(graph, document_URI, page_name, claim, author):
    """Add to `graph` the statements for one claim made by one reconciled author.

    `author` is a ``{'label': ..., 'ids': [...]}`` entry. IDs starting with
    "Q" are treated as Wikidata IDs, every other ID as a VIAF ID. The VIAF ID
    is preferred for minting the author URI; the Wikidata ID is used when no
    VIAF ID is available. All claims by the same author on the same page
    share one named graph (`context`).
    """
    ids = [str(value).strip() for value in author.get("ids", [])
           if value is not None and str(value).strip()]
    wiki = [id_value for id_value in ids if id_value.startswith("Q")]
    viaf = [id_value for id_value in ids if not id_value.startswith("Q")]

    if viaf:
        author_id = viaf[0]
        label = get_controlled_labels(author_id)
    elif wiki:
        author_id = wiki[0]
        label = get_wikidata_label_from_id(author_id)
    else:
        # TO DO: authors without any identifier are not added to the graph yet
        return

    # Fall back to the label stored during reconciliation when the
    # authority service returned none.
    label = label or author.get('label')

    context = URIRef(FORGONT + page_name + from_string_to_URI(author_id))
    claim_author_URI = URIRef(FORGONT + from_string_to_URI(author_id))

    graph.add((claim_author_URI, RDF.type, FORGONT.Person))
    if label:
        graph.add((claim_author_URI, RDFS.label, Literal(label, datatype=XSD.string)))
    if viaf:
        graph.add((claim_author_URI, OWL.sameAs, URIRef(VIAF + viaf[0])))
    if wiki:
        graph.add((claim_author_URI, OWL.sameAs, URIRef(WD + wiki[0])))

    graph.add((context, FORGONT.author, claim_author_URI))
    graph.add((context, RDFS.comment, Literal(claim['claim_text'], datatype=XSD.string)))

    # claim's opinion: the document is typed with it inside the author's named graph
    claim_opinion_URI = URIRef(FORGONT + claim['claim_opinion'])
    graph.add((document_URI, RDF.type, claim_opinion_URI, context))


for el in llm_results:
    page_name = _page_name(el['Page URL'])

    # First metadata record, or an empty dict when the key is missing or
    # the list is empty.
    metadata_entries = el.get("Document metadata") or []
    metadata = metadata_entries[0] if metadata_entries else {}

    # document URI: from the document title when available, otherwise from the page name
    if metadata:
        document_ID = from_string_to_URI(metadata["document_title"].strip())
    else:
        document_ID = from_string_to_URI(page_name)
    document_URI = URIRef(FORGONT + document_ID)
    alleged_context = URIRef(FORGONT + document_ID + '_doc_claim')

    if metadata:
        g.add((document_URI, DCT.title, Literal(metadata["document_title"], datatype=XSD.string)))
        g.add((document_URI, RDF.type, URIRef(FORGONT + from_string_to_URI(metadata["document_support"])), alleged_context))
        #timespan = timespan_handling(metadata['document_alleged_date'], nlp)
        # g.add((document_URI, DCT.date, Literal(metadata['document_alleged_date'], datatype=XSD.string), alleged_context))
        #g.add((document_URI, DCT.place, Literal(metadata['document_alleged_place'], datatype=XSD.string), alleged_context))
        # NOTE(review): the GeoNames result is fetched but not used in the graph yet.
        geo_id = geonames_api_request(metadata['document_alleged_place'])
        g.add((document_URI, DCT.creator, Literal(metadata['document_alleged_creator'], datatype=XSD.string), alleged_context))

    if 'claims_best_response' in el:
        for claim in el['claims_best_response']:
            # CLAIM CONTEXTUAL INFORMATION
            claimer_data = claim.get('claimer_data')

            if isinstance(claimer_data, dict):
                # ONLY ONE AUTHOR DETECTED IN THE CLAIM
                authors = [claimer_data]
            elif isinstance(claimer_data, list):
                # MULTIPLE AUTHORS DETECTED IN THE CLAIM
                authors = claimer_data
            else:
                authors = []

            for author in authors:
                _add_claim_for_author(g, document_URI, page_name, claim, author)

g.serialize(destination='prova.trig', format="trig")

## Testing

In [None]:
# Example SPARQL query: for every subject typed, inside some named graph, as
# forgont:Authentic, forgont:Suspicious or forgont:Forgery, count the distinct
# named graphs in which that typing appears, and list subjects by descending count.
query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX forgont: <http://www.example.org/>

    SELECT DISTINCT ?subject (COUNT(distinct ?g) as ?n)
    WHERE {

    VALUES ?class { forgont:Authentic forgont:Suspicious forgont:Forgery }
        GRAPH ?g {?subject a ?class}
    }

    GROUP BY ?subject ORDER BY DESC (?n)
"""

# Execute the query against the graph `g` built above
results = g.query(query)

# Print the results: one subject and its graph count per row, followed by blank lines
for row in results:
    print("Subject:", row.subject)
    print("n:", row.n)
    print("\n")

Subject: http://www.example.org/HistoriaAugusta
n: 14


Subject: http://www.example.org/IrelandShakespeareManuscripts
n: 13


Subject: http://www.example.org/TheDvurKraloveandZelenaHoramanuscripts
n: 12


Subject: http://www.example.org/DonationofConstantine
n: 12


Subject: http://www.example.org/CharterofDukeTrpimir
n: 10


Subject: http://www.example.org/PactaConventaCroatia
n: 9


Subject: http://www.example.org/TheCorrespondenceofPaulandSeneca
n: 9


Subject: http://www.example.org/AlthochdeutschesSchlummerlied
n: 8


Subject: http://www.example.org/DeSituBritanniae
n: 8


Subject: http://www.example.org/TheSalamanderLetter
n: 7


Subject: http://www.example.org/AshtinameofMuhammad
n: 6


Subject: http://www.example.org/CodeofRajahKalantiaw
n: 6


Subject: http://www.example.org/MussoliniDiaries
n: 6


Subject: http://www.example.org/TheFranklinProphecy
n: 6


Subject: http://www.example.org/OathofaFreeman
n: 6


Subject: http://www.example.org/TheReportfromIronMountain
n: 5


Sub