In [1]:
# Colab-only setup: mount Google Drive, then switch into the project folder so
# that the relative paths used by later cells (nli_util/..., nli_results/...)
# resolve against it.
from google.colab import drive

drive.mount('/content/drive')

# Relative path: this only works when the current directory is /content
# (the captured output below shows the resulting absolute path).
%cd drive/MyDrive/CGDC/NLI

Mounted at /content/drive
/content/drive/MyDrive/CGDC/NLI


In [7]:
# Maps each relation name to the verbalisation inserted between head and tail
# when building NLI hypotheses ("<head> <template> <tail>").
#
# A value is either
#   * a plain string, used for every head/tail entity-type pair, or
#   * a nested dict {head_type: {tail_type: template}} for relations whose
#     wording depends on the entity types (see get_current_templates, which
#     indexes nested values as [head_type][tail_type]).
# Only the type pairs listed in a nested dict can be verbalised; any other pair
# raises KeyError at lookup time.
relations_to_templates = {
    "date_of_death": "died on",
    # Type-dependent wording.
    "significant_person": {
        "Person": {"Person": "communicated with"},
        "Organisation": {"Person": "is made up of"}
    },
    "partnership_with": "collaborated with",
    "place_of_birth": "was born in",
    "date_of_birth": "was born on",
    "spouse": "married to",
    "location": "happened in",
    "operating_area": "operated in",
    "participant_in": "participated in",
    "occupation": "worked as a",
    "member_of": "was a member of",
    "location_in": "is located in",
    # Type-dependent wording.
    "notable_work": {
        "Person": {
            "Location": "built",
            "Misc": "created"
        },
        "Organisation": {
            "Misc": "created"
        }
    },
    "inception": "was created on",
    # Type-dependent wording.
    "point_in_time": {
        "Misc": {"Date": "happened on"},
        "Location": {"Date": "as of"}
    },
    "work_location": "worked in",
    "residence": "lived in",
    "affiliation": "was connected or linked to",
    "employer": "worked for",
    "depicts": "shows"
}

In [8]:
import csv

# Load the relation-type mapping table. Columns used below:
#   column 0 = head entity type, column 3 = tail entity type, column 5 = relation name.
# newline='' is the csv-module recommended way to open files for csv.reader.
with open('nli_util/mapping_of_relation_types.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader, None)  # skip the header row (tolerates an empty file)
    rows = []
    for row in reader:
        # The table ends at the first blank row / row with an empty first column.
        if not row or row[0] == '':
            break
        rows.append(row)

# Every entity type that appears as a head or as a tail.
entity_types = set()
for row in rows:
    entity_types.add(row[0])
    entity_types.add(row[3])

# possible_relations[head_type][tail_type] -> list of relation names allowed for
# that ordered type pair (empty list when the pair has no relation).
possible_relations = {
    head_type: {tail_type: [] for tail_type in entity_types}
    for head_type in entity_types
}

# Set of all distinct relation names. It must be created here: the original cell
# called relations.add(...) on a name that no earlier cell defines, which raises
# NameError on a fresh kernel (Restart & Run All).
relations = set()

for row in rows:
    head_entity_type = row[0]
    tail_entity_type = row[3]

    # Normalise the relation name: spaces -> underscores, drop one trailing underscore.
    pair_relation = row[5].replace(" ", "_")
    if not pair_relation:
        # Previously this surfaced as an opaque IndexError on pair_relation[-1].
        raise ValueError(f"Empty relation name in mapping row: {row!r}")
    if pair_relation.endswith("_"):
        pair_relation = pair_relation[:-1]
    # This relation is folded into 'location_in', which has a template in
    # relations_to_templates.
    if pair_relation == 'located_in/on_physical_feature':
        pair_relation = 'location_in'

    possible_relations[head_entity_type][tail_entity_type].append(pair_relation)
    relations.add(pair_relation)

# To inspect the mapping, display possible_relations[head_type][tail_type]
# for the type pair of interest.

In [9]:
# Parse the relation priorities used to break ties between candidate relations.
# Each non-blank line is split on whitespace: the first token is the
# "<HeadType>-<TailType>" key, the remaining tokens are relation names separated
# by ">" tokens. The order of the names is kept; choose_final_label picks the
# candidate with the smallest index in this list.
relation_types_priorities = {}
with open("nli_util/relation_types_priorities.txt", "r") as f:  # closed automatically
  for line in f:
    tokens = line.split()
    if not tokens:
      # Blank line (e.g. trailing newline at end of file): nothing to parse.
      continue
    # Local name chosen so the loop does not overwrite the `relations` set
    # used by the relation-mapping cell.
    relation_types, *ranked_tokens = tokens
    # These two type pairs are hard-coded in choose_final_label, so they are not stored.
    if relation_types in ("Person-Misc", "Misc-Location"):
      continue
    relation_types_priorities[relation_types] = [token for token in ranked_tokens if token != ">"]

# Experiments

In [10]:
!pip install transformers
!pip install sentence_transformers

from transformers import T5Tokenizer, T5ForConditionalGeneration, BartForSequenceClassification, BartTokenizer, pipeline
from sentence_transformers import CrossEncoder
import json
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [11]:
def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but prevents ``exp`` from overflowing to ``inf``
    (and the result from becoming ``nan``) for large logits.

    Args:
        x: array-like of logits (list, tuple or numpy array).

    Returns:
        numpy array of the same shape whose entries sum to 1. An empty input
        yields an empty array.
    """
    logits = np.asarray(x)
    if logits.size == 0:
        return np.exp(logits)
    exponentials = np.exp(logits - logits.max())
    return exponentials / exponentials.sum()

def _predict_deberta(model, premise, hypothesis_list, current_relations, obj_for_current_sample, entailment_threshold):
  """Score every hypothesis with the cross-encoder and keep the most strongly entailed relation."""
  # Order of the scores returned by the cross-encoder: (contradiction, entailment, neutral)
  labels = ["contradiction", "entailment", "neutral"]
  best_relation, best_prob = "", 0
  obj_for_current_sample["hypotheses"] = []

  scores = model.predict([(premise, curr_hypothesis) for curr_hypothesis in hypothesis_list])
  for idx, score in enumerate(scores):
    probs = softmax(score)
    # A hypothesis only competes when entailment is strictly the top class.
    if probs[1] > probs[0] and probs[1] > probs[2] and probs[1] > best_prob:
      best_relation, best_prob = current_relations[idx], probs[1]
    obj_for_current_sample["hypotheses"].append({"hypothesis": hypothesis_list[idx], "probabilities": {labels[i] : float(probs[i]) for i in range(len(labels))}})

  final_prediction = best_relation if best_prob > entailment_threshold else "NOTA"
  obj_for_current_sample["prediction"] = final_prediction
  return final_prediction


def _predict_bart(model, tokenizer, premise, hypothesis_list, current_relations, obj_for_current_sample, entailment_threshold):
  """Score every hypothesis with the BART classifier and keep the most strongly entailed relation."""
  import torch  # local import: the notebook's import cell does not import torch directly

  labels = ["contradiction", "neutral", "entailment"]
  entailment_idx = labels.index("entailment")
  best_relation, best_prob = "", 0.0
  obj_for_current_sample["hypotheses"] = []

  for idx, hypothesis in enumerate(hypothesis_list):
    tokens = tokenizer(premise, hypothesis, return_tensors="pt")
    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
      outputs = model(**tokens)
    probs = outputs.logits.softmax(dim=1)[0]
    # Plain float, so the running best score is not a tensor.
    entailment_prob = float(probs[entailment_idx])
    if int(probs.argmax()) == entailment_idx and entailment_prob > best_prob:
      best_relation, best_prob = current_relations[idx], entailment_prob
    obj_for_current_sample["hypotheses"].append({"hypothesis": hypothesis, "probabilities": {labels[i] : float(probs[i]) for i in range(len(labels))}})

  final_prediction = best_relation if best_prob > entailment_threshold else "NOTA"
  obj_for_current_sample["prediction"] = final_prediction
  return final_prediction


def _predict_t5(model, tokenizer, premise, hypothesis_list, current_relations, head_type, tail_type, obj_for_current_sample):
  """Generate one verdict per hypothesis with T5; a decoded "1" counts as a positive verdict."""
  input_list = [f"premise: {premise} hypothesis: {curr_hypothesis}" for curr_hypothesis in hypothesis_list]

  # Tokenize and generate predictions
  inputs = tokenizer(input_list, return_tensors="pt", padding=True)
  output_sequences = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      do_sample=False,  # disable sampling to test if batching affects output
  )
  predictions = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

  # Relations whose hypothesis received a positive verdict, in hypothesis order.
  candidate_labels = [relation for relation, verdict in zip(current_relations, predictions) if verdict == "1"]
  if not candidate_labels:
    final_prediction = "NOTA"
  elif len(candidate_labels) == 1:
    final_prediction = candidate_labels[0]
  else:
    # Several relations were accepted: break the tie with the priority list.
    final_prediction = choose_final_label(head_type, tail_type, candidate_labels)

  obj_for_current_sample["hypotheses"] = [{"hypothesis": hypothesis_list[idx], "prediction": predictions[idx]} for idx in range(len(hypothesis_list))]
  obj_for_current_sample["prediction"] = final_prediction
  return final_prediction


def predict(model_name, model, tokenizer, premise, hypothesis_list, current_relations, current_templates, head, tail, head_type, tail_type, obj_for_current_sample, entailment_threshold=0.40):
  """Predict the relation expressed by ``premise`` using one NLI model.

  ``hypothesis_list[i]`` is the verbalisation of ``current_relations[i]``; the
  two lists are indexed in parallel.

  Args:
    model_name: "deberta", "bart" or "t5-xxl".
    model: the loaded model for ``model_name``.
    tokenizer: the matching tokenizer (not used by the "deberta" branch).
    premise: the text the hypotheses are checked against.
    hypothesis_list: one hypothesis string per candidate relation.
    current_relations: candidate relation names, parallel to ``hypothesis_list``.
    current_templates, head, tail: accepted for interface compatibility; unused here.
    head_type, tail_type: entity types, used by the "t5-xxl" branch for tie-breaking.
    obj_for_current_sample: dict that is filled in place with a "hypotheses"
      list and the final "prediction".
    entailment_threshold: for "deberta" and "bart", the best entailment
      probability must be strictly greater than this value, otherwise "NOTA"
      is predicted. Defaults to the previously hard-coded 0.40.

  Returns:
    The predicted relation name or "NOTA". For an unknown ``model_name`` a
    message is printed and ``None`` is returned (``obj_for_current_sample`` is
    left untouched).
  """
  if model_name == "deberta":
    return _predict_deberta(model, premise, hypothesis_list, current_relations, obj_for_current_sample, entailment_threshold)
  if model_name == "bart":
    return _predict_bart(model, tokenizer, premise, hypothesis_list, current_relations, obj_for_current_sample, entailment_threshold)
  if model_name == "t5-xxl":
    return _predict_t5(model, tokenizer, premise, hypothesis_list, current_relations, head_type, tail_type, obj_for_current_sample)

  print("Wrong name for model supplied.")
  return None


def get_current_templates(current_relations, head_type, tail_type):
  """Return the verbalisation template for each relation, in input order.

  Relations whose entry in ``relations_to_templates`` is a dict are resolved
  with ``[head_type][tail_type]``; plain-string entries are used as they are.
  """
  def resolve(relation):
    entry = relations_to_templates[relation]
    return entry[head_type][tail_type] if isinstance(entry, dict) else entry

  return [resolve(relation) for relation in current_relations]

def get_current_model(model_name):
  """Load the tokenizer and model for ``model_name``.

  Returns:
    A ``(tokenizer, model)`` tuple. The "deberta" cross-encoder has no separate
    tokenizer, so its tokenizer slot is ``None``. For an unknown name a message
    is printed and ``(None, None)`` is returned.
  """
  if model_name == "deberta":
    return (None, CrossEncoder('cross-encoder/nli-deberta-v3-large'))

  if model_name == "bart":
    bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")
    bart_model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
    return (bart_tokenizer, bart_model)

  if model_name == "t5-xxl":
    t5_tokenizer = T5Tokenizer.from_pretrained("google/t5_xxl_true_nli_mixture")
    t5_model = T5ForConditionalGeneration.from_pretrained("google/t5_xxl_true_nli_mixture")
    return (t5_tokenizer, t5_model)

  print("Incorrect name for model.")
  return (None, None)


def macro_weighted_averages(data):
  """Summarise label-wise metrics into non-NOTA and NOTA figures.

  For the labels of interest (every label except "NOTA") precision, recall and
  F1 are averaged with each label weighted by its ``samples_per_class``. The
  NOTA figures are copied from the "NOTA" entry unchanged. The six values are
  also printed.

  Args:
    data: results dict whose ``data["results"]["label_wise"]`` maps each label
      to a dict with "samples_per_class", "precision", "recall" and "f1_score".

  Returns:
    Dict with the six summary values. The key order is fixed because callers
    unpack ``.values()`` positionally. When there are no non-NOTA samples, or no
    "NOTA" label, the affected values are 0.0 instead of raising (same
    convention as ``zero_division=0`` used for the label-wise metrics).
  """
  label_wise = data["results"]["label_wise"]
  labels_of_interest = [stats for label, stats in label_wise.items() if label != "NOTA"]
  labels_of_interest_sample_counter = sum(stats["samples_per_class"] for stats in labels_of_interest)

  def support_weighted(metric):
    # Guard against datasets that only contain NOTA samples.
    if labels_of_interest_sample_counter == 0:
      return 0.0
    weighted_sum = sum(stats["samples_per_class"] * stats[metric] for stats in labels_of_interest)
    return weighted_sum / labels_of_interest_sample_counter

  weighted_macro_precision_score_labels_of_interest = support_weighted("precision")
  weighted_macro_recall_labels_of_interest = support_weighted("recall")
  weighted_macro_f1_score_labels_of_interest = support_weighted("f1_score")

  nota_stats = label_wise.get("NOTA")
  if nota_stats is None:
    # Dataset without a NOTA class: report zeros rather than failing.
    nota_stats = {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
  weighted_macro_precision_score_nota = nota_stats["precision"]
  weighted_macro_recall_score_nota = nota_stats["recall"]
  weighted_macro_f1_score_nota = nota_stats["f1_score"]

  print(f"Non-NOTA Precision: {weighted_macro_precision_score_labels_of_interest}")
  print(f"Non-NOTA Recall: {weighted_macro_recall_labels_of_interest}")
  print(f"Non-NOTA F1-score: {weighted_macro_f1_score_labels_of_interest}")
  print()
  print(f"NOTA Precision: {weighted_macro_precision_score_nota}")
  print(f"NOTA Recall: {weighted_macro_recall_score_nota}")
  print(f"NOTA F1-score: {weighted_macro_f1_score_nota}")
  print()
  print()

  return {
      "non_NOTA_macro_weighted_precision": weighted_macro_precision_score_labels_of_interest,
      "non_NOTA_macro_weighted_recall": weighted_macro_recall_labels_of_interest,
      "non_NOTA_macro_weighted_f1_score": weighted_macro_f1_score_labels_of_interest,
      "NOTA_macro_weighted_precision": weighted_macro_precision_score_nota,
      "NOTA_macro_weighted_recall": weighted_macro_recall_score_nota,
      "NOTA_macro_weighted_f1_score": weighted_macro_f1_score_nota,
  }

def choose_final_label(head_type, tail_type, candidate_labels):
  """Pick one relation out of several candidates for a head/tail type pair.

  "Person-Misc" and "Misc-Location" always resolve to a fixed relation,
  regardless of the candidates. For every other pair, the candidate that
  appears earliest in ``relation_types_priorities["<head>-<tail>"]`` wins.
  """
  fixed_choices = {
      "Person-Misc": "notable_work",
      "Misc-Location": "location",
  }
  relation_type = f"{head_type}-{tail_type}"
  if relation_type in fixed_choices:
    return fixed_choices[relation_type]

  # Smaller index in the priority list means higher priority.
  ranking = relation_types_priorities[relation_type]
  return min(candidate_labels, key=ranking.index)

def run_experiment(model_name, dataset_path):
  """Evaluate one NLI model on the dataset and store per-sample and per-label results.

  The dataset JSON maps each gold label to a list of samples. Each sample
  provides 'sample' (the premise text) and 'h' / 't' (head / tail), where
  index 0 is the entity text and index 2 its entity type.

  For every sample the candidate relations for its (head_type, tail_type) pair
  are verbalised into hypotheses and scored by ``predict``; samples whose type
  pair has no candidate relation are predicted as "NOTA". Precision, recall and
  F1 are then computed per label (one-vs-rest) and summarised with
  ``macro_weighted_averages``.

  Side effects: prints progress, and writes everything to
  ``nli_results/results_<model_name>.json`` (the directory is created if needed).

  Args:
    model_name: model identifier understood by ``get_current_model``.
    dataset_path: path of the dataset JSON file.

  Returns:
    DataFrame indexed by label with precision, recall, f1_score and number of
    samples, sorted by f1_score in descending order.
  """
  import os  # local import: only needed to create the output directory

  (tokenizer, model) = get_current_model(model_name)

  with open(dataset_path) as infile:
    data = json.load(infile)

  green_samples_counter = 0
  red_samples_counter = 0
  nota_green_samples_counter = 0
  nota_red_samples_counter = 0
  results = {"data": {},
             "results": {
                 "label_wise": {},
                 "overall": {}
             }}

  total_keys = len(data)
  for key_index, key in enumerate(data.keys()):
    results["data"][key] = []
    # The denominator is the real number of labels (it used to be one too small).
    print(f"Key {key_index + 1} / {total_keys}")
    for obj_index, obj in enumerate(data[key]):
      print(f"Sample {obj_index+1}/{len(data[key])}")
      premise = obj['sample']
      head = obj['h'][0]
      tail = obj['t'][0]
      head_type = obj['h'][2]
      tail_type = obj['t'][2]
      golden_standard = key
      obj_for_current_sample = {"premise": premise,
                                "head": obj['h'],
                                "tail": obj['t']}

      current_relations = possible_relations[head_type][tail_type]
      if len(current_relations) != 0:
        current_templates = get_current_templates(current_relations, head_type, tail_type)

        hypothesis_list = [f"{head} {x} {tail}" for x in current_templates]

        final_prediction = predict(model_name, model, tokenizer, premise, hypothesis_list, current_relations, current_templates, head, tail, head_type, tail_type, obj_for_current_sample)
      else:
        # No relation is defined for this type pair, so nothing can be entailed.
        obj_for_current_sample["hypotheses"] = []
        obj_for_current_sample["prediction"] = "NOTA"
        final_prediction = "NOTA"

      results["data"][key].append(obj_for_current_sample)

      is_correct = final_prediction == golden_standard
      if key == "NOTA":
        if is_correct:
          nota_green_samples_counter += 1
        else:
          nota_red_samples_counter += 1
      else:
        if is_correct:
          green_samples_counter += 1
        else:
          red_samples_counter += 1

    # Count the samples directly. The leaked loop index (obj_index + 1) was
    # undefined or stale whenever a label had an empty sample list.
    results["results"]["label_wise"][key] = {
        "samples_per_class": len(data[key])
    }

  print(f"Green samples count: {green_samples_counter}")
  print(f"Red samples count: {red_samples_counter}")
  print(f"NOTA green samples count: {nota_green_samples_counter}")
  print(f"NOTA red samples count: {nota_red_samples_counter}")

  indexes = []
  precisions = []
  recalls = []
  f1_scores = []
  number_of_samples_from_class = []

  # One-vs-rest evaluation: for each label build a binary gold vector and a
  # binary prediction vector over all samples, in the same sample order.
  for label_to_evaluate in results["data"].keys():
    keywise_one_hot_encoded_vector = []
    predictions_for_current_label = []
    indexes.append(label_to_evaluate)
    for key in results["data"].keys():
      gold_flag = 1 if label_to_evaluate == key else 0
      keywise_one_hot_encoded_vector += [gold_flag] * len(results["data"][key])
      predictions_for_current_label += [1 if sample["prediction"] == label_to_evaluate else 0 for sample in results["data"][key]]

    key_precision = precision_score(keywise_one_hot_encoded_vector, predictions_for_current_label, zero_division=0)
    key_recall = recall_score(keywise_one_hot_encoded_vector, predictions_for_current_label, zero_division=0)
    key_f1_score = f1_score(keywise_one_hot_encoded_vector, predictions_for_current_label, zero_division=0)
    samples_per_class = results["results"]["label_wise"][label_to_evaluate]['samples_per_class']
    precisions.append(key_precision)
    recalls.append(key_recall)
    f1_scores.append(key_f1_score)
    number_of_samples_from_class.append(samples_per_class)

    results['results']['label_wise'][label_to_evaluate] = {"precision": key_precision,
                                                      "recall": key_recall,
                                                      "f1_score": key_f1_score,
                                                      "samples_per_class": samples_per_class}

  # macro_weighted_averages returns exactly the six summary keys stored below.
  results["results"]["overall"] = {
        "correctly_classified_labels_of_interest": green_samples_counter,
        "incorrectly_classified_labels_of_interest": red_samples_counter,
        "correctly_classified_NOTA_samples": nota_green_samples_counter,
        "incorrectly_classified_NOTA_samples": nota_red_samples_counter,
        **macro_weighted_averages(results)
  }

  # Make sure the output directory exists so a long run is not lost at the very end.
  output_path = f'nli_results/results_{model_name}.json'
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
  with open(output_path, 'w') as fp:
    json.dump(results, fp)

  df = pd.DataFrame({"precision": precisions,
                    "recall": recalls,
                    "f1_score": f1_scores,
                    "num of samples": number_of_samples_from_class},
                    index=(indexes)).sort_values('f1_score', ascending=False)

  return df

In [None]:
# Single-model run with the DeBERTa cross-encoder; `df` is the per-label
# metrics table returned by run_experiment.
df = run_experiment("deberta", 'nli_util/CGDC_3.0_updated.json')

# Ensemble testing

In [None]:
def predict_ensemble(models, tokenizers, premise, hypothesis_list, current_relations, current_templates, head, tail, head_type, tail_type, obj_for_current_sample):
  """Combine the BART, DeBERTa and T5 predictions for one sample by voting.

  * All three agree: that label is returned.
  * Two distinct labels: the one with more votes is returned.
  * Three distinct labels: "NOTA" is dropped and ``choose_final_label`` picks
    among the remaining labels.

  ``obj_for_current_sample`` is updated in place with each model's detail
  object and the final "prediction".
  """
  per_model_details = {}
  votes = {}
  # Same call order as before: bart, deberta, t5-xxl.
  for model_name in ("bart", "deberta", "t5-xxl"):
    details = {}
    vote = predict(model_name, models[model_name], tokenizers[model_name], premise, hypothesis_list, current_relations, current_templates, head, tail, head_type, tail_type, details)
    per_model_details[model_name] = details
    votes[vote] = votes.get(vote, 0) + 1

  distinct_labels = len(votes)
  if distinct_labels == 1:
    final_prediction = next(iter(votes))
  elif distinct_labels == 2:
    # Vote counts are 2 and 1 here; max returns the first label with the highest count.
    final_prediction = max(votes, key=votes.get)
  else:
    candidate_labels = [label for label in votes if label != "NOTA"]
    final_prediction = choose_final_label(head_type, tail_type, candidate_labels)

  obj_for_current_sample.update({"bart_predictions": per_model_details["bart"],
                                 "deberta_predictions": per_model_details["deberta"],
                                 "t5_predictions": per_model_details["t5-xxl"],
                                 "prediction": final_prediction})

  return final_prediction

def run_experiment_ensemble(dataset_path, output_path='results_ensemble_demoREAL.json'):
  """Evaluate the BART + DeBERTa + T5 voting ensemble on the dataset.

  The dataset JSON maps each gold label to a list of samples. Each sample
  provides 'sample' (the premise text) and 'h' / 't' (head / tail), where
  index 0 is the entity text and index 2 its entity type.

  For every sample the candidate relations for its (head_type, tail_type) pair
  are verbalised into hypotheses and passed to ``predict_ensemble``; samples
  whose type pair has no candidate relation are predicted as "NOTA". Precision,
  recall and F1 are then computed per label (one-vs-rest) and summarised with
  ``macro_weighted_averages``.

  Side effects: prints progress and writes all results to ``output_path``.

  Args:
    dataset_path: path of the dataset JSON file.
    output_path: where the results JSON is written. The default is the file
      name that used to be hard-coded.
      NOTE(review): the KG-creation cell of this notebook reads
      'results_ensemble.json', not this default; confirm which file is intended.

  Returns:
    DataFrame indexed by label with precision, recall, f1_score and number of
    samples, sorted by f1_score in descending order.
  """
  tokenizer_bart, model_bart = get_current_model("bart")
  tokenizer_deberta, model_deberta = get_current_model("deberta")
  tokenizer_t5_xxl, model_t5_xxl = get_current_model("t5-xxl")

  models = {"bart": model_bart,
            "deberta": model_deberta,
            "t5-xxl": model_t5_xxl}

  tokenizers = {"bart": tokenizer_bart,
                "deberta": tokenizer_deberta,
                "t5-xxl": tokenizer_t5_xxl}

  with open(dataset_path) as infile:
    data = json.load(infile)

  green_samples_counter = 0
  red_samples_counter = 0
  nota_green_samples_counter = 0
  nota_red_samples_counter = 0
  results = {"data": {},
             "results": {
                 "label_wise": {},
                 "overall": {}
             }}

  total_keys = len(data)
  for key_index, key in enumerate(data.keys()):
    results["data"][key] = []
    # The denominator is the real number of labels (it used to be one too small).
    print(f"Key {key_index + 1} / {total_keys}")
    for obj_index, obj in enumerate(data[key]):
      print(f"Sample {obj_index+1}/{len(data[key])}")
      premise = obj['sample']
      head = obj['h'][0]
      tail = obj['t'][0]
      head_type = obj['h'][2]
      tail_type = obj['t'][2]
      golden_standard = key
      obj_for_current_sample = {"premise": premise,
                                "head": obj['h'],
                                "tail": obj['t']}

      current_relations = possible_relations[head_type][tail_type]
      if len(current_relations) != 0:
        current_templates = get_current_templates(current_relations, head_type, tail_type)

        hypothesis_list = [f"{head} {x} {tail}" for x in current_templates]

        final_prediction = predict_ensemble(models, tokenizers, premise, hypothesis_list, current_relations, current_templates, head, tail, head_type, tail_type, obj_for_current_sample)
      else:
        # No relation is defined for this type pair, so nothing can be entailed.
        obj_for_current_sample["hypotheses"] = []
        obj_for_current_sample["prediction"] = "NOTA"
        final_prediction = "NOTA"

      results["data"][key].append(obj_for_current_sample)

      is_correct = final_prediction == golden_standard
      if key == "NOTA":
        if is_correct:
          nota_green_samples_counter += 1
        else:
          nota_red_samples_counter += 1
      else:
        if is_correct:
          green_samples_counter += 1
        else:
          red_samples_counter += 1

    # Count the samples directly. The leaked loop index (obj_index + 1) was
    # undefined or stale whenever a label had an empty sample list.
    results["results"]["label_wise"][key] = {
        "samples_per_class": len(data[key])
    }

  print(f"Green samples count: {green_samples_counter}")
  print(f"Red samples count: {red_samples_counter}")
  print(f"NOTA green samples count: {nota_green_samples_counter}")
  print(f"NOTA red samples count: {nota_red_samples_counter}")

  indexes = []
  precisions = []
  recalls = []
  f1_scores = []
  number_of_samples_from_class = []

  # One-vs-rest evaluation: for each label build a binary gold vector and a
  # binary prediction vector over all samples, in the same sample order.
  for label_to_evaluate in results["data"].keys():
    keywise_one_hot_encoded_vector = []
    predictions_for_current_label = []
    indexes.append(label_to_evaluate)
    for key in results["data"].keys():
      gold_flag = 1 if label_to_evaluate == key else 0
      keywise_one_hot_encoded_vector += [gold_flag] * len(results["data"][key])
      predictions_for_current_label += [1 if sample["prediction"] == label_to_evaluate else 0 for sample in results["data"][key]]

    key_precision = precision_score(keywise_one_hot_encoded_vector, predictions_for_current_label, zero_division=0)
    key_recall = recall_score(keywise_one_hot_encoded_vector, predictions_for_current_label, zero_division=0)
    key_f1_score = f1_score(keywise_one_hot_encoded_vector, predictions_for_current_label, zero_division=0)
    samples_per_class = results["results"]["label_wise"][label_to_evaluate]['samples_per_class']
    precisions.append(key_precision)
    recalls.append(key_recall)
    f1_scores.append(key_f1_score)
    number_of_samples_from_class.append(samples_per_class)

    results['results']['label_wise'][label_to_evaluate] = {"precision": key_precision,
                                                      "recall": key_recall,
                                                      "f1_score": key_f1_score,
                                                      "samples_per_class": samples_per_class}

  # macro_weighted_averages returns exactly the six summary keys stored below.
  results["results"]["overall"] = {
        "correctly_classified_labels_of_interest": green_samples_counter,
        "incorrectly_classified_labels_of_interest": red_samples_counter,
        "correctly_classified_NOTA_samples": nota_green_samples_counter,
        "incorrectly_classified_NOTA_samples": nota_red_samples_counter,
        **macro_weighted_averages(results)
  }

  with open(output_path, 'w') as fp:
    json.dump(results, fp)

  df = pd.DataFrame({"precision": precisions,
                    "recall": recalls,
                    "f1_score": f1_scores,
                    "num of samples": number_of_samples_from_class},
                    index=(indexes)).sort_values('f1_score', ascending=False)

  return df

In [None]:
# Ensemble run (BART + DeBERTa + T5); `df_ensemble` is the per-label metrics
# table returned by run_experiment_ensemble.
df_ensemble = run_experiment_ensemble('nli_util/CGDC_3.0_updated.json')

# KG Creation

In [None]:
# Collect knowledge-graph nodes and edges from the stored ensemble results.
# NOTE(review): run_experiment_ensemble writes 'results_ensemble_demoREAL.json',
# while this cell reads 'results_ensemble.json'; confirm the intended file.
with open('results_ensemble.json') as f:
  data = json.load(f)

entity_counter = {}  # entity ID -> number of non-NOTA samples mentioning it
entities = {}        # entity ID -> {"name": ..., "type": ...}
relations = set()    # "<head ID>#<tail ID>#<predicted relation>"
for key in data['data'].keys():
  for sample in data['data'][key]:
    # Samples without a predicted relation contribute nothing to the graph.
    if sample['prediction'] == "NOTA":
      continue

    # head / tail layout: index 0 = name, index 1 = ID ("N/A" when missing), index 2 = type.
    for mention in (sample['head'], sample['tail']):
      mention_id = mention[1]
      if mention_id == "N/A":
        continue
      entity_counter[mention_id] = entity_counter.get(mention_id, 0) + 1
      entities[mention_id] = {
          "name": mention[0],
          "type": mention[2]
      }

    # An edge needs an ID on both ends.
    if sample['head'][1] != "N/A" and sample['tail'][1] != "N/A":
      relations.add(f"{sample['head'][1]}#{sample['tail'][1]}#{sample['prediction']}")

def sortt(a):
  """Sort key: the count of an (entity ID, count) pair."""
  return a[1]

# (entity ID, count) pairs, most frequently mentioned entity first.
stats = sorted(entity_counter.items(), key=sortt, reverse=True)

In [None]:
# Function to generate Cypher code for creating entities and relations
def generate_cypher_relations(entities, relationships):
    """Build a single Cypher ``CREATE`` statement for the given nodes and edges.

    Args:
        entities: dict mapping an entity ID to ``{"name": ..., "type": ...}``.
            The ID is used both as the Cypher variable and as the ``ID``
            property; the type is used as the node label.
        relationships: iterable of ``"<head ID>#<tail ID>#<relation>"`` strings.

    Returns:
        The statement as a string: ``CREATE`` followed by one pattern per line,
        separated by commas. There is no comma after the last pattern (a
        trailing comma is not valid Cypher). An empty string is returned when
        there is nothing to create.
    """
    def quote(text):
        # Escape backslashes and double quotes so that property values cannot
        # terminate the surrounding double-quoted Cypher string literal.
        return str(text).replace("\\", "\\\\").replace('"', '\\"')

    patterns = [
        f'({entity_id}:{properties["type"]} {{ID: "{quote(entity_id)}", name: "{quote(properties["name"])}"}})'
        for entity_id, properties in entities.items()
    ]
    for relationship in relationships:
        # maxsplit=2 keeps everything after the second '#' as the relation name.
        head_id, tail_id, relation = relationship.split("#", 2)
        patterns.append(f"({head_id})-[:{relation}]-> ({tail_id})")

    if not patterns:
        return ""
    return "CREATE\n" + ",\n".join(patterns)

# Generate the Cypher statement from the `entities` dict and `relations` set
# built in the previous cell, then print it so it can be copied out of the notebook.
cypher = generate_cypher_relations(entities, relations)

print(cypher)


CREATE
(Q104822257:Location {ID: "Q104822257", name: "Promenade Penzance"}),
(Q17644344:Misc {ID: "Q17644344", name: "Bleu Bridge inscribed stone"}),
(Q5617932:Location {ID: "Q5617932", name: "Gulval"}),
(Q30524253:Location {ID: "Q30524253", name: "King's Hall"}),
(Q22677638:Misc {ID: "Q22677638", name: "Ned and donkey, Pool Park"}),
(Q496368:Location {ID: "Q496368", name: "Wrexham"}),
(Q20594046:Misc {ID: "Q20594046", name: "Cofiwch Dryweryn"}),
(Q881062:Location {ID: "Q881062", name: "Blaenau Ffestiniog"}),
(Q2745177:Misc {ID: "Q2745177", name: "National Eisteddfod"}),
(Q1186888:Location {ID: "Q1186888", name: "Denbigh"}),
(Q748065:Location {ID: "Q748065", name: "Caerphilly"}),
(Q207176:Location {ID: "Q207176", name: "Monmouthshire"}),
(Q675460:Location {ID: "Q675460", name: "Machynlleth"}),
(Q2233618:Location {ID: "Q2233618", name: "Llanrwst"}),
(Q66362702:Location {ID: "Q66362702", name: "London Road Presbyterian Church"}),
(Q25:Location {ID: "Q25", name: "Wales"}),
(Q29492351:Loca