In [2]:
import json
from datasets import load_from_disk, load_dataset
from IPython.display import Markdown, display

In [4]:
dataset_path = "./datasets/eager-rain-77/SREDFM-dataset:v11/test/"
test_dataset = load_from_disk(dataset_path)

In [7]:
print(test_dataset[11]["text"])

### Instruction:
You are an expert in data science and natural language processing (NLP).
Your task is to extract entities from the text provided below.
Entities are the subject and object of a sentence, the list of entities must be in the form:
['entity1', 'entity2', 'entity3', ...]
Text: Henk Prinsen (born 25 December 1951) is a Dutch racing cyclist. He rode in the 1974 Tour de France.

### Response:
Entities: ["1974", "racing cyclist", "1974 Tour de France", "Henk Prinsen", "25 December 1951"]</s>

### Instruction:
You are an expert in data science and natural language processing (NLP).
Your task is to extract triplets from the text provided below.
A knowledge triplet is made up of 2 entities (subject and object) linked by a predicate: 
{"Object": "", "Predicate": "", "Subject": "" }
Multiple triplets must be in list form.

### Response:
Relations: [{"Object": "racing cyclist", "Predicate": "sport", "Subject": "Henk Prinsen"}, {"Object": "25 December 1951", "Predicate": "date of bir

In [3]:
def string_to_dict(test_example):
    string = test_example["prediction"]
    string = string.split("### RELATIONS:\n")[1]
    string = string.replace("</s>", "").replace("\n", "")
    string = string.strip('[] \n')
    relations = string.split("},")
    extracted_relations = []
    for r in relations:
        relation = {}
        r = r.strip("\{\} ")
        k_vs = r.split("', '")
        for k_v in k_vs:
            k, v = tuple(k_v.split("': "))
            relation[k.strip("'\" ")] = v.strip("'\" ")
        extracted_relations.append(relation)
    test_example["prediction_dict"] = extracted_relations
    return test_example

In [16]:
GT_pred_dataset_quantized = load_from_disk("./datasets/GT_pred_dataset_quantized")
GT_pred_dataset_quantized_final = GT_pred_dataset_quantized.map(string_to_dict)
GT_pred_dataset_quantized_final = GT_pred_dataset_quantized_final.rename_column("grand_truth", "ground_truth")
GT_pred_dataset_quantized_final

Dataset({
    features: ['text', 'prompt', 'ground_truth', 'prediction', 'prediction_dict'],
    num_rows: 2495
})

In [17]:
GT_pred_dataset_bfloat16 = load_from_disk("./datasets/GT_pred_dataset_bfloat16")
GT_pred_dataset_bfloat16_final = GT_pred_dataset_quantized.map(string_to_dict)
GT_pred_dataset_bfloat16_final = GT_pred_dataset_bfloat16_final.rename_column("grand_truth", "ground_truth")
GT_pred_dataset_bfloat16_final

Dataset({
    features: ['text', 'prompt', 'ground_truth', 'prediction', 'prediction_dict'],
    num_rows: 2495
})

In [35]:
def calculate_precision(annotation, prediction):
    true_positives = len(set(annotation).intersection(set(prediction)))
    false_positives = len(prediction) - true_positives
    return true_positives / (true_positives + false_positives)

def calculate_recall(annotation, prediction):
    true_positives = len(set(annotation).intersection(prediction))
    false_negatives = len(annotation) - true_positives
    return true_positives / (true_positives + false_negatives)

def calculate_f1_score(annotation, prediction):
    precision = calculate_precision(annotation, prediction)
    recall = calculate_recall(annotation, prediction)
    return 2 * (precision * recall) / (precision + recall)


In [39]:
gt = GT_pred_dataset_bfloat16_final["ground_truth"]
pred = GT_pred_dataset_bfloat16_final["prediction_dict"]
gt

[[{'Object': None,
   'Objet': 'rameuse',
   'Predicate': None,
   'Prédicat': 'sport',
   'Subject': None,
   'Subjet': 'Leonie Pieper'},
  {'Object': None,
   'Objet': 'allemande',
   'Predicate': None,
   'Prédicat': 'country of citizenship',
   'Subject': None,
   'Subjet': 'Leonie Pieper'},
  {'Object': None,
   'Objet': '24 août 1992',
   'Predicate': None,
   'Prédicat': 'date of birth',
   'Subject': None,
   'Subjet': 'Leonie Pieper'}],
 [{'Object': None,
   'Objet': 'astéroïde',
   'Predicate': None,
   'Prédicat': 'instance of',
   'Subject': None,
   'Subjet': '(161278) Cesarmendoza'},
  {'Object': None,
   'Objet': 'ceinture principale',
   'Predicate': None,
   'Prédicat': 'minor planet group',
   'Subject': None,
   'Subjet': '(161278) Cesarmendoza'}],
 [{'Object': 'Site of Special Scientific Interest',
   'Objet': None,
   'Predicate': 'heritage designation',
   'Prédicat': None,
   'Subject': 'Mwyngloddfa Nant-y-cagl (Eaglebrook Mine)',
   'Subjet': None},
  {'Object':

In [44]:
display(Markdown(GT_pred_dataset_bfloat16_final[0]["prompt"]))


Vous êtes un expert en data science et en traitement du langage naturel(NLP).
Votre tâche consiste à extraire les triplets du TEXTE fourni ci-dessous.
Les entité s'agit du sujet et de l'objet d'une phrase, la liste d'entités doit être sous forme:
['entité1', 'entité2', 'entité3', ...]
Un triplet de connaissances est constitué de 2 entités (sujet et objet) liées par un prédicat : 
{"Objet": "","Prédicat": "", "Sujet": "" }
Les triples multiples doivent être sous forme de liste.

### TEXTE:
Leonie Pieper, née le 24 août 1992, est une rameuse allemande.</s>

### RELATIONS:


In [45]:
GT_pred_dataset_bfloat16_final[0]["ground_truth"]

[{'Object': None,
  'Objet': 'rameuse',
  'Predicate': None,
  'Prédicat': 'sport',
  'Subject': None,
  'Subjet': 'Leonie Pieper'},
 {'Object': None,
  'Objet': 'allemande',
  'Predicate': None,
  'Prédicat': 'country of citizenship',
  'Subject': None,
  'Subjet': 'Leonie Pieper'},
 {'Object': None,
  'Objet': '24 août 1992',
  'Predicate': None,
  'Prédicat': 'date of birth',
  'Subject': None,
  'Subjet': 'Leonie Pieper'}]

In [43]:
GT_pred_dataset_bfloat16_final[2]["prediction_dict"]

[{'Object': 'Ceredigion',
  'Objet': None,
  'Predicate': 'located in the administrative territorial entity',
  'Prédicat': None,
  'Subject': 'Mwyngloddfa Nant-y-cagl',
  'Subjet': None}]