In [1]:
from transformers import pipeline
import math
import torch
import IPython
from typing import Dict
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk

import json
import requests
import time

In [2]:
# Function to parse the generated text and extract the triplets
def extract_triplets_typed(text):
    triplets = []
    relation = ''
    text = text.strip()
    current = 'x'
    subject, relation, object_, object_type, subject_type = '','','','',''
    
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").replace("tp_XX", "").replace("__en__", "").split():
        if token == "<triplet>" or token == "<relation>":
            current = 't'
            if relation != '':
                triplets.append({
                        'Subject': subject.strip(), 
                        # 'head_type': subject_type, 
                        'Predicate': relation.strip(),
                        'Object': object_.strip(), 
                        # 'tail_type': object_type
                })
                relation = ''
            subject = ''
        elif token.startswith("<") and token.endswith(">"):
            if current == 't' or current == 'o':
                current = 's'
                if relation != '':
                    triplets.append({
                        'Subject': subject.strip(), 
                        'head_type': subject_type, 
                        'Predicate': relation.strip(),
                        'Object': object_.strip(), 
                        'tail_type': object_type
                    })
                object_ = ''
                subject_type = token[1:-1]
            else:
                current = 'o'
                object_type = token[1:-1]
                relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '' and object_type != '' and subject_type != '':
        triplets.append({
            'Subject': subject.strip(), 
            # 'head_type': subject_type, 
            'Predicate': relation.strip(),
            'Object': object_.strip(), 
            # 'tail_type': object_type
        })
    return triplets

In [3]:
def get_relation(example: Dict) -> str:
    """
    Extracts and structures relations from a single example within a English dataset.

    Args:
        example (dict): A dictionary containing entities and relations.

    Returns:
        str: A string representation of the extracted relations.

    Example:
        Given an 'example' dictionary containing 'entities' and 'relations', this function
        extracts and structures relations, returning them as a string.

    """
    entities_ls = example["entities"]
    relations = []
    for relation in example["relations"]:
        relation_dict = {}
        object_index = relation["object"]
        relation_dict["Object"] = entities_ls[object_index]["surfaceform"]
        relation_dict["Predicate"] = relation["predicate"]
        subject_index = relation["subject"]
        relation_dict["Subject"] = entities_ls[subject_index]["surfaceform"]
        relations.append(relation_dict)

    return relations

In [4]:
def get_ground_truth(test_example):
    try:
        prompt, _ = tuple(test_example["text"].split("### ENTITES:"))
    except:
        prompt, _ = tuple(test_example["text"].split("### ENTITIES:"))
    prompt+="### RELATIONS:\n"
    test_example["prompt"] = prompt
    test_example["ground_truth"]=get_relation(test_example)
    return test_example

In [5]:
def get_true_positive(test_example):
    gt_ls = test_example["ground_truth"]
    pred_ls = test_example["prediction_dict"]
    true_positive = 0
    for pred in pred_ls:
        if pred in gt_ls:
            true_positive+=1 
    test_example["correct"] = true_positive
    test_example["guess"] = len(test_example["ground_truth"])
    test_example["gold"] = len(test_example["prediction_dict"])
    return test_example

# Test with code from model card

In [6]:
triplet_extractor = pipeline(
    'translation_xx_to_yy', 
    model='Babelscape/mrebel-large', 
    tokenizer='Babelscape/mrebel-large', 
    device_map="auto"
    # device=-1
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
sentences = [
    "Albert Einstein a développé la théorie de la relativité.",
    "La tour Eiffel se trouve à Paris.", 
    "Steve Jobs a cofondé Apple Inc.", 
    "L'eau bout à 100 degrés Celsius.",
    "Malgré les controverses, la théorie de l'évolution, élaborée par Charles Darwin au XIXe siècle, demeure le fondement de la biologie moderne.", 
    "La singularité gravitationnelle au cœur des trous noirs, où la densité atteint l'infini, défie notre compréhension actuelle de la physique théorique.",
    "La neuroplasticité, un concept révolutionnaire dans les neurosciences, suggère que le cerveau peut se remodeler et s'adapter tout au long de la vie.",
    "Les enjeux éthiques entourant la manipulation génétique et le clonage humain exigent une réflexion approfondie de la part de la société contemporaine."
]

for sentence in sentences:
    extracted_text = triplet_extractor.tokenizer.batch_decode(
        [triplet_extractor(
            sentence, 
            decoder_start_token_id=250058, 
            src_lang="fr_XX", 
            tgt_lang="<triplet>", 
            return_tensors=True, 
            return_text=False
        )[0]["translation_token_ids"]]
    )
    extracted_triplets = extract_triplets_typed(extracted_text[0])
    print(extracted_triplets)

[{'Subject': 'Albert Einstein', 'Predicate': 'notable work', 'Object': 'théorie de la relativité'}, {'Subject': 'théorie de la relativité', 'Predicate': 'discoverer or inventor', 'Object': 'Albert Einstein'}]
[{'Subject': 'tour Eiffel', 'Predicate': 'located in the administrative territorial entity', 'Object': 'Paris'}]
[{'Subject': 'Steve Jobs', 'Predicate': 'employer', 'Object': 'Apple Inc.'}, {'Subject': 'Apple Inc.', 'Predicate': 'founded by', 'Object': 'Steve Jobs'}]
[{'Subject': '100 degrés Celsius', 'Predicate': 'duration', 'Object': '100'}]
[{'Subject': 'évolution', 'Predicate': 'discoverer or inventor', 'Object': 'Charles Darwin'}, {'Subject': 'Charles Darwin', 'Predicate': 'field of work', 'Object': 'biologie'}]
[{'Subject': 'singularité gravitationnelle', 'Predicate': 'part of', 'Object': 'trous noirs'}]
[{'Subject': 'neurosciences', 'Predicate': 'studies', 'Object': 'cerveau'}]
[{'Subject': 'clonage humain', 'Predicate': 'subclass of', 'Object': 'manipulation génétique'}]


In [34]:

# We need to use the tokenizer manually since we need special tokens.
# sentence = "Malgré son manque d'expérience, le jeune artiste a peint un tableau extraordinaire qui a ébloui les critiques d'art les plus sévères."
# sentence = "The introduction of co-feedstocks increased the CH4 production"
sentence = "L'Opel GTC Concept, abrégé \"Opel Gran Turismo Coupé Concept\", est un concept car produit par Opel depuis 2007. Il a été dévoilé au Salon de Genève."
# sentence = "The cat eats the mouse."
extracted_text = triplet_extractor.tokenizer.batch_decode(
    [triplet_extractor(
        sentence, 
        decoder_start_token_id=250058, 
        src_lang="fr_XX", 
        tgt_lang="<triplet>", 
        return_tensors=True, 
        return_text=False
    )[0]["translation_token_ids"]]
) # change en_XX for the language of the source.
print(extracted_text[0])

extracted_triplets = extract_triplets_typed(extracted_text[0])
print(extracted_triplets)

tp_XX<triplet> Opel GTC Concept <misc> concept car <misc> instance of <misc> Opel <concept> manufacturer</s>
[{'Subject': 'Opel GTC Concept', 'head_type': 'misc', 'Predicate': 'instance of', 'Object': 'concept car', 'tail_type': 'misc'}, {'Subject': 'Opel GTC Concept', 'Predicate': 'manufacturer', 'Object': 'Opel'}]


In [27]:
def get_prediction(test_example):
    try:
        sentence = test_example["text"].split("### TEXTE:")[1].split("\n\n")[0]
    except:
        sentence = test_example["text"].split("### TEXT:")[1].split("\n\n")[0]
    extracted_text = triplet_extractor.tokenizer.batch_decode(
        [triplet_extractor(
            sentence, 
            decoder_start_token_id=250058, 
            src_lang="fr_XX", 
            tgt_lang="<triplet>", 
            return_tensors=True, 
            return_text=False
        )[0]["translation_token_ids"]]
    ) # change en_XX for the language of the source.
    # print(extracted_text[0])
    triplets = extract_triplets_typed(extracted_text[0])
    # print(triplets)
    test_example["prediction_dict"] = triplets
    return test_example

In [7]:
dataset_path = "/home/xli/Documents/Digital safety/src/notebooks/finetune/datasets/bright-shape-68/SREDFM-dataset:v5/test/"
test_dataset = load_from_disk(dataset_path)
GT_test_dataset = test_dataset.map(get_ground_truth)
GT_test_dataset

Dataset({
    features: ['docid', 'title', 'uri', 'text', 'entities', 'relations', 'prompt', 'ground_truth'],
    num_rows: 2525
})

In [28]:
GT_pred_dataset = GT_test_dataset.map(get_prediction)
GT_pred_dataset

Map:   0%|          | 0/2525 [00:00<?, ? examples/s]

Dataset({
    features: ['docid', 'title', 'uri', 'text', 'entities', 'relations', 'prompt', 'ground_truth', 'prediction_dict'],
    num_rows: 2525
})

In [29]:
metrics_dataset = GT_pred_dataset.map(get_true_positive)

correct = sum(metrics_dataset["correct"])
guess = sum(metrics_dataset["guess"])
gold = sum(metrics_dataset["gold"])

precision = float(correct)/float(guess)
recall = float(correct)/float(gold)
f1_score = 2*precision*recall/(precision+recall)
print(precision, recall, f1_score)

Map:   0%|          | 0/2525 [00:00<?, ? examples/s]

0.8313941188914745 0.6137747930657504 0.706199460916442


In [32]:
print(GT_test_dataset[0]["text"])


Vous êtes un expert en data science et en traitement du langage naturel(NLP).
Votre tâche consiste à extraire les triplets du TEXTE fourni ci-dessous.
Les entité s'agit du sujet et de l'objet d'une phrase, la liste d'entités doit être sous forme:
['entité1', 'entité2', 'entité3', ...]
Un triplet de connaissances est constitué de 2 entités (sujet et objet) liées par un prédicat : 
{"Objet": "","Prédicat": "", "Sujet": "" }
Les triples multiples doivent être sous forme de liste.

### TEXTE:
L'Opel GTC Concept, abrégé "Opel Gran Turismo Coupé Concept", est un concept car produit par Opel depuis 2007. Il a été dévoilé au Salon de Genève.

### ENTITES:
["Opel GTC Concept", "Salon de Genève", "Opel", "2007", "concept car"]

### RELATIONS:
[{"Object": "concept car", "Predicate": "instance of", "Subject": "Opel GTC Concept"}, {"Object": "Opel", "Predicate": "manufacturer", "Subject": "Opel GTC Concept"}]</s>




In [21]:
pred = get_prediction(GT_test_dataset[0])["prediction_dict"]
for i in range(len(pred)):
    if pred[i] in GT_test_dataset[0]["ground_truth"]:
        print("true")

true
true


In [None]:
# Here we set English ("en_XX") as source language. To change the source language swap the first token of the input for your desired language or change to supported language. For catalan ("ca_XX") or greek ("el_EL") (not included in mBART pretraining) you need a workaround:
# tokenizer._src_lang = "ca_XX"

In [None]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large", src_lang="fr_XX", tgt_lang="tp_XX") 
# Here we set English ("en_XX") as source language. To change the source language swap the first token of the input for your desired language or change to supported language. For catalan ("ca_XX") or greek ("el_EL") (not included in mBART pretraining) you need a workaround:
# tokenizer._src_lang = "ca_XX"
# tokenizer.cur_lang_code_id = tokenizer.convert_tokens_to_ids("ca_XX")
# tokenizer.set_src_lang_special_tokens("ca_XX")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large")

# device = "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"

gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 3,
    "num_return_sequences": 3,
    "forced_bos_token_id": None,
}

# Text to extract triplets from
# text = 'Le chat mange la souris.'
text = "le chercheur renommé a présenté sa recherche novatrice sur l'apprentissage automatique"

# Tokenizer text
model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt').to(device)

# Generate
model = model.to(device)
generated_tokens = model.generate(
    model_inputs["input_ids"].to(model.device),
    attention_mask=model_inputs["attention_mask"].to(model.device),
    decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
    **gen_kwargs,
)

# Extract text
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

# Extract triplets
for idx, sentence in enumerate(decoded_preds):
    print(f'Prediction triplets sentence {idx}')
    print(extract_triplets_typed(sentence))


# Test with medium post

In [None]:
def extract_relations_from_model_output(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return relations


In [None]:
class KB():
    def __init__(self):
        self.relations = []

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relation(self, r):
        if not self.exists_relation(r):
            self.relations.append(r)

    def print(self):
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
            
    def merge_relations(self, r1):
        r2 = [r for r in self.relations
              if self.are_relations_equal(r1, r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def add_relation(self, r):
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

In [None]:
def from_small_text_to_kb(text, verbose=False):
    kb = KB()
    
    tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large", src_lang="fr_XX", tgt_lang="tp_XX") 
    model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large")

#     device = "cpu"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tokenizer text
    model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt').to(device)

    # Generate
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3,
        "forced_bos_token_id": None,
    }

    model = model.to(device)
    generated_tokens = model.generate(
        model_inputs["input_ids"].to(model.device),
        attention_mask=model_inputs["attention_mask"].to(model.device),
        decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
        **gen_kwargs,
    )

    # Extract text
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # create kb
    for sentence_pred in decoded_preds:
        print(sentence_pred)
        relations = extract_triplets_typed(sentence_pred)
        for r in relations:
            kb.add_relation(r)

    return kb

In [None]:
def from_text_to_kb(text, span_length=128, verbose=False):
    kb = KB()
    
    tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large", src_lang="fr_XX", tgt_lang="tp_XX") 
    model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large")

#     device = "cpu"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt").to(device)

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * span_length - num_tokens) / 
                        max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        spans_boundaries.append([start + span_length * i,
                                 start + span_length * (i + 1)])
        start -= overlap
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    model = model.to(device)
    generated_tokens = model.generate(
        decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
        **inputs,
        **gen_kwargs,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = KB()
    i = 0
    for sentence_pred in decoded_preds:
        current_span_index = i // num_return_sequences
        relations = extract_triplets_typed(sentence_pred)
        print(relations)
        for relation in relations:
            relation["meta"] = {
                "spans": [spans_boundaries[current_span_index]]
            }
            kb.add_relation(relation)
        i += 1

    return kb

In [None]:
text = "Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 " \
"May 1821), and later known by his regnal name Napoleon I, was a French military " \
"and political leader who rose to prominence during the French Revolution and led " \
"several successful campaigns during the Revolutionary Wars. He was the de facto " \
"leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, " \
"he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's " \
"political and cultural legacy has endured, and he has been one of the most " \
"celebrated and controversial leaders in world history."

kb = from_small_text_to_kb(text, verbose=True)
kb.print()

In [None]:
text = """
Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 May 1821), and later known by his regnal name Napoleon I, was a French military and political leader who rose to prominence during the French Revolution and led several successful campaigns during the Revolutionary Wars. He was the de facto leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's political and cultural legacy has endured, and he has been one of the most celebrated and controversial leaders in world history. Napoleon was born on the island of Corsica not long after its annexation by the Kingdom of France.[5] He supported the French Revolution in 1789 while serving in the French army, and tried to spread its ideals to his native Corsica. He rose rapidly in the Army after he saved the governing French Directory by firing on royalist insurgents. In 1796, he began a military campaign against the Austrians and their Italian allies, scoring decisive victories and becoming a national hero. Two years later, he led a military expedition to Egypt that served as a springboard to political power. He engineered a coup in November 1799 and became First Consul of the Republic. Differences with the British meant that the French faced the War of the Third Coalition by 1805. Napoleon shattered this coalition with victories in the Ulm Campaign, and at the Battle of Austerlitz, which led to the dissolving of the Holy Roman Empire. In 1806, the Fourth Coalition took up arms against him because Prussia became worried about growing French influence on the continent. Napoleon knocked out Prussia at the battles of Jena and Auerstedt, marched the Grande Armée into Eastern Europe, annihilating the Russians in June 1807 at Friedland, and forcing the defeated nations of the Fourth Coalition to accept the Treaties of Tilsit. Two years later, the Austrians challenged the French again during the War of the Fifth Coalition, but Napoleon solidified his grip over Europe after triumphing at the Battle of Wagram. Hoping to extend the Continental System, his embargo against Britain, Napoleon invaded the Iberian Peninsula and declared his brother Joseph King of Spain in 1808. The Spanish and the Portuguese revolted in the Peninsular War, culminating in defeat for Napoleon's marshals. Napoleon launched an invasion of Russia in the summer of 1812. The resulting campaign witnessed the catastrophic retreat of Napoleon's Grande Armée. In 1813, Prussia and Austria joined Russian forces in a Sixth Coalition against France. A chaotic military campaign resulted in a large coalition army defeating Napoleon at the Battle of Leipzig in October 1813. The coalition invaded France and captured Paris, forcing Napoleon to abdicate in April 1814. He was exiled to the island of Elba, between Corsica and Italy. In France, the Bourbons were restored to power. However, Napoleon escaped Elba in February 1815 and took control of France.[6][7] The Allies responded by forming a Seventh Coalition, which defeated Napoleon at the Battle of Waterloo in June 1815. The British exiled him to the remote island of Saint Helena in the Atlantic, where he died in 1821 at the age of 51. Napoleon had an extensive impact on the modern world, bringing liberal reforms to the many countries he conquered, especially the Low Countries, Switzerland, and parts of modern Italy and Germany. He implemented liberal policies in France and Western Europe.
"""

kb = from_text_to_kb(text, verbose=True)
kb.print()

## Load PDF

In [None]:
import os, glob
import pypdf
from pathlib import Path

In [None]:
def extract_text(file_path: Path):
    with open(file_path, "rb") as fp:
        # Create a PDF object
        pdf = pypdf.PdfReader(fp)
        # Get the number of pages in the PDF document
        num_pages = len(pdf.pages)

        # Iterate over every page
        docs = []
        for page in range(num_pages):
            # Extract the text from the page
            page_text = pdf.pages[page].extract_text()
            if len(page_text)!=0:
                page_label = pdf.page_labels[page]
                metadata = {"page_label": page_label, "file_name": file_path.name}
                docs.append({"text":page_text, "metadata":metadata})
        return docs

In [None]:
def get_pdf_filepaths(folder_path):
    file_paths = []
    for root, _, files in os.walk(folder_path):
        for file in glob.glob(root+"/*.pdf"):
            file_paths.append(os.path.join(root, file))
    return file_paths

In [None]:
doc_path = Path("/mnt/c/Users/xli.ASSYSTEM/Documents/Digital safety/data/Données Digital Safety")
file_paths = get_pdf_filepaths(doc_path)
text = []
for file in file_paths:
    docs = extract_text(Path(file))
    if len(docs)!=0:
        text = docs.copy()
        break
print(text)


In [None]:
# test = "L’exploitant veillera notamment à ce que le personnel affecté aux opérations de MAD/DEM et de RCD de l’usine UP1 possède les aptitudes professionnelles normalement requises, ait reçu une formation appropriée et dispose des habilitations et des moyens de surveillance (individuels ou collectifs) adaptés aux risques présentés par les opérations réalisées."
# test = "Le chat mange la souris"
test = f"""Malgré les défis économiques et politiques, l'entreprise a réussi à développer une technologie innovante qui pourrait transformer l'industrie."""
time_start=time.time()

kb = from_small_text_to_kb(test, verbose=True)
time_end=time.time()
print('time cost',time_end-time_start,'s')
kb.print()

In [None]:
import requests
prompt = f"""
>>CONTEXT<<
Tu est un expert en NLP, tu as pour tâche d'extraire les triples (sujet, prédicat, objet) présenté dans des phrases données.
Voici un exemple :
Phrase : "Le chien mange une croquette."
Sujet : Le chien
Prédicat : mange
Objet : une croquette
Voici un autre exemple plus compliqué : 
Phrase : "le chercheur renommé a présenté sa recherche novatrice sur l'apprentissage automatique"
Sujet : Le chercheur
Prédicat : a présenté
Objet : sa recherche
>>QUESTION<< Quel est le sujet, le prédicat, et l'objet dans cette phrase : {test} ?  
>>ANSWER<<
"""
data = {"prompt": prompt, "temperature": 0.1}
time_start=time.time()
res = requests.post("http://127.0.0.1:8080/v1/models/model:predict", json=data)
time_end=time.time()
print('time cost',time_end-time_start,'s')
print(res.json()["data"]["generated_text"])

## Reader

In [None]:
response_reader = "/mnt/c/Users/xli.ASSYSTEM/Documents/Digital safety/data/response_UP1-RG-SUR-0288_1_UP1_PT_VF.json"

with open(response_reader, 'r') as json_file:
    response = json.load(json_file)
    print(response)


## Test with REDFM

In [None]:
import sys
sys.path.append("../../")
from rebel.src.score import re_score
import itertools

In [None]:
def convert_relation(example):
    relations = []
    for relation in example['relations']:
        tail = (relation['object']['start'],relation['object']['end'])
        head = (relation['subject']['start'],relation['subject']['end'])
        predicate = RELATION_NAMES[relation['predicate']]
        relations.append(
            {
                "head": (relation['subject']['start'],relation['subject']['end']),
                "tail": (relation['object']['start'],relation['object']['end']),
                "head_type": relation['subject']['type'],
                "tail_type": relation['subject']['type'],
                "type": RELATION_NAMES[relation['predicate']]
            }
        )
    return relations

In [None]:
def predict_relation(text, tokenizer, model, verbose=False):
#     device = "cpu"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tokenizer text
    model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt').to(device)

    # Generate
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3,
        "forced_bos_token_id": None,
    }

    model = model.to(device)
    generated_tokens = model.generate(
        model_inputs["input_ids"].to(model.device),
        attention_mask=model_inputs["attention_mask"].to(model.device),
        decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
        **gen_kwargs,
    )

    # Extract text
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # create kb
    relations = []
    for sentence_pred in decoded_preds:
        print(sentence_pred)
        relations.append(extract_triplets_typed(sentence_pred))
    # Extract triplets
    for idx, sentence in enumerate(decoded_preds):
        print(f'Prediction triplets sentence {idx}')
        re = extract_triplets_typed(sentence)
#         print(len(re))
#     relations = list(itertools.chain(*relations))
    return relations

In [None]:
from datasets import load_dataset
dataset = load_dataset("Babelscape/REDFM", 'fr')

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large", src_lang="fr_XX", tgt_lang="tp_XX") 
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large")

In [None]:
dataset["validation"][0]["text"]

In [None]:
predict_relation(dataset["validation"][0]["text"], tokenizer, model, verbose=True)

In [None]:
gold_relations = [convert_relation(example) for example in dataset["validation"]]
gold_relations = list(itertools.chain(*gold_relations))

In [None]:
[re["predicate"] for re in dataset["validation"][0]["relations"]]

In [None]:
pred_relations = [from_small_text_to_kb(example["text"]).relations for example in dataset["validation"]]

In [None]:
dataset["validation"][0]

In [None]:
dataset["validation"][0]["relations"]

In [1]:
RELATION_NAMES=['country', 'place of birth', 'spouse', 'country of citizenship', 'instance of',
            'capital', 'child', 'shares border with', 'author', 'director', 'occupation',
              'founded by', 'league', 'owned by', 'genre', 'named after', 'follows',
                'headquarters location', 'cast member', 'manufacturer',
                  'located in or next to body of water', 'location', 'part of', 
                  'mouth of the watercourse', 'member of', 'sport', 'characters',
                    'participant', 'notable work', 'replaces', 'sibling', 'inception']

In [None]:
def get_relation(example):

    relations = []
    for relation in example['relations']:
        object = relation['object']['surfaceform']
        subject = relation['subject']['surfaceform']
        predicate = RELATION_NAMES[relation['predicate']]
        relations.append(f"[’{subject}’, ’{predicate}’, ’{object}’]")

 
    return ' | '.join(relations)

In [None]:
def get_entity(example):

    relations = []
    for relation in example['entities']:
        object = relation['object']['surfaceform']
        subject = relation['subject']['surfaceform']
        predicate = RELATION_NAMES[relation['predicate']]
        relations.append(f"[’{subject}’, ’{predicate}’, ’{object}’]")

 
    return ' | '.join(relations)

In [None]:
example = dataset["validation"][0].copy()
print(get_relation(example))