# *Notebook* à utiliser pour faire le travail pratique # 3 sur l'analyse d'incidents.





## Imports

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json
import torch
from torch.utils.data import Dataset
from collections import Counter
import regex as re
import string
import numpy as np

## Chargement du modèle et du tokenizer

In [2]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device : {device}")

# Load the pretrained T5 tokenizer and model.
tokenizer = T5Tokenizer.from_pretrained('t5-large')
# The generation helpers in this notebook move their input tensors to `device`,
# so the model has to live on the same device; otherwise `generate` fails with a
# device-mismatch error as soon as a GPU is available.
model = T5ForConditionalGeneration.from_pretrained('t5-large').to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Device : cpu


In [3]:
# Load the development examples from the JSON file.
file_path = 'data/dev_examples.json'
# Decode explicitly as UTF-8 so that reading does not depend on the platform's
# default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [4]:
def create_input_data(data):
    """Turn annotated examples into (input prompt, target) string pairs.

    Each element of `data` must provide a 'text' entry and an 'arguments'
    mapping. One pair is produced per key of 'arguments':

    * the prompt is "analyze: <text> <extra_id_0> <key>";
    * the target is the string form of the mapped value followed by
      " <extra_id_1>".

    Returns a list of 2-tuples, in iteration order of the examples and of
    their arguments.
    """
    pairs = []
    for record in data:
        context, slots = record['text'], record['arguments']
        pairs.extend(
            (f"analyze: {context} <extra_id_0> {role}", f"{filler} <extra_id_1>")
            for role, filler in slots.items()
        )
    return pairs

# Build the (input prompt, target) pairs that will be given to the model.
dataset = create_input_data(data)

# Print the first pair as a quick check of the prompt/target format.
print(dataset[0])



# Fonction d'évaluation des modèles

In [None]:
# Calcul du score d'un modèle
def normalize_answer(s):
    """Normalize an answer string before comparison.

    Steps, in order: lowercase, delete every character found in
    `string.punctuation`, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs into single spaces
    (which also trims both ends).
    """
    lowered = s.lower()
    # Deleting through a translation table removes the same characters as
    # filtering them out one by one.
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a single reference answer.

    Both strings are normalized with `normalize_answer` and split on
    whitespace. If either side has no tokens, the result is 1 when both are
    empty and 0 otherwise. With no shared token the result is 0. Otherwise
    the harmonic mean of token precision and recall is returned, where shared
    tokens are counted with multiplicity.
    """
    predicted = normalize_answer(prediction).split()
    reference = normalize_answer(ground_truth).split()

    if not reference or not predicted:
        return int(reference == predicted)

    # Multiset intersection: a token counts as many times as it appears on both sides.
    overlap = sum((Counter(predicted) & Counter(reference)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(predicted)
    recall = 1.0 * overlap / len(reference)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after `normalize_answer`."""
    normalized_prediction = normalize_answer(prediction)
    normalized_reference = normalize_answer(ground_truth)
    return normalized_prediction == normalized_reference

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Best score of `prediction` against any of the acceptable answers.

    `ground_truths` is an iterable of reference answers, since several
    answers may be valid. `metric_fn(prediction, ground_truth)` is evaluated
    for each of them and the maximum is returned. An empty `ground_truths`
    raises ValueError (from `max`).
    """
    return max([metric_fn(prediction, reference) for reference in ground_truths])

# Modèle de masquage

In [None]:
def fill_in_the_blank(text):
    """Generate a completion for a text that contains a mask.

    `text` is prefixed with "fill in the blank: ", tokenized with the
    notebook-level `tokenizer`, moved to `device` and passed to
    `model.generate` with its default settings. The first generated sequence
    is decoded with special tokens stripped and returned as a string.
    """
    prompt = f"fill in the blank: {text}"
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")
    generated = model.generate(encoded_prompt.to(device))
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Modèle Question-réponse

In [None]:
def answer_question(question, context):
    """Answer `question` from `context` with the notebook-level T5 model.

    The prompt "question: <question> context: <context>" is tokenized with
    `tokenizer`, moved to `device` and passed to `model.generate` with its
    default settings. The first generated sequence is decoded with special
    tokens stripped and returned as a string.
    """
    prompt = f"question: {question} context: {context}"
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")

    # Generate the answer and keep the first returned sequence.
    generated = model.generate(encoded_prompt.to(device))
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example usage: ask one question about a short context and print the model's answer.
question = "A human is smaller or taller than a cat?"
context = "This revue notice that a cat mesure 50cm in average and a human 1.70m."
output = answer_question(question, context)

print(output)

In [None]:
def _generate_from_prompt(input_text):
    """Run the notebook's T5 model on `input_text` as-is and return the decoded text.

    Uses the notebook-level `tokenizer`, `model` and `device`, with the same
    encode -> generate -> decode steps as the other generation helpers.
    """
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def evaluate_model(dataset, eval_fn=None, generate_fn=None, max_examples=1):
    """Generate predictions for `dataset` and score them with `eval_fn`.

    Args:
        dataset: iterable of items whose element 0 is the input prompt and
            element 1 the target text.
        eval_fn: optional metric called as `eval_fn(prediction, target_text)`,
            e.g. `f1_score` or `exact_match_score`. The target is passed
            unchanged. When None, predictions are only printed.
        generate_fn: optional callable mapping an input prompt to the
            predicted text. When None, a notebook-level `generate_answer`
            is used if one is defined; otherwise the prompt is sent to the
            T5 model directly.
        max_examples: maximum number of items to process. Defaults to 1,
            which is what this function did before (it stopped after the
            first item). Pass None to process the whole dataset.

    Returns:
        The mean of the scores returned by `eval_fn`, or None when no score
        was computed (no `eval_fn`, or nothing to evaluate).
    """
    if generate_fn is None:
        try:
            # Honour a user-defined generator when the notebook provides one.
            generate_fn = generate_answer
        except NameError:
            generate_fn = _generate_from_prompt

    scores = []
    for index, item in enumerate(dataset):
        if max_examples is not None and index >= max_examples:
            break
        input_text = item[0]
        target_text = item[1]

        output = generate_fn(input_text)
        print(f"Predicted: {output}, Target: {target_text}")
        if eval_fn is not None:
            scores.append(eval_fn(output, target_text))

    if not scores:
        return None
    return sum(scores) / len(scores)

In [None]:
# Run the evaluation on the prompt/target pairs built earlier in the notebook, using
# token-level F1 as the metric. `evaluate_model` takes (dataset, eval_fn), and
# `dataset` is the only dataset the notebook defines (`dataset_qa` never was).
evaluate_model(dataset, f1_score)