# *Notebook* à utiliser pour faire le travail pratique # 3 sur l'analyse d'incidents.





## Imports

In [1]:
# Importation des bibliothèques nécessaires
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
import torch
import json
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from collections import Counter
import string
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import re
from bleurt import score
import pandas as pd
import time
from statistics import mean




## Chargements Modèles & Tokenizers & Données

In [2]:
# Load the pretrained T5 checkpoint named by `model_t5_name` ("t5-Large", not t5-base)
# together with its matching tokenizer; both are reused by the cells further down.
model_t5_name = "t5-Large"
model_t5 = T5ForConditionalGeneration.from_pretrained(model_t5_name)
tokenizer = T5Tokenizer.from_pretrained(model_t5_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# then move the model's weights to that device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_t5.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [4]:
# Column names of the incident table: the raw report text first, followed by
# one `arguments.*` column per annotated field.
labels=['text', 'arguments.EVENT', 'arguments.ACTIVITY', 'arguments.WHO','arguments.WHERE', 'arguments.WHEN', 'arguments.CAUSE','arguments.EQUIPMENT', 'arguments.INJURY', 'arguments.INJURED','arguments.BODY-PARTS', 'arguments.DEATH', 'arguments.SUBSTANCE']

# Chargement des données

In [5]:
def load_incidents(filename):
    """Load a JSON file of incidents into a flat pandas DataFrame.

    Args:
        filename: Path to a JSON file holding a list of incident records.

    Returns:
        A DataFrame with one row per incident. Nested objects are flattened by
        ``pd.json_normalize`` into dot-separated column names
        (e.g. ``arguments.WHO``).
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding, which would mangle non-ASCII characters on some systems.
    with open(filename, 'r', encoding='utf-8') as fp:
        incident_list = json.load(fp)
    # The file handle is no longer needed once the records are in memory.
    return pd.json_normalize(incident_list)

# Load the development examples, then print the number of incidents and the
# last record as a sample of the available columns.
file_path = 'data/dev_examples.json'
data=load_incidents(file_path)
print("Nombre d'incidents:", len(data))
print("\nExtrait :\n", data.iloc[-1])

Nombre d'incidents: 100

Extrait :
 text                     On September 21  2009  Employee #1  an asphal...
arguments.EVENT                                     [he experienced pain]
arguments.ACTIVITY                           [climbing  into the machine]
arguments.WHO                               [Employee #1, asphalt roller]
arguments.WHERE                        [hospitalized for hernia surgery.]
arguments.WHEN                                       [September 21  2009]
arguments.CAUSE            [pain and a pulling sensation in his  abdomen]
arguments.EQUIPMENT                                                    []
arguments.INJURY        [pain and a pulling sensation in his  abdomen,...
arguments.INJURED                                           [Employee #1]
arguments.BODY-PARTS                                            [abdomen]
arguments.DEATH                                                        []
arguments.SUBSTANCE                                                   NaN
Na

# Fonctions Annexes

In [6]:
def normalize_answer(s):
    """Normalize an answer string before comparison.

    The text is lowercased, ASCII punctuation is dropped, the English articles
    "a", "an" and "the" are removed, and runs of whitespace are collapsed into
    single spaces (leading/trailing whitespace is stripped).
    """
    punctuation = set(string.punctuation)

    lowered = s.lower()
    without_punctuation = ''.join(char for char in lowered if char not in punctuation)
    # Articles are replaced by a space so neighbouring words do not get glued together.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())

In [7]:
def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a single ground-truth answer.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace. When either side has no tokens, the result is 1 if both are
    empty and 0 otherwise. Without any shared token the result is 0.
    """
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    # Degenerate case: at least one side is empty after normalization.
    if not pred_tokens or not truth_tokens:
        return int(truth_tokens == pred_tokens)

    # Multiset intersection counts each shared token as often as it appears on both sides.
    shared = Counter(pred_tokens) & Counter(truth_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)

In [8]:
def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

In [9]:
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score a prediction against several acceptable answers and keep the best score.

    Args:
        metric_fn: Callable ``metric_fn(prediction, ground_truth)`` returning a score.
        prediction: The predicted answer text.
        ground_truths: The acceptable answers. Usually a list of strings, since
            several answers may be valid. ``None``, NaN, ``""`` and an empty
            collection all mean "no answer expected". A bare non-empty string
            is treated as a single acceptable answer.

    Returns:
        The maximum of ``metric_fn`` over all ground truths. When no answer is
        expected, 1 if ``prediction`` is the empty string and 0 otherwise.
    """
    # NaN is the only float that is not equal to itself; it shows up when a
    # field is missing from a record.
    is_nan = isinstance(ground_truths, float) and ground_truths != ground_truths
    if ground_truths is None or is_nan:
        candidates = []
    elif isinstance(ground_truths, str):
        # Iterating a string would compare the prediction to single characters.
        candidates = [ground_truths] if ground_truths else []
    else:
        candidates = list(ground_truths)

    # Without this guard, max() below would raise on an empty sequence.
    if not candidates:
        return 1 if prediction == "" else 0

    return max(metric_fn(prediction, ground_truth) for ground_truth in candidates)

In [10]:
def evaluate_model(model, val_loader, device):
    """Compute the average F1 and exact-match scores of `model` over `val_loader`.

    Args:
        model: A sequence-to-sequence model exposing ``eval()`` and ``generate()``.
        val_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors
            (labels are indexed as ``(batch, sequence)``).
        device: Device the input tensors are moved to before generation.

    Returns:
        A tuple ``(average_f1, average_exact_match)``; ``(0.0, 0.0)`` when the
        loader yields no example.

    Note:
        Decoding relies on the module-level ``tokenizer``.
    """
    model.eval()
    total_f1, total_exact_match, total_count = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            # Decode autoregressively. A single forward pass over an all-padding
            # decoder input seeded with EOS does not produce a real decoding
            # (T5 starts decoding from the pad token, and every step must see
            # the tokens generated before it).
            generated = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=labels.shape[1],  # allow answers as long as the references
            )

            # Label positions may be masked with -100 (the usual loss ignore
            # index), which the tokenizer cannot decode; map them back to padding.
            labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)

            pred_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
            label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

            for pred_text, label_text in zip(pred_texts, label_texts):
                total_f1 += metric_max_over_ground_truths(f1_score, pred_text, [label_text])
                total_exact_match += metric_max_over_ground_truths(exact_match_score, pred_text, [label_text])
                total_count += 1

    # Avoid dividing by zero when the loader is empty.
    if total_count == 0:
        return 0.0, 0.0
    return total_f1 / total_count, total_exact_match / total_count

In [15]:
def make_predictions(model, tokenizer, test_df, questions):
    """Ask every question about every incident report and collect the answers.

    Args:
        model: Sequence-to-sequence model exposing ``eval()``, ``generate()``
            and a ``device`` attribute.
        tokenizer: Tokenizer matching `model` (``encode`` / ``decode``).
        test_df: DataFrame with a ``'text'`` column holding the reports.
        questions: List of question strings.

    Returns:
        A DataFrame with a fresh 0..n-1 index, one row per row of `test_df`
        (in the same order), a ``'text'`` column and one column per question
        containing the decoded answer. Generation uses the model's default
        generation settings.
    """
    model.eval()
    # Send inputs to wherever the model lives rather than relying on a global.
    target_device = model.device

    # Rows are collected positionally so the result stays aligned with
    # `test_df` even when its index is not 0..n-1 (e.g. after a split).
    records = []
    for context in test_df['text'].values:
        record = {'text': context}
        for question in questions:
            input_text = f"question: {question} context: {context}"
            input_ids = tokenizer.encode(input_text, return_tensors="pt").to(target_device)
            outputs = model.generate(input_ids)
            record[question] = tokenizer.decode(outputs[0], skip_special_tokens=True)
        records.append(record)

    # Build the frame once instead of filling it cell by cell.
    return pd.DataFrame(records, columns=['text'] + questions)

# Évaluation

In [11]:
def evaluate_predictions(df_predictions, df_truths, bleurt_checkpoint="chemin_vers_bleurt_checkpoint"):
    """Score predicted answers against the annotated ground truths.

    Args:
        df_predictions: DataFrame of predictions. It must contain a ``'text'``
            column and one column per answer column of `df_truths`, under the
            same names, with rows in the same order as `df_truths`.
        df_truths: DataFrame of ground truths. Every column except ``'text'``,
            ``'f1'`` and ``'ex'`` is treated as an answer column.
        bleurt_checkpoint: Path to the BLEURT checkpoint directory. The default
            is a placeholder and must be replaced by a real path.

    Returns:
        A tuple ``(avg_f1, avg_exact_match, bleu_score, rouge_score, bleurt_score)``:
        mean F1 and exact match over every (row, answer column) pair, the mean
        sentence-level BLEU, the averaged ROUGE scores and the BLEURT scores
        between the ``'text'`` columns of both frames.
    """
    non_answer_columns = ('text', 'f1', 'ex')
    answer_columns = [colname for colname in df_truths.columns if colname not in non_answer_columns]

    f1_l = []
    ex_l = []
    # Walk the rows by position: `.iloc` expects positions, not index labels,
    # so this stays correct for frames whose index is not 0..n-1.
    for row_pos in range(len(df_truths)):
        for colname in answer_columns:
            prediction = df_predictions[colname].iloc[row_pos]
            ground_truths = df_truths[colname].iloc[row_pos]
            f1_l.append(metric_max_over_ground_truths(f1_score, prediction, ground_truths))
            ex_l.append(metric_max_over_ground_truths(exact_match_score, prediction, ground_truths))

    # Average each metric over all (row, column) pairs.
    avg_f1 = mean(f1_l)
    avg_exact_match = mean(ex_l)

    # BLEU, ROUGE and BLEURT are computed between the `text` columns.
    # NOTE(review): make_predictions copies `text` from its input, so these
    # three metrics compare each report with itself when fed its output;
    # confirm whether answer-level scores were intended instead.
    references = df_truths['text'].tolist()
    candidates = df_predictions['text'].tolist()

    # BLEU: sentence_bleu expects token lists (a list of tokenized references
    # and one tokenized hypothesis), so score each pair and average.
    bleu_score = mean(
        sentence_bleu([reference.split()], candidate.split())
        for reference, candidate in zip(references, candidates)
    )

    # ROUGE
    rouge = Rouge()
    rouge_score = rouge.get_scores(candidates, references, avg=True)

    # BLEURT
    bleurt_scorer = score.BleurtScorer(bleurt_checkpoint)
    bleurt_score = bleurt_scorer.score(references=references, candidates=candidates)

    return avg_f1, avg_exact_match, bleu_score, rouge_score, bleurt_score

In [16]:
# One natural-language question per annotated `arguments.*` field; the trailing
# comment on each line names the field that the question targets.
questions = [
    "What was the unexpected event described in the report?", # arguments.EVENT
    "What activity was being performed during the incident?", # arguments.ACTIVITY
    "Who was involved in the incident?", # arguments.WHO
    "Where did the incident occur?", # arguments.WHERE
    "When did the incident take place?", # arguments.WHEN
    "What was the cause of the incident?", # arguments.CAUSE
    "What equipment was involved in the incident?", # arguments.EQUIPMENT
    "What type of injury occurred?", # arguments.INJURY
    "Who was injured in the incident?", # arguments.INJURED
    "Which body parts were affected?", # arguments.BODY-PARTS
    "Was there any fatality reported?", # arguments.DEATH
    "Was there any hazardous substance involved?" # arguments.SUBSTANCE
]

In [17]:
# Run the question-answering model over every incident, then score the answers.
# make_predictions names its answer columns after the questions, whereas
# evaluate_predictions looks predictions up under the ground-truth column names
# (`arguments.*`). `questions` and `labels[1:]` are listed in the same order,
# so the columns are renamed pairwise before scoring.
assert len(questions) == len(labels) - 1, "expected one question per arguments.* label"
df_predictions = make_predictions(model_t5, tokenizer, data, questions).rename(
    columns=dict(zip(questions, labels[1:]))
)
evaluate_predictions(df_predictions, data)

