In [31]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import CamembertTokenizer, CamembertForTokenClassification
from sklearn.metrics import classification_report
from transformers import CamembertTokenizerFast


In [32]:
class FRDataset(Dataset):
    def __init__(self, filename, tokenizer, max_len, percent=100):
        self.sentences, self.labels = self.read_tsv(filename, percent)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def read_tsv(self, file_path, percent):
        sentences = []
        labels = []
        sentence = []
        label = []
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            lines = lines[:int(len(lines) * (percent / 100))]

            for line in lines:
                if line == '\n':
                    if sentence and label:  
                        sentences.append(sentence)
                        labels.append(label)
                    sentence = []
                    label = []
                    continue
                splits = line.strip().split('\t')
                if len(splits) < 3:  
                    continue
                sentence.append(splits[1])
                label.append(splits[2])
            if sentence and label:  
                sentences.append(sentence)
                labels.append(label)
        return sentences, labels
    
    # def __init__(self, filename, tokenizer, max_len):
    #     self.sentences, self.labels = self.read_tsv(filename)
    #     self.tokenizer = tokenizer
    #     self.max_len = max_len
    # 
    # def read_tsv(self, file_path):
    #     sentences = []
    #     labels = []
    #     sentence = []
    #     label = []
    #     with open(file_path, 'r', encoding='utf-8') as file:
    #         for line in file:
    #             if line == '\n':
    #                 sentences.append(sentence)
    #                 labels.append(label)
    #                 sentence = []
    #                 label = []
    #                 continue
    #             splits = line.strip().split('\t')
    #             sentence.append(splits[1])
    #             label.append(splits[2])
    #     return sentences, labels
    
    def __len__(self):
        return len(self.sentences)
    
    
    def __getitem__(self, idx):
        label_map = {'B-PER': 1, 'I-PER': 1}
    
        words = self.sentences[idx]
        labels = self.labels[idx]
    
        encoding = self.tokenizer(words,
                                is_split_into_words=True,
                                return_offsets_mapping=True,
                                padding='max_length',
                                truncation=True,
                                max_length=self.max_len)
    
        # Convertir les labels textuels en valeurs numériques
        numeric_labels = [label_map.get(label, 0) for label in labels]
    
        
       
        while len(numeric_labels) < len(encoding['input_ids']):
            numeric_labels.append(-100)  # -100 est utilisé pour ignorer les tokens lors du calcul de la perte
        
        encoding['labels'] = numeric_labels
    
        # Créer un tenseur pour chaque valeur dans encoding, sauf pour 'offset_mapping'
        return {key: torch.tensor(val) for key, val in encoding.items() if key != 'offset_mapping'}


In [33]:
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")
# model = CamembertForTokenClassification.from_pretrained("./my_awesome_wnut_model/checkpoint-11230") 
model = CamembertForTokenClassification.from_pretrained("./MonModels/checkpoint-67770")

MAX_LEN = 196  

# Afficher les labels
print(model.config.id2label)


{0: 'LABEL_0', 1: 'LABEL_1'}


In [34]:
from transformers import DataCollatorForTokenClassification
 
test_dataset = FRDataset("./data/test_fr.tsv", tokenizer, MAX_LEN, percent=15)
dev_dataset = FRDataset("./data/dev_fr.tsv", tokenizer, MAX_LEN, percent=15)

data_collator = DataCollatorForTokenClassification(tokenizer)

# Utilisation dans DataLoader
dev_dataloader = DataLoader(dev_dataset, batch_size=32, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=data_collator)


In [35]:
from tqdm import tqdm

def evaluate(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Évaluation"):
            outputs = model(**{k: v.to(model.device) for k, v in batch.items() if k != 'labels'})
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            true_labels.extend(batch['labels'].numpy())
            
    return true_labels, predictions



In [36]:
print("Évaluation sur les données de test")
true_labels, predictions = evaluate(model, test_dataloader)

Évaluation sur les données de test


Évaluation:   0%|          | 0/85 [00:00<?, ?it/s]You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Évaluation: 100%|██████████| 85/85 [06:32<00:00,  4.62s/it]


In [37]:
from sklearn.metrics import classification_report
import numpy as np

# Aplatir les listes de predictions et true_labels
flat_predictions = [pred for sublist in predictions for pred in sublist]
flat_true_labels = [true for sublist in true_labels for true in sublist]

# Filtrer les tokens marqués avec -100
filtered_predictions = [pred for pred, true in zip(flat_predictions, flat_true_labels) if true != -100]
filtered_true_labels = [true for true in flat_true_labels if true != -100]

# Calculer et afficher le rapport de classification
print(classification_report(filtered_true_labels, filtered_predictions))


              precision    recall  f1-score   support

           0       0.97      0.95      0.96     65562
           1       0.16      0.22      0.18      2681

    accuracy                           0.92     68243
   macro avg       0.56      0.59      0.57     68243
weighted avg       0.94      0.92      0.93     68243



In [38]:

'''
         precision    recall  f1-score   support

           0       0.97      0.95      0.96     65562
           1       0.16      0.22      0.18      2681

    accuracy                           0.92     68243
   macro avg       0.56      0.59      0.57     68243
weighted avg       0.94      0.92      0.93     68243
'''

'\n         precision    recall  f1-score   support\n\n           0       0.97      0.95      0.96     65562\n           1       0.16      0.22      0.18      2681\n\n    accuracy                           0.92     68243\n   macro avg       0.56      0.59      0.57     68243\nweighted avg       0.94      0.92      0.93     68243\n'

In [39]:
print("Évaluation sur les données de développement")
evaluate(model, dev_dataloader)

Évaluation sur les données de développement


Évaluation: 100%|██████████| 82/82 [05:42<00:00,  4.18s/it]


([array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100

In [40]:
from sklearn.metrics import classification_report
import numpy as np

# Aplatir les listes de predictions et true_labels
flat_predictions = [pred for sublist in predictions for pred in sublist]
flat_true_labels = [true for sublist in true_labels for true in sublist]

# Filtrer les tokens marqués avec -100
filtered_predictions = [pred for pred, true in zip(flat_predictions, flat_true_labels) if true != -100]
filtered_true_labels = [true for true in flat_true_labels if true != -100]

# Calculer et afficher le rapport de classification
print(classification_report(filtered_true_labels, filtered_predictions))


              precision    recall  f1-score   support

           0       0.97      0.95      0.96     65562
           1       0.16      0.22      0.18      2681

    accuracy                           0.92     68243
   macro avg       0.56      0.59      0.57     68243
weighted avg       0.94      0.92      0.93     68243

