In [4]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import CamembertTokenizer, CamembertForTokenClassification
from sklearn.metrics import classification_report
from transformers import CamembertTokenizerFast


In [5]:
class FRDataset(Dataset):
    def __init__(self, filename, tokenizer, max_len):
        self.sentences, self.labels = self.read_tsv(filename)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def read_tsv(self, file_path):
        sentences = []
        labels = []
        sentence = []
        label = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if line == '\n':
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                    continue
                splits = line.strip().split('\t')
                sentence.append(splits[1])
                label.append(splits[2])
        return sentences, labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        words = self.sentences[idx]
        labels = self.labels[idx]

        encoding = self.tokenizer(words,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # Ajustement pour aligner les labels avec les tokens
        labels = [-100 if label is None else label for label in labels]
        encoding['labels'] = labels

        return {key: torch.tensor(val) for key, val in encoding.items()}


In [6]:
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")
model = CamembertForTokenClassification.from_pretrained("./my_awesome_wnut_model/checkpoint-11230")

MAX_LEN = 128  # Ajustez selon la longueur de vos phrases


In [7]:
from transformers import DataCollatorForTokenClassification

dev_dataset = FRDataset("./Data/dev_fr.tsv", tokenizer, MAX_LEN)
test_dataset = FRDataset("./Data/test_fr.tsv", tokenizer, MAX_LEN)


data_collator = DataCollatorForTokenClassification(tokenizer)

# Utilisation dans DataLoader
dev_dataloader = DataLoader(dev_dataset, batch_size=32, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=data_collator)


In [8]:
def evaluate(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            outputs = model(**{k: v.to(model.device) for k, v in batch.items() if k != 'labels'})
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            true_labels.extend(batch['labels'].numpy())

    print(classification_report(true_labels, predictions))


In [10]:
'''print("Évaluation sur les données de développement")
evaluate(model, dev_dataloader)

print("Évaluation sur les données de test")
evaluate(model, test_dataloader)'''


Évaluation sur les données de développement


ValueError: too many dimensions 'str'