<a href="https://colab.research.google.com/github/OdysseusPolymetis/ia_et_shs/blob/main/5_postagging_with_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install conllu datasets transformers

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from datasets import Dataset
from transformers import AdamW
from sklearn.metrics import accuracy_score

In [None]:
# Hugging Face checkpoint identifier shared by the tokenizer and the model.
model_name = "bowphs/GreTa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder_model = AutoModel.from_pretrained(model_name)

# Print the loaded model for inspection.
print(encoder_model)

In [None]:
# Download the UD v2.15 release archives (treebanks, documentation, tools) from LINDAT.
# NOTE(review): the trailing {a,b,c} group presumably relies on shell brace expansion to
# produce one URL per archive; --remote-name-all saves each file under its remote name.
!curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5787{/ud-treebanks-v2.15.tgz,/ud-documentation-v2.15.tgz,/ud-tools-v2.15.tgz}

In [None]:
# Unpack the treebanks archive into the current directory (-v lists every extracted file).
!tar -xvzf ud-treebanks-v2.15.tgz

In [None]:
from conllu import parse

def load_conllu(filename):
    """
    Load a CoNLL-U file and extract the word forms with their UPOS tags.

    Only syntactic words are kept. Multiword-token ranges (ids such as ``1-2``)
    and empty nodes (ids such as ``1.1``) are skipped: they carry no UPOS tag and
    would otherwise duplicate surface material or introduce a missing label.
    Sentences that end up empty, or that contain a word without a UPOS tag, are
    dropped so that every returned token has a usable label.

    Args:
        filename: Path of the ``.conllu`` file, read as UTF-8.

    Returns:
        A pair ``(sentences, taggings)`` of parallel lists:
        - sentences: list of lists of word forms
        - taggings: list of lists of UPOS tags, aligned with ``sentences``
    """
    with open(filename, "r", encoding="utf-8") as fp:
        data = parse(fp.read())

    sentences = []
    taggings = []

    for sentence in data:
        # conllu yields an int id for ordinary words; ranges and empty nodes
        # come back as tuples, which is what this filter excludes.
        words = [token for token in sentence if isinstance(token['id'], int)]
        tokens = [token['form'] for token in words]
        tags = [token['upostag'] for token in words]

        # Keep only sentences where every word carries a tag.
        if tokens and all(tag is not None for tag in tags):
            sentences.append(tokens)
            taggings.append(tags)

    return sentences, taggings

In [None]:
# Treebank directory, relative to the working directory in which the archive was
# extracted (an absolute /content/... path would only resolve on Colab).
base_path = "ud-treebanks-v2.15/UD_Ancient_Greek-PROIEL/"
train_sentences, train_tags = load_conllu(base_path + "grc_proiel-ud-train.conllu")
dev_sentences, dev_tags = load_conllu(base_path + "grc_proiel-ud-dev.conllu")
test_sentences, test_tags = load_conllu(base_path + "grc_proiel-ud-test.conllu")

# Show one example sentence with its tags.
print("Exemple de phrase :", train_sentences[0])
print("Tags correspondants :", train_tags[0])

Exemple de phrase : ['Ἡροδότου', 'Ἁλικαρνησσέος', 'ἱστορίης', 'ἀπόδεξις', 'ἥδε', 'ὡς', 'μήτε', 'τὰ', 'γενόμενα', 'ἐξ', 'ἀνθρώπων', 'τῷ', 'χρόνῳ', 'ἐξίτηλα', 'γένηται', 'μήτε', 'ἔργα', 'μεγάλα', 'τε', 'καὶ', 'θωμαστά', 'τὰ', 'μὲν', 'Ἕλλησι', 'τὰ', 'δὲ', 'βαρβάροισι', 'ἀποδεχθέντα', 'ἀκλεᾶ', 'γένηται', 'τά', 'τε', 'ἄλλα', 'καὶ', 'δι’', 'ἣν', 'αἰτίην', 'ἐπολέμησαν', 'ἀλλήλοισι']
Tags correspondants : ['PROPN', 'NOUN', 'NOUN', 'NOUN', 'PRON', 'SCONJ', 'CCONJ', 'DET', 'VERB', 'ADP', 'NOUN', 'DET', 'NOUN', 'ADJ', 'VERB', 'CCONJ', 'NOUN', 'ADJ', 'CCONJ', 'CCONJ', 'ADJ', 'PRON', 'ADV', 'NOUN', 'PRON', 'ADV', 'NOUN', 'VERB', 'ADJ', 'VERB', 'DET', 'CCONJ', 'PRON', 'CCONJ', 'ADP', 'PRON', 'NOUN', 'VERB', 'PRON']


In [None]:
def prepare_data(sentences, tags, tokenizer, tag2id=None, max_length=128):
    """
    Tokenize pre-split sentences and align one label id with every sub-token.

    Args:
        sentences: List of sentences, each a list of word strings.
        tags: List of tag sequences, parallel to ``sentences``.
        tokenizer: Callable tokenizer whose result exposes ``word_ids()``
            (a Hugging Face "fast" tokenizer).
        tag2id: Optional existing tag -> id mapping. Pass the mapping obtained
            from the training split when encoding other splits so that all
            splits share the same label ids. When omitted, a mapping is built
            from ``tags`` in sorted order, which makes the ids reproducible.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        ``(inputs, outputs, tag2id, id2tag)`` where ``inputs`` is the list of
        tokenizer encodings, ``outputs`` the list of aligned label-id lists
        (``-100`` marks positions to ignore), and the two dicts map tags to ids
        and back.
    """
    if tag2id is None:
        unique_tags = {tag for tag_seq in tags for tag in tag_seq}
        # Sorting fixes the id assignment (set iteration order is arbitrary);
        # key=str keeps the sort well-defined even if a tag is not a string.
        tag2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags, key=str))}
    id2tag = {idx: tag for tag, idx in tag2id.items()}

    inputs = []
    outputs = []

    for tokens, tag_seq in zip(sentences, tags):
        tokenized = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

        # word_ids() gives, per sub-token, the index of its source word
        # (None for special and padding tokens).
        word_ids = tokenized.word_ids()
        aligned_tags = [
            # Positions without a word, and tags absent from a supplied
            # mapping, get -100 so they can be ignored downstream.
            -100 if word_id is None else tag2id.get(tag_seq[word_id], -100)
            for word_id in word_ids
        ]

        inputs.append(tokenized)
        outputs.append(aligned_tags)

    return inputs, outputs, tag2id, id2tag

In [None]:
# Make sure the tokenizer has a padding token; if one has to be added, the
# vocabulary grows and the model's embedding matrix must be resized to match.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    encoder_model.resize_token_embeddings(len(tokenizer))  # Resize the model embeddings to the new vocabulary size

In [None]:
train_inputs, train_outputs, tag2id, id2tag = prepare_data(train_sentences, train_tags, tokenizer)
dev_inputs, dev_outputs, _, dev_id2tag = prepare_data(dev_sentences, dev_tags, tokenizer)

# prepare_data derives a tag vocabulary from the split it is given, so the dev
# label ids are not guaranteed to match the training ids. Re-express every dev
# label through the training mapping; tags unseen in training become -100.
dev_outputs = [
    [
        label if label == -100 else tag2id.get(dev_id2tag[label], -100)
        for label in label_seq
    ]
    for label_seq in dev_outputs
]

In [None]:
from torch.utils.data import DataLoader, Dataset
import torch

class PosDataset(Dataset):
    """Dataset pairing tokenizer encodings with their aligned label ids."""

    def __init__(self, inputs, outputs):
        # inputs: per-sentence encodings; outputs: per-sentence label-id lists
        self.inputs, self.outputs = inputs, outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        # Tensors are built on access from the stored encoding and labels.
        encoding = self.inputs[idx]
        return {
            "input_ids": torch.tensor(encoding["input_ids"]),
            "attention_mask": torch.tensor(encoding["attention_mask"]),
            "labels": torch.tensor(self.outputs[idx]),
        }

In [None]:
# Wrap the encoded splits so that a DataLoader can index them.
train_dataset = PosDataset(train_inputs, train_outputs)
dev_dataset = PosDataset(dev_inputs, dev_outputs)

# Batches of 64 examples; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=64)

In [None]:
# Walk the dev loader once, recording the number of examples in each batch.
batch_sizes = [dev_batch["input_ids"].size(0) for dev_batch in dev_dataloader]
total_batches = len(batch_sizes)
total_examples = sum(batch_sizes)

print(f"Nombre de batchs : {total_batches}")
print(f"Nombre total d'exemples dans dev_dataloader : {total_examples}")

Nombre de batchs : 32
Nombre total d'exemples dans dev_dataloader : 1019


In [None]:
class PosTagger(nn.Module):
    """Token classifier: an encoder followed by a single linear layer."""

    def __init__(self, encoder, hidden_dim, num_labels):
        super().__init__()
        # Keep only the `.encoder` sub-module of the wrapped model.
        self.encoder = encoder.encoder
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-token logits of shape [batch_size, seq_len, num_labels]."""
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state  # [batch_size, seq_len, hidden_dim]
        return self.classifier(hidden_states)

In [None]:
# Hidden-state width, read from the loaded model's config.
hidden_dim = encoder_model.config.hidden_size
# One output class per entry of the tag vocabulary.
num_labels = len(tag2id)
pos_tagger = PosTagger(encoder_model, hidden_dim, num_labels)

In [None]:
from transformers import AdamW
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pos_tagger.to(device)

# Optimizer and loss function.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the defaults of the transformers class so the
# optimisation settings stay the same.
optimizer = torch.optim.AdamW(pos_tagger.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
# Positions labelled -100 (special tokens, padding) do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

num_epochs=3
from transformers import get_scheduler

# Linear decay of the learning rate over all optimisation steps, without warm-up.
num_training_steps = len(train_dataloader) * num_epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
for epoch in range(num_epochs):
    pos_tagger.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = pos_tagger(input_ids=input_ids, attention_mask=attention_mask)
        # Flatten to [batch*seq_len, num_labels] vs [batch*seq_len] for the loss.
        loss = loss_fn(logits.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update the learning rate

        total_loss += loss.item()
    print(f"Époque {epoch + 1}/{num_epochs}, Perte moyenne :", total_loss / len(train_dataloader))



Époque 1/3, Perte moyenne : 1.4974161515844628


In [None]:
# Evaluate on the dev set: accuracy over every position whose label is not -100.
pos_tagger.eval()
total_accuracy = 0  # running count of correctly predicted positions
total_examples = 0  # running count of labelled positions

with torch.no_grad():
    for batch in dev_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = pos_tagger(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(logits, dim=-1)

        # Compare predictions and labels on the whole batch at once,
        # ignoring positions marked -100.
        mask = labels != -100
        total_accuracy += (predictions[mask] == labels[mask]).sum().item()
        total_examples += mask.sum().item()

# Guard against an empty loader or a split with no labelled position.
accuracy = total_accuracy / total_examples if total_examples else 0.0
print(f"Accuracy sur dev_data : {accuracy:.4f}")


In [None]:
def tag_sentence(sentence, model, tokenizer, id2tag):
    """
    Predict one PoS tag per word of a sentence.

    Args:
    - sentence (str | list[str]): The sentence to tag. A list is treated as
      already split into words (tokenized with ``is_split_into_words=True``);
      a string is split by the tokenizer itself.
    - model (nn.Module): The PoS tagger; called with ``input_ids`` and
      ``attention_mask`` and expected to return per-token logits.
    - tokenizer: The associated fast tokenizer (``word_ids()`` is required).
    - id2tag (dict): Mapping from label ids to tag strings.

    Returns:
    - tokens (list): One string per word, rebuilt from all of the word's
      sub-tokens (in the tokenizer's normalised form).
    - predicted_tags (list): The tag predicted on the first sub-token of each word.

    Raises:
    - KeyError: If the model predicts an id that is missing from ``id2tag``.
    """
    pre_split = not isinstance(sentence, str)
    inputs = tokenizer(
        list(sentence) if pre_split else sentence,
        is_split_into_words=pre_split,
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )

    # Run on whatever device the model lives on rather than on a global variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = inputs["input_ids"].to(target_device)
    attention_mask = inputs["attention_mask"].to(target_device)

    # Predictions
    model.eval()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_ids = torch.argmax(logits, dim=-1)[0].tolist()  # [seq_len]

    # Keep special tokens here so that positions stay aligned with word_ids().
    pieces = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    word_ids = inputs.word_ids()

    word_pieces = []    # sub-tokens grouped per word
    tags_filtered = []  # tag of the first sub-token of each word
    previous_word_id = None
    for position, word_id in enumerate(word_ids):
        if word_id is None:
            # Special token: belongs to no word and ends the current one.
            previous_word_id = None
            continue
        if word_id != previous_word_id:
            word_pieces.append([])
            tags_filtered.append(id2tag[predicted_ids[position]])
        word_pieces[-1].append(pieces[position])
        previous_word_id = word_id

    # Merge each word's sub-tokens back into a single readable string.
    tokens_filtered = [
        tokenizer.convert_tokens_to_string(group).strip() for group in word_pieces
    ]

    return tokens_filtered, tags_filtered


# Tag a sample sentence with the PoS tagger.
sentence = "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ πλάγχθη."
tokens, tags = tag_sentence(sentence, pos_tagger, tokenizer, id2tag)

# Display the input sentence, the returned tokens and the predicted tags.
print("Phrase :", sentence)
print("Tokens :", tokens)
print("Tags prédits :", tags)

Phrase : Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ πλάγχθη.
Tokens : ['▁ἄνδρα', '▁μοι', '▁ἔννεπ', ',', '▁μοῦσα', ',', '▁πολύτροπο', ',', '▁ὃ', '▁μάλα', '▁πολλὰ', '▁πλά', '.']
Tags prédits : ['NOUN', 'PRON', 'VERB', 'ADP', 'NOUN', 'ADP', 'ADJ', 'ADP', 'PRON', 'ADV', 'ADJ', 'VERB', 'VERB']
