In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np

In [3]:
# Fichiers pour l'entraînement (choisissez un fichier parmi ceux de training_datasets_en)
TRAIN_DATA_PATH = "/kaggle/input/data-set/training_datasets_en/semcor_en/semcor_en.data.xml"  # ou un autre dataset
TRAIN_LABELS_PATH = "/kaggle/input/data-set/training_datasets_en/semcor_en/semcor_en.gold.key.txt"

# Fichiers pour la validation
DEV_DATA_PATH = "/kaggle/input/data-set/evaluation_datasets_en/dev-en/dev-en.data.xml"
DEV_LABELS_PATH = "/kaggle/input/data-set/evaluation_datasets_en/dev-en/dev-en.gold.key.txt"

# Fichiers pour les tests
TEST_DATA_PATH = "/kaggle/input/data-set/evaluation_datasets_en/test-en/test-en.data.xml"
TEST_LABELS_PATH = "/kaggle/input/data-set/evaluation_datasets_en/test-en/test-en.gold.key.txt"

# Fichiers exemaple et glosses
EXAMPLE_DATA_PATH = "/kaggle/input/data-set/training_datasets_en/wngt_examples_en/wngt_examples_en.data.xml"
EXAMPLE_LABELS_PATH = "/kaggle/input/data-set/training_datasets_en/wngt_examples_en/wngt_examples_en.gold.key.txt"
GLOSSES_DATA_PATH = "/kaggle/input/data-set/training_datasets_en/wngt_glosses_en/wngt_glosses_en.data.xml"
GLOSSES_LABELS_PATH = "/kaggle/input/data-set/training_datasets_en/wngt_glosses_en/wngt_glosses_en.gold.key.txt"

In [4]:
def xml_to_dataframe(data_path, labels_path):
    # Charger le fichier XML
    tree = ET.parse(data_path)
    root = tree.getroot()

    sentences = []
    labels = {}

    # Charger les labels depuis le fichier .gold.key.txt
    with open(labels_path, 'r') as f:
        for line in f:
            instance_id, *label = line.strip().split()
            labels[instance_id] = label  # Chaque instance_id peut avoir plusieurs labels

    # Parcourir les phrases et extraire les tokens à désambiguïser
    for text in root.findall(".//text"):
        for sentence in text.findall(".//sentence"):
            sentence_text = ""
            for element in sentence:
                if element.tag == "wf":
                    # Mots normaux
                    sentence_text += element.text + " "
                elif element.tag == "instance":
                    # Mots à désambiguïser
                    word = element.text
                    instance_id = element.attrib['id']
                    lemma = element.attrib.get('lemma', "")
                    pos = element.attrib.get('pos', "")
                    sentence_text += word + " "

                    # Ajouter au DataFrame
                    sentences.append({
                        "sentence": sentence_text.strip(),
                        "word": word,
                        "lemma": lemma,
                        "pos": pos,
                        "instance_id": instance_id,
                        "labels": labels.get(instance_id, [])
                    })
    
    # Convertir en DataFrame
    return pd.DataFrame(sentences)

In [5]:
train_df = xml_to_dataframe(TRAIN_DATA_PATH, TRAIN_LABELS_PATH)
train_df.head(30)

Unnamed: 0,sentence,word,lemma,pos,instance_id,labels
0,How long,long,long,ADJ,d000.s000.t000,[bn:00106124a]
1,How long has it been,been,be,VERB,d000.s000.t001,[bn:00083181v]
2,How long has it been since you reviewed,reviewed,review,VERB,d000.s000.t002,[bn:00092618v]
3,How long has it been since you reviewed the ob...,objectives,objective,NOUN,d000.s000.t003,[bn:00002179n]
4,How long has it been since you reviewed the ob...,benefit,benefit,NOUN,d000.s000.t004,[bn:00009904n]
5,How long has it been since you reviewed the ob...,service,service,NOUN,d000.s000.t005,[bn:00070654n]
6,How long has it been since you reviewed the ob...,program,program,NOUN,d000.s000.t006,[bn:00064646n]
7,Have you permitted,permitted,permit,VERB,d000.s001.t000,[bn:00082536v]
8,Have you permitted it to become,become,become,VERB,d000.s001.t001,[bn:00083294v]
9,Have you permitted it to become a giveaway,giveaway,giveaway,NOUN,d000.s001.t002,[bn:00040564n]


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226036 entries, 0 to 226035
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   sentence     226036 non-null  object
 1   word         226036 non-null  object
 2   lemma        226036 non-null  object
 3   pos          226036 non-null  object
 4   instance_id  226036 non-null  object
 5   labels       226036 non-null  object
dtypes: object(6)
memory usage: 10.3+ MB


In [7]:
test_df = xml_to_dataframe(TEST_DATA_PATH, TEST_LABELS_PATH)
test_df.head() 

Unnamed: 0,sentence,word,lemma,pos,instance_id,labels
0,The art,art,art,NOUN,senseval2.d000.s000.t000,[bn:00005928n]
1,The art of change-ringing,change-ringing,change_ringing,NOUN,senseval2.d000.s000.t001,[bn:00017671n]
2,The art of change-ringing is peculiar,peculiar,peculiar,ADJ,senseval2.d000.s000.t002,"[bn:00108295a, bn:00108382a]"
3,The art of change-ringing is peculiar to the E...,English,english,NOUN,senseval2.d000.s000.t003,[bn:00030863n]
4,The art of change-ringing is peculiar to the E...,most,most,ADJ,senseval2.d000.s000.t004,[bn:00106953a]


In [8]:
dev_df = xml_to_dataframe(DEV_DATA_PATH, DEV_LABELS_PATH)
dev_df.head()

Unnamed: 0,sentence,word,lemma,pos,instance_id,labels
0,Your Oct. 6 editorial `` The Ill Homeless `` r...,referred,refer,VERB,d000.s000.t000,[bn:00082412v]
1,Your Oct. 6 editorial `` The Ill Homeless `` r...,research,research,NOUN,d000.s000.t001,[bn:00067280n]
2,Your Oct. 6 editorial `` The Ill Homeless `` r...,reported,report,VERB,d000.s000.t002,[bn:00092823v]
3,Your comments,comments,comment,NOUN,d000.s001.t000,[bn:00020977n]
4,Your comments implied,implied,imply,VERB,d000.s001.t001,[bn:00085636v]


In [9]:
example_df = xml_to_dataframe(EXAMPLE_DATA_PATH, EXAMPLE_LABELS_PATH)
example_df.head()

Unnamed: 0,sentence,word,lemma,pos,instance_id,labels
0,"it was full of rackets , balls and other objects",objects,object,NOUN,d0000001.s001.h001,[bn:00058442n]
1,how big is that part compared to the whole,whole,whole,NOUN,d0000002.s001.h001,[bn:00079109n]
2,the team is a unit,unit,unit,NOUN,d0000003.s001.h001,[bn:00079109n]
3,"lard was also used , though its congener",congener,congener,NOUN,d0000004.s001.h001,[bn:00021806n]
4,the American shopkeeper differs from his Europ...,congener,congener,NOUN,d0000005.s001.h001,[bn:00021806n]


In [10]:
glosses_df = xml_to_dataframe(GLOSSES_DATA_PATH, GLOSSES_LABELS_PATH)
glosses_df.head()

Unnamed: 0,sentence,word,lemma,pos,instance_id,labels
0,entity,entity,entity,NOUN,d0000001.s001.h001,[bn:00031027n]
1,entity : that which is perceived,perceived,perceive,VERB,d0000001.s001.t002,[bn:00091540v]
2,entity : that which is perceived or known,known,known,ADJ,d0000001.s001.t003,[bn:00105645a]
3,entity : that which is perceived or known or i...,inferred,infer,VERB,d0000001.s001.t004,[bn:00086431v]
4,entity : that which is perceived or known or i...,distinct,distinct,ADJ,d0000001.s001.t005,[bn:00099815a]


In [11]:
print(f"Nombre d'exemples dans l'entraînement : {len(train_df)}")
print(f"Nombre d'exemples dans le test : {len(test_df)}")
print(f"Nombre d'exemples dans le dev : {len(dev_df)}")
print(f"Nombre d'exemples dans le example : {len(example_df)}")
print(f"Nombre d'exemples dans le glosses : {len(glosses_df)}")

Nombre d'exemples dans l'entraînement : 226036
Nombre d'exemples dans le test : 8062
Nombre d'exemples dans le dev : 455
Nombre d'exemples dans le example : 47825
Nombre d'exemples dans le glosses : 566610


In [12]:
train_df.sample(10)

Unnamed: 0,sentence,word,lemma,pos,instance_id,labels
42174,Other classes are included only by myself ( in...,only,only,ADV,d042.s011.t006,[bn:00114234r]
106459,We first define a function b ( t ) as follows,as follows,as_follows,ADV,d106.s088.t003,[bn:00114359r]
172118,The specific staining by both direct and indir...,pseudophloem,pseudophloem,NOUN,d172.s074.t008,[bn:00064987n]
91565,SBA works closely with the principal property ...,property,property,NOUN,d092.s025.t011,[bn:00009815n]
50748,What one actually,actually,actually,ADV,d051.s008.t000,[bn:00114117r]
104643,"They indicated that a 4 - day retention , aera...",lagoon,lagoon,NOUN,d105.s007.t005,[bn:00049696n]
99654,Remember that in seeking the modern in Utopia ...,not,not,ADV,d100.s020.t004,[bn:00116360r]
211738,Somehow our contemporary Moloch must be induce...,see,see,VERB,d295.s023.t001,[bn:00092443v]
132691,Since * * f and p divides,divides,divide,VERB,d133.s030.t000,[bn:00087098v]
65199,"Even so , he generally listened and was usuall...",reasonable,reasonable,ADJ,d065.s084.t005,[bn:00109507a]


In [13]:
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226036 entries, 0 to 226035
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   sentence     226036 non-null  object
 1   word         226036 non-null  object
 2   lemma        226036 non-null  object
 3   pos          226036 non-null  object
 4   instance_id  226036 non-null  object
 5   labels       226036 non-null  object
dtypes: object(6)
memory usage: 10.3+ MB


Unnamed: 0,sentence,word,lemma,pos,instance_id,labels
0,How long,long,long,ADJ,d000.s000.t000,[bn:00106124a]
1,How long has it been,been,be,VERB,d000.s000.t001,[bn:00083181v]
2,How long has it been since you reviewed,reviewed,review,VERB,d000.s000.t002,[bn:00092618v]
3,How long has it been since you reviewed the ob...,objectives,objective,NOUN,d000.s000.t003,[bn:00002179n]
4,How long has it been since you reviewed the ob...,benefit,benefit,NOUN,d000.s000.t004,[bn:00009904n]


# 1) Préparation des données

In [23]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import shutil

In [15]:
class WSDDataset(Dataset):
    def __init__(self, sentences, words, labels, tokenizer, max_length):
        self.sentences = sentences
        self.words = words
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        word = self.words[idx]
        
        # Mise en évidence du mot à désambiguïser
        highlighted_sentence = sentence.replace(word, f"[{word}]")
        
        # Encodage avec RoBERTa
        encoding = self.tokenizer(
            highlighted_sentence, 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_length, 
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx])
        }

In [None]:
def prepare_wsd_model(num_labels):
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    return model, tokenizer

In [17]:
def delete_previous_checkpoints(checkpoint_dir):
    """Supprime les anciens fichiers de checkpoint."""
    if os.path.exists(checkpoint_dir):
        for file in os.listdir(checkpoint_dir):
            file_path = os.path.join(checkpoint_dir, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

In [49]:
! rm -r /kaggle/working/checkpoints
! rm -r /kaggle/working/download_dir

  pid, fd = os.forkpty()


In [18]:
def train_wsd_model(
    model, train_dataloader, val_dataloader, device, 
    tokenizer, epochs=10, checkpoint_dir='/kaggle/working/checkpoints', download_dir='/kaggle/working/download_dir', MAX_CHECKPOINTS=3
):
    import shutil  # Pour la suppression des anciens checkpoints
    from collections import deque  # Pour gérer un historique des checkpoints
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    model.to(device)
    writer = SummaryWriter()

    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(download_dir, exist_ok=True)

    global_step = 0  # Nombre de pas
    saved_checkpoints = deque()  # Historique des checkpoints

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0

        # Utilisation de tqdm pour la barre de progression
        train_loop = tqdm(train_dataloader, desc=f"Époque {epoch+1}/{epochs} - Entraînement")
        for batch in train_loop:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids, 
                attention_mask=attention_mask, 
                labels=labels
            )
            
            loss = outputs.loss
            total_train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            
            # Mise à jour de la barre de progression
            train_loop.set_postfix(Loss=loss.item())

            # Sauvegarde périodique
            global_step += 1
            if global_step % 3000 == 0:
                checkpoint_name = f"checkpoint_epoch_{epoch}_step_{global_step}"
                checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
                model.save_pretrained(checkpoint_path)
                tokenizer.save_pretrained(checkpoint_path)
                print(f"Checkpoint sauvegardé : {checkpoint_name}")
                
                # Copier dans le répertoire de téléchargement
                download_checkpoint_path = os.path.join(download_dir, checkpoint_name)
                shutil.copytree(checkpoint_path, download_checkpoint_path)
                print(f"Checkpoint copié dans : {download_checkpoint_path}")

                # Ajouter le checkpoint au suivi
                saved_checkpoints.append(download_checkpoint_path)
                
                # Supprimer les anciens checkpoints si MAX_CHECKPOINTS est dépassé
                if len(saved_checkpoints) > MAX_CHECKPOINTS:
                    oldest_checkpoint = saved_checkpoints.popleft()
                    shutil.rmtree(oldest_checkpoint, ignore_errors=True)
                    print(f"Ancien checkpoint supprimé : {oldest_checkpoint}")

        # Validation
        model.eval()
        total_val_loss = 0
        
        val_loop = tqdm(val_dataloader, desc=f"Époque {epoch+1}/{epochs} - Validation")
        with torch.no_grad():
            for batch in val_loop:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(
                    input_ids, 
                    attention_mask=attention_mask, 
                    labels=labels
                )
                
                total_val_loss += outputs.loss.item()
                val_loop.set_postfix(Val_Loss=outputs.loss.item())
        
        avg_train_loss = total_train_loss / len(train_dataloader)
        avg_val_loss = total_val_loss / len(val_dataloader)

        writer.add_scalar('Train/Loss', avg_train_loss, epoch)
        writer.add_scalar('Validation/Loss', avg_val_loss, epoch)

        print(f"Époque {epoch+1}/{epochs}")
        print(f"Perte d'entraînement : {avg_train_loss:.4f}")
        print(f"Perte de validation : {avg_val_loss:.4f}")

    writer.close()
    return model

In [24]:
def main(train_df):
    # Convertir les labels de liste à chaîne de caractères
    def clean_label(x):
        if isinstance(x, list):
            return x[0].strip('[]')
        return x.strip('[]')
    
    train_df['cleaned_labels'] = train_df['labels'].apply(clean_label)
    
    # Encoder les labels
    label_encoder = LabelEncoder()
    train_df['encoded_labels'] = label_encoder.fit_transform(train_df['cleaned_labels'])
    
    # Préparation des données
    X = train_df[['sentence', 'word']]
    y = train_df['encoded_labels']
    
    # Division des données
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42
    )
    
    # Configuration du dispositif
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Préparation du modèle
    num_labels = len(label_encoder.classes_)
    model, tokenizer = prepare_wsd_model(num_labels)
    
    # Création des datasets
    max_length = 128
    train_dataset = WSDDataset(
        X_train['sentence'].values, 
        X_train['word'].values, 
        y_train.values, 
        tokenizer, 
        max_length
    )
    val_dataset = WSDDataset(
        X_val['sentence'].values, 
        X_val['word'].values, 
        y_val.values, 
        tokenizer, 
        max_length
    )
    
    # Création des dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16)
    
    # Entraînement du modèle
    trained_model = train_wsd_model(model, train_dataloader, val_dataloader, device, tokenizer)
    
    # Sauvegarde finale
    trained_model.save_pretrained('/kaggle/working/wsd_roberta_model')
    tokenizer.save_pretrained('/kaggle/working/wsd_roberta_tokenizer')
    
    # Mapping des labels originaux
    label_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
    
    # Répertoire où sauvegarder le fichier
    output_dir = "/kaggle/working"
    os.makedirs(output_dir, exist_ok=True)  # Crée le répertoire s'il n'existe pas
    
    # Chemin complet pour le fichier
    output_file = os.path.join(output_dir, "label_mapping.json")
    
    # Sauvegarde du mapping
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)
    
    print(f"\nMapping des labels sauvegardé dans '{output_file}'.")
    
    # Affichage limité des labels (facultatif)
    print("\nVérification du mapping des labels (affichage des 10 premiers) :")
    for encoded, original in list(label_mapping.items())[:10]:
        print(f"{encoded}: {original}")

    
    return label_encoder, trained_model, tokenizer

In [25]:
# Utilisation
label_encoder, model, tokenizer = main(train_df)

# Optionnel : Vérification du nombre de labels uniques
print(f"Nombre de labels uniques : {len(label_encoder.classes_)}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Époque 1/10 - Entraînement:  27%|██▋       | 2999/11302 [44:06<2:04:36,  1.11it/s, Loss=8.16]

Checkpoint sauvegardé : checkpoint_epoch_0_step_3000


Époque 1/10 - Entraînement:  27%|██▋       | 3000/11302 [44:08<3:21:55,  1.46s/it, Loss=8.16]

Checkpoint copié dans : /kaggle/working/download_dir/checkpoint_epoch_0_step_3000


Époque 1/10 - Entraînement:  53%|█████▎    | 5999/11302 [1:28:27<1:15:19,  1.17it/s, Loss=5.55]

Checkpoint sauvegardé : checkpoint_epoch_0_step_6000


Époque 1/10 - Entraînement:  53%|█████▎    | 6000/11302 [1:28:29<2:03:22,  1.40s/it, Loss=5.55]

Checkpoint copié dans : /kaggle/working/download_dir/checkpoint_epoch_0_step_6000


Époque 1/10 - Entraînement:  66%|██████▌   | 7416/11302 [1:49:23<57:19,  1.13it/s, Loss=7.91]  


KeyboardInterrupt: 