In [1]:
import nltk

# Télécharger WordNet dans un répertoire spécifique
nltk.download('wordnet', download_dir='/kaggle/working//nltk_data')

# Ajouter le chemin des données téléchargées
nltk.data.path.append('/kaggle/working//nltk_data')

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working//nltk_data...


In [3]:
import zipfile
import os

# Chemin du fichier ZIP
zip_path = "/kaggle/working/nltk_data/corpora/wordnet.zip"
extract_path = "/kaggle/working/nltk_data/corpora/"  # Chemin où extraire les fichiers

# Décompression
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Contenu extrait dans {extract_path}")


Contenu extrait dans /kaggle/working/nltk_data/corpora/


In [4]:
# Tester si WordNet fonctionne
from nltk.corpus import wordnet as wn
print(wn.synsets('bank'))

[Synset('bank.n.01'), Synset('depository_financial_institution.n.01'), Synset('bank.n.03'), Synset('bank.n.04'), Synset('bank.n.05'), Synset('bank.n.06'), Synset('bank.n.07'), Synset('savings_bank.n.02'), Synset('bank.n.09'), Synset('bank.n.10'), Synset('bank.v.01'), Synset('bank.v.02'), Synset('bank.v.03'), Synset('bank.v.04'), Synset('bank.v.05'), Synset('deposit.v.02'), Synset('bank.v.07'), Synset('trust.v.01')]


In [50]:
import os
import csv
import math
import random
import logging
import argparse
import itertools
from tabulate import tabulate
from collections import namedtuple

import torch
import numpy as np
from tqdm import tqdm
from tqdm.auto import tqdm
import time
from sklearn.metrics import precision_score
import numpy as np
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    BertModel, BertConfig, BertPreTrainedModel,
    BertTokenizer, AdamW, get_linear_schedule_with_warmup
)

In [20]:
# Configuration des journaux
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Structures de données
GlossSelectionRecord = namedtuple("GlossSelectionRecord", ["guid", "sentence", "sense_keys", "glosses", "targets"])
BertInput = namedtuple("BertInput", ["input_ids", "input_mask", "segment_ids", "label_id"])

In [21]:
class WSDDataset(Dataset):
    def __init__(self, features):
        self.features = features

    def __getitem__(self, index):
        return self.features[index]

    def __len__(self):
        return len(self.features)

In [22]:
class BertWSD(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.ranking_linear = torch.nn.Linear(config.hidden_size, 1)
        self.init_weights()

def _compute_weighted_loss(loss, weighting_factor):
    """Calcul d'une perte pondérée"""
    squared_factor = weighting_factor ** 2
    return 1 / (2 * squared_factor) * loss + math.log(1 + squared_factor)

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Tronque une paire de séquences à la longueur maximale"""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [23]:
def load_dataset(
    csv_path, 
    tokenizer, 
    max_sequence_length, 
    max_samples=None
):
    """
    Charge le jeu de données à partir d'un fichier CSV avec option de sous-échantillonnage.
    
    Args:
        csv_path (str): Chemin vers le fichier CSV
        tokenizer (BertTokenizer): Tokenizer BERT
        max_sequence_length (int): Longueur maximale des séquences
        max_samples (int, optional): Nombre maximal d'échantillons à charger
    
    Returns:
        WSDDataset: Jeu de données pour l'entraînement ou l'évaluation
    """
    def _deserialize_csv_record(row):
        return GlossSelectionRecord(
            row[0],  # guid
            row[1],  # sentence
            eval(row[2]),  # sense_keys
            eval(row[3]),  # glosses
            [int(t) for t in eval(row[4])]  # targets
        )

    def _create_records_from_csv(csv_path, deserialize_fn, max_samples=None):
        """
        Crée des enregistrements à partir d'un fichier CSV avec sous-échantillonnage.
        
        Args:
            csv_path (str): Chemin du fichier CSV
            deserialize_fn (callable): Fonction de désérialisation
            max_samples (int, optional): Nombre maximal d'échantillons
        
        Returns:
            list: Liste d'enregistrements
        """
        records = []
        with open(csv_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            next(reader)  # Ignorer l'en-tête
            
            # Utiliser itertools pour limiter les échantillons
            for row in itertools.islice(reader, max_samples):
                records.append(deserialize_fn(row))
        
        return records

    # Charger les enregistrements avec limitation optionnelle
    records = _create_records_from_csv(
        csv_path, 
        _deserialize_csv_record, 
        max_samples
    )
    
    # Convertir en features
    features = _create_features_from_records(
        records, 
        max_sequence_length, 
        tokenizer
    )
    
    # Log du nombre d'échantillons chargés
    logger.info(f"Chargé {len(features)} échantillons depuis {csv_path}")
    
    return WSDDataset(features)

In [25]:
def _create_features_from_records(records, max_seq_length, tokenizer):
    """Convertit les enregistrements en features pour BERT"""
    features = []
    for record in tqdm(records, desc="Conversion des données"):
        tokens_a = tokenizer.tokenize(record.sentence)
        sequences = [(gloss, 1 if i in record.targets else 0) for i, gloss in enumerate(record.glosses)]

        pairs = []
        for seq, label in sequences:
            tokens_b = tokenizer.tokenize(seq)
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

            tokens = tokens_a + ['[SEP]']
            segment_ids = [0] * len(tokens)

            tokens += tokens_b + ['[SEP]']
            segment_ids += [1] * (len(tokens_b) + 1)

            tokens = ['[CLS]'] + tokens
            segment_ids = [0] + segment_ids

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            padding_length = max_seq_length - len(input_ids)
            input_ids += [0] * padding_length
            input_mask += [0] * padding_length
            segment_ids += [0] * padding_length

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            pairs.append(
                BertInput(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label)
            )

        features.append(pairs)

    return features

def collate_batch(batch):
    """Regroupe les lots de données"""
    max_seq_length = len(batch[0][0].input_ids)

    collated = []
    for sub_batch in batch:
        batch_size = len(sub_batch)
        sub_collated = [torch.zeros([batch_size, max_seq_length], dtype=torch.long) for _ in range(3)] + \
                       [torch.zeros([batch_size], dtype=torch.long)]

        for i, bert_input in enumerate(sub_batch):
            sub_collated[0][i] = torch.tensor(bert_input.input_ids, dtype=torch.long)
            sub_collated[1][i] = torch.tensor(bert_input.input_mask, dtype=torch.long)
            sub_collated[2][i] = torch.tensor(bert_input.segment_ids, dtype=torch.long)
            sub_collated[3][i] = torch.tensor(bert_input.label_id, dtype=torch.long)

        collated.append(sub_collated)

    return collated

def forward_gloss_selection(model, batches, device):
    """Effectue une passe avant pour la sélection de gloses"""
    batch_loss = 0
    logits_list = []
    loss_fn = torch.nn.CrossEntropyLoss()

    for batch in batches:
        batch = tuple(t.to(device) for t in batch)
        outputs = model.bert(input_ids=batch[0], attention_mask=batch[1], token_type_ids=batch[2])
        hidden_state = model.dropout(outputs[1])

        logits = model.ranking_linear(hidden_state).squeeze(-1)
        labels = torch.max(batch[3], -1).indices.detach()
        batch_loss += loss_fn(logits.unsqueeze(dim=0), labels.unsqueeze(dim=-1))
        logits_list.append(logits)

    loss = batch_loss / len(batches)
    return loss, logits_list

In [29]:
def train_wsd(train_path, eval_path, output_dir='./results',max_train_samples=None, max_eval_samples=None):
    """
    Entraînement du modèle de sélection de gloses avec sous-échantillonnage.
    
    Args:
        train_path (str): Chemin du fichier CSV d'entraînement
        eval_path (str): Chemin du fichier CSV d'évaluation
        output_dir (str, optional): Répertoire de sauvegarde du modèle
        max_train_samples (int, optional): Nombre maximal d'échantillons d'entraînement
        max_eval_samples (int, optional): Nombre maximal d'échantillons d'évaluation
    """
    # Configuration
    max_seq_length = 128
    batch_size = 8
    num_train_epochs = 3
    learning_rate = 5e-5
    seed = 42

    # Configuration du seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Périphérique
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Utilisation du périphérique : {device}")

    # Modèle et Tokenizer
    model_name = 'bert-base-cased'
    config = BertConfig.from_pretrained(model_name, num_labels=2)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertWSD.from_pretrained(model_name, config=config)

    # Ajout du token spécial
    if '[TGT]' not in tokenizer.additional_special_tokens:
        tokenizer.add_special_tokens({'additional_special_tokens': ['[TGT]']})
        model.resize_token_embeddings(len(tokenizer))

    model.to(device)
    
    # Chargement des données avec sous-échantillonnage optionnel
    train_dataset = load_dataset(
        train_path, 
        tokenizer, 
        max_sequence_length=max_seq_length, 
        max_samples=max_train_samples
    )

    train_sampler = RandomSampler(train_dataset)
    
    train_dataloader = DataLoader(
        train_dataset, 
        sampler=train_sampler, 
        batch_size=batch_size, 
        collate_fn=collate_batch
    )
    
    # Optionnel : charger le jeu de données d'évaluation
    if max_eval_samples is not None:
        eval_dataset = load_dataset(
            eval_path, 
            tokenizer, 
            max_sequence_length=max_seq_length, 
            max_samples=max_eval_samples
        )
    
    # Préparation de l'optimiseur
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 
            'weight_decay': 0.0
        }
    ]

    total_steps = len(train_dataloader) * num_train_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=total_steps
    )
    
    logger.info("🚀 Entrainement Model Word Sense Disambiguation ")
    logger.info(f"Nombre total de lots de formatio (Batches): {len(train_dataloader)}")
    logger.info(f"Device: {device}")
    
    # Boucle d'entraînement
    for epoch in range(num_train_epochs):
        model.train()
        total_loss = 0

        # Track predictions and labels for precision calculation
        all_preds = []
        all_labels = []

        # Create epoch progress bar
        with tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_train_epochs}", 
                          unit="batch", colour="green") as epoch_iterator:
        
            start_time = time.time()
    
            for step, batches in enumerate(epoch_iterator):
                loss, logits_list = forward_gloss_selection(model, batches, device)

                # Collect predictions and true labels
                for batch_logits, batch in zip(logits_list, batches):
                    # Convert logits to predictions
                    preds = (batch_logits > 0.5).cpu().numpy().astype(int)
                    
                    # Get true labels
                    labels = batch[3].cpu().numpy()
                    
                    all_preds.extend(preds)
                    all_labels.extend(labels)
    
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    
                optimizer.step()
                scheduler.step()
    
                total_loss += loss.item()

                # Calculate precision
                try:
                    precision = precision_score(all_labels, all_preds, zero_division=0)
                except:
                    precision = 0
    
                 # Update progress bar with real-time metrics
                epoch_iterator.set_postfix({
                    'Loss': f'{loss.item():.4f}', 
                    'Avg Loss': f'{total_loss/(step+1):.4f}',
                    'Precision': f'{precision:.4f}',
                    'Learning Rate': f'{scheduler.get_last_lr()[0]:.6f}'
                })
    
                # End of epoch summary
                epoch_duration = time.time() - start_time
                # End of epoch summary
                final_precision = precision_score(all_labels, all_preds, zero_division=0)
                logger.info(f"Epoch {epoch+1} completed in {epoch_duration:.2f} seconds. "
                            f"Average Loss: {total_loss/len(train_dataloader):.4f}"
                            f"Precision: {final_precision:.4f}")
    
    
                if step % 100 == 0:
                    logger.info(f"Époque {epoch}, Étape {step}, Perte : {loss.item()}")

    logger.info("✅ Entrainement terminé avec succès")
    """   # Sauvegarde du modèle
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)"""

    # Sauvegarder tout le modèle
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model, '/kaggle/working/modelPytorch/on_modele.pth')
    
    logger.info(f"Modèle entraîné et sauvegardé dans {output_dir}")

In [9]:
# Charger seulement 5000 échantillons pour un test rapide
train_wsd(
    train_path='/kaggle/input/dataset/corpus_dir-max_num_gloss5-augmented.csv', 
    eval_path='/kaggle/input/dataset/semeval2007-max_num_gloss5-augmented.csv', 
    max_train_samples=50000, #40000==4heure      # Limiter à 5000 échantillons
    max_eval_samples=10000  #8000==20%      # Limiter à 1000 échantillons d'évaluation
)

# Charger tous les échantillons (comportement par défaut)
# train_wsd(train_path='/chemin/vers/train.csv', eval_path='/chemin/vers/eval.csv')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertWSD were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['ranking_linear.bias', 'ranking_linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Conversion des données:   0%|          | 0/50000 [00:00<?, ?it/s]

Conversion des données:   0%|          | 0/10000 [00:00<?, ?it/s]



Epoch 1/3:   0%|          | 0/6250 [00:00<?, ?batch/s]

Epoch 2/3:   0%|          | 0/6250 [00:00<?, ?batch/s]

Epoch 3/3:   0%|          | 0/6250 [00:00<?, ?batch/s]

In [30]:
import os
import csv

def write_predictions(output_dir, eval_path, predictions, suffix=None):
    """
    Écrit les prédictions dans un fichier de sortie.
    
    Args:
        output_dir (str): Répertoire de sortie pour les fichiers de prédictions
        eval_path (str): Chemin du fichier d'évaluation original
        predictions (list): Liste des prédictions du modèle
        suffix (str, optional): Suffixe à ajouter au nom du fichier de sortie
    """
    # Créer le répertoire de sortie s'il n'existe pas
    os.makedirs(output_dir, exist_ok=True)
    
    # Générer le nom de fichier de sortie
    base_filename = os.path.splitext(os.path.basename(eval_path))[0]
    output_filename = f"{base_filename}_predictions"
    if suffix:
        output_filename += f"_{suffix}"
    output_filename += ".csv"
    
    # Chemin complet du fichier de sortie
    output_path = os.path.join(output_dir, output_filename)
    
    # Lire le fichier d'évaluation original
    try:
        with open(eval_path, 'r', newline='', encoding='utf-8') as eval_file:
            reader = csv.reader(eval_file)
            original_data = list(reader)
        
        # Ajouter les prédictions aux données originales
        for i, pred in enumerate(predictions):
            if i < len(original_data):
                original_data[i].append(str(pred))
        
        # Écrire les données avec prédictions
        with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
            writer = csv.writer(output_file)
            writer.writerows(original_data)
        
        logger.info(f"Prédictions écrites dans {output_path}")
    
    except Exception as e:
        logger.error(f"Erreur lors de l'écriture des prédictions : {e}")

In [36]:
def evaluate(
    model, 
    tokenizer, 
    eval_path, 
    max_seq_length=128, 
    eval_batch_size=16, 
    output_dir='./results',
    suffix=None,
    max_eval_samples=None  # Nouveau paramètre pour limiter le nombre d'exemples
):
    """
    Évalue un modèle de désambiguïsation lexicale (WSD).
    
    Args:
        model: Modèle de WSD à évaluer
        tokenizer: Tokenizer correspondant au modèle
        eval_path: Chemin vers le jeu de données d'évaluation
        max_seq_length: Longueur maximale des séquences
        eval_batch_size: Taille des lots pour l'évaluation
        output_dir: Répertoire de sortie pour les prédictions
        suffix: Suffixe optionnel pour les fichiers de sortie
        max_eval_samples: Nombre maximal d'exemples à évaluer (optionnel)
    
    Returns:
        float: Perte moyenne d'évaluation
    """
    # Charger le dataset d'évaluation
    eval_dataset = load_dataset(
        eval_path, 
        tokenizer, 
        max_sequence_length=max_seq_length,
        max_samples=max_eval_samples
    )
    
    # Limiter le nombre d'exemples si max_eval_samples est spécifié
    if max_eval_samples is not None:
        eval_dataset = eval_dataset[:max_eval_samples]
        logger.info(f"Nombre d'exemples limité à : {len(eval_dataset)}")
    
    # Créer le DataLoader
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler, 
        batch_size=eval_batch_size,
        collate_fn=collate_batch
    )
    
    # Préparer l'évaluation
    logger.info("***** Début de l'évaluation *****")
    logger.info(f"Nombre d'exemples : {len(eval_dataset)}")
    logger.info(f"Taille des lots : {eval_batch_size}")
    
    # Variables de suivi
    eval_loss = 0.0
    nb_eval_steps = 0
    predictions = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Mode évaluation
    model.eval()
    
    # Évaluation
    for batches in tqdm(eval_dataloader, desc="Évaluation en cours"):
        with torch.no_grad():
            # Utiliser votre fonction de perte spécifique
            loss, logits_list = forward_gloss_selection(model, batches, device)
        
        # Collecter les pertes et prédictions
        eval_loss += loss
        predictions.extend([torch.argmax(logits, dim=-1).item() for logits in logits_list])
        nb_eval_steps += 1
    
    # Calculer la perte moyenne
    eval_loss = eval_loss / nb_eval_steps
    
    # Écrire les prédictions
    write_predictions(output_dir, eval_path, predictions, suffix='prediction_v1')
    
    return eval_loss.item()

In [38]:
# Exemple d'utilisation
def main():
    # Charger votre modèle et tokenizer
    model = BertWSD.from_pretrained('/kaggle/working/mesModels')
    tokenizer = BertTokenizer.from_pretrained('/kaggle/working/mesModels')
    
    # Chemin vers votre jeu de données d'évaluation
    eval_path = '/kaggle/input/dataset/semeval2007-max_num_gloss5-augmented.csv'
    
    # Évaluer le modèle
    loss = evaluate(
        model=model, 
        tokenizer=tokenizer, 
        eval_path=eval_path,
        max_seq_length=128,
        eval_batch_size=16,
        output_dir='./evaluation_results',
        max_eval_samples=1000
    )

    print(f"Perte d'évaluation : {loss}")

if __name__ == '__main__':
    main()

Conversion des données:   0%|          | 0/1000 [00:00<?, ?it/s]

Évaluation en cours:   0%|          | 0/63 [00:00<?, ?it/s]

Perte d'évaluation : 2.259589433670044


In [39]:
import re
import torch
from torch.nn.functional import softmax
from tqdm import tqdm

# S'assurer que get_glosses est importée ou définie
from nltk.corpus import wordnet as wn

WORDNET_POS = {'VERB': wn.VERB, 'NOUN': wn.NOUN, 'ADJ': wn.ADJ, 'ADV': wn.ADV}

def get_glosses(lemma, pos):
    """
    Récupère les définitions (glosses) pour un mot donné.
    """
    results = dict()
    wn_pos = WORDNET_POS.get(pos, None) if pos is not None else None
    morphemes = wn._morphy(lemma, pos=wn_pos) if pos is not None else []
    for synset in set(wn.synsets(lemma, pos=wn_pos)):
        sense_key = None
        for l in synset.lemmas():
            if l.name().lower() == lemma.lower():
                sense_key = l.key()
                break
            elif l.name().lower() in morphemes:
                sense_key = l.key()
        if sense_key is not None:
            results[sense_key] = synset.definition()
    return results

In [48]:
def get_predictions(model, tokenizer, sentence):
    re_result = re.search(r"\[TGT\](.*)\[TGT\]", sentence)
    if re_result is None:
        print("\nIncorrect input format. Please try again.")
        return

    ambiguous_word = re_result.group(1).strip()
    sense_keys = []
    definitions = []
    for sense_key, definition in get_glosses(ambiguous_word, None).items():
        sense_keys.append(sense_key)
        definitions.append(definition)

    MAX_SEQ_LENGTH = 128
    record = GlossSelectionRecord("test", sentence, sense_keys, definitions, [-1])
    features = _create_features_from_records([record], MAX_SEQ_LENGTH, tokenizer,
                                             )[0]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    with torch.no_grad():
        logits = torch.zeros(len(definitions), dtype=torch.double).to(device)
        for i, bert_input in tqdm(list(enumerate(features)), desc="Progress"):
            logits[i] = model.ranking_linear(
                model.bert(
                    input_ids=torch.tensor(bert_input.input_ids, dtype=torch.long).unsqueeze(0).to(device),
                    attention_mask=torch.tensor(bert_input.input_mask, dtype=torch.long).unsqueeze(0).to(device),
                    token_type_ids=torch.tensor(bert_input.segment_ids, dtype=torch.long).unsqueeze(0).to(device)
                )[1]
            )
        scores = softmax(logits, dim=0)

    return sorted(zip(sense_keys, definitions, scores), key=lambda x: x[-1], reverse=True)

In [None]:
cls_token=tokenizer.cls_token,
                                             sep_token=tokenizer.sep_token,
                                             cls_token_segment_id=1,
                                             pad_token_segment_id=0,
                                             disable_progress_bar=True

In [51]:
def main():
    # Load fine-tuned model and vocabulary
    print("Loading model...")
    model = BertWSD.from_pretrained("/kaggle/working/mesModels")
    tokenizer = BertTokenizer.from_pretrained("/kaggle/working/mesModels")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    while True:
        sentence = input("\nEnter a sentence with an ambiguous word surrounded by [TGT] tokens\n> ")
        predictions = get_predictions(model, tokenizer, sentence)
        if predictions:
            print("\nPredictions:")
            print(tabulate(
                [[f"{i+1}.", key, gloss, f"{score:.5f}"] for i, (key, gloss, score) in enumerate(predictions)],
                headers=["No.", "Sense key", "Definition", "Score"])
            )
            # for i, (sense_key, definition, score) in enumerate(predictions):
            #     # print(f"  {i + 1:>3}. sense key: {sense_key:<15} score: {score:<8.5f} definition: {definition}")


if __name__ == '__main__':
    main()

Loading model...



Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  



Incorrect input format. Please try again.



Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  He caught a [TGT] bass [TGT] yesterday.


Conversion des données:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/9 [00:00<?, ?it/s]


Predictions:
  No.  Sense key            Definition                                                                                             Score
-----  -------------------  ---------------------------------------------------------------------------------------------------  -------
    1  bass%1:06:02::       the member with the lowest range of a family of musical instruments                                  0.50121
    2  bass%1:05:00::       nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes               0.44852
    3  bass%1:13:01::       any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)  0.01779
    4  bass%1:07:01::       the lowest part of the musical range                                                                 0.01545
    5  bass%1:13:02::       the lean flesh of a saltwater fish of the family Serranidae                                          0.00991
    6  bass%1:10:01::      


Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  She went to the [TGT] bank [TGT] to deposit money


Conversion des données:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/18 [00:00<?, ?it/s]


Predictions:
  No.  Sense key       Definition                                                                                                                           Score
-----  --------------  ---------------------------------------------------------------------------------------------------------------------------------  -------
    1  bank%1:14:00::  a financial institution that accepts deposits and channels the money into lending activities                                       0.57912
    2  bank%1:06:00::  a building in which the business of banking transacted                                                                             0.41623
    3  bank%2:40:00::  put into a bank account                                                                                                            0.00458
    4  bank%2:40:02::  do business with a bank or keep an account at a bank                                                                               6e-05
    5  bank%2:40


Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  We have a [TGT] date [TGT] for dinner tomorrow night


Conversion des données:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/13 [00:00<?, ?it/s]


Predictions:
  No.  Sense key       Definition                                                                                                 Score
-----  --------------  -------------------------------------------------------------------------------------------------------  -------
    1  date%2:41:00::  go on a date with                                                                                         0.9923
    2  date%2:31:00::  assign a date to; determine the (probable) date of                                                        0.0047
    3  date%1:14:00::  a meeting arranged in advance                                                                             0.0029
    4  date%1:18:00::  a participant in a date                                                                                   4e-05
    5  date%1:28:00::  the specified day of the month                                                                            3e-05
    6  date%1:28:05::  a particular 


Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  I will have a [TGT]meeting[TGT] this night


Conversion des données:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/6 [00:00<?, ?it/s]


Predictions:
  No.  Sense key          Definition                                                         Score
-----  -----------------  ---------------------------------------------------------------  -------
    1  meeting%1:14:00::  a formally arranged gathering                                    0.95642
    2  meeting%1:11:00::  a casual or unexpected convergence                               0.04358
    3  meeting%1:04:00::  the social act of assembling for some common purpose             0
    4  meeting%1:14:01::  a small informal social gathering                                0
    5  meeting%1:04:02::  the act of joining together as one                               0
    6  meeting%1:15:00::  a place where things merge or flow together (especially rivers)  0



Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  i go to [TGT]market[TGT]


Conversion des données:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/9 [00:00<?, ?it/s]


Predictions:
  No.  Sense key         Definition                                                                       Score
-----  ----------------  -----------------------------------------------------------------------------  -------
    1  market%1:04:00::  the world of commercial activity where goods and services are bought and sold  0.95843
    2  market%1:14:00::  the customers for a particular product or service                              0.03516
    3  market%1:06:00::  a marketplace where groceries are sold                                         0.00627
    4  market%2:40:00::  engage in the commercial promotion, sale, or distribution of                   0.0001
    5  market%1:06:01::  an area in a town where a public mercantile establishment is set up            2e-05
    6  market%2:40:05::  buy household supplies                                                         2e-05
    7  market%2:40:01::  deal in a market                                                      


Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  the [TGT] date [TGT] of the meeting was postponed.


Conversion des données:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/13 [00:00<?, ?it/s]


Predictions:
  No.  Sense key       Definition                                                                                                 Score
-----  --------------  -------------------------------------------------------------------------------------------------------  -------
    1  date%1:28:00::  the specified day of the month                                                                           0.79789
    2  date%2:31:00::  assign a date to; determine the (probable) date of                                                       0.20169
    3  date%1:28:05::  a particular day specified as the time something happens                                                 0.00038
    4  date%1:28:02::  the present                                                                                              2e-05
    5  date%1:28:04::  a particular but unspecified point in time                                                               1e-05
    6  date%1:28:03::  the particular 


Enter a sentence with an ambiguous word surrounded by [TGT] tokens
>  quit



Incorrect input format. Please try again.


KeyboardInterrupt: Interrupted by user