In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [None]:
# scp C:/Cours-Sorbonne/M1/S2/RITAL/TME_RI/PROJET/data/quora-question-pairs/train.csv  21318858@ppti-14-302-12:/tempory/M1-DAC-Stage-Tikai7/Github/Weakly-Supervised-Label-Smoothing-BERT/data


In [12]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from model.Train import Trainer
from model.Loss import LSmoothing, WSLSmoothing
from torch.utils.data import DataLoader
from model.Bert import BertForQuestionPairClassification
from model.DataManager import QuoraDataset
from transformers import BertTokenizer
from model.NegativeSampling import RandomSampling, BM25Sampling

## Loading Data

In [19]:
# --- Configuration -----------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

path = "data/train.csv"
bs = 32                # batch size of the training loader
bm25_sampling = False  # True -> BM25Sampling negatives, False -> RandomSampling negatives

# --- Load and split ----------------------------------------------------------
# NOTE(review): the second argument of load_data is presumably a row limit — confirm.
data = QuoraDataset.load_data(path, 1000)
data['global_docno'] = data.index.astype(str)
train_data, val_data, test_data = QuoraDataset.split_data(data)

# --- Negative sampling -------------------------------------------------------
# Every split is sampled with k=9 and then sorted by "question1".
if bm25_sampling:
    # All three indexes are built before any sampling starts.
    index_ref_tr = QuoraDataset.index_data(train_data, type_df="train_5")
    index_ref_val = QuoraDataset.index_data(val_data, type_df="val_5")
    index_ref_test = QuoraDataset.index_data(test_data, type_df="test_5")
    train_data, val_data, test_data = (
        BM25Sampling.sample(index_ref, frame, k=9).sort_values(by="question1")
        for index_ref, frame in (
            (index_ref_tr, train_data),
            (index_ref_val, val_data),
            (index_ref_test, test_data),
        )
    )
else:
    train_data, val_data, test_data = (
        RandomSampling.sample(frame, k=9).sort_values(by="question1")
        for frame in (train_data, val_data, test_data)
    )

# Overlap removal between train and val/test is currently disabled:
# val_data = QuoraDataset.remove_overlapping_questions(train_data, val_data)
# test_data = QuoraDataset.remove_overlapping_questions(train_data, test_data)
# val_data = QuoraDataset.remove_overlapping_questions(train_data, val_data, 'question2')
# test_data = QuoraDataset.remove_overlapping_questions(train_data, test_data, 'question2')

# --- Datasets and loaders ----------------------------------------------------
train_dataset, val_dataset, test_dataset = (
    QuoraDataset(frame, tokenizer, max_length=128)
    for frame in (train_data, val_data, test_data)
)

# Only the training loader shuffles; the evaluation loaders keep the sorted order.
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
# NOTE(review): batch_size=10 looks like 1 positive + k=9 sampled rows per question — confirm.
val_loader, test_loader = (
    DataLoader(dataset, batch_size=10, shuffle=False)
    for dataset in (val_dataset, test_dataset)
)

Train data :  872
Val data :  172
Test data :  172


In [14]:
# Run settings: use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
epochs = 10
learning_rate = 1e-4

model = BertForQuestionPairClassification().to(device)

# The optimizer *class* (not an instance) is handed to the trainer.
# NOTE(review): presumably Trainer instantiates it with `learning_rate` — confirm.
optimizer = torch.optim.AdamW
loss = LSmoothing()
trainer = Trainer()

# Fluent configuration followed by the training run; no test loader is passed (None).
history = (
    trainer.set_model(model)
    .set_loader(train_loader, val_loader, None)
    .set_loss_fn(loss)
    .set_optimizer(optimizer)
    .fit(learning_rate, epochs, CL=False)
)
trainer.save_model()

Training the model on cuda...
Training...
Validating...


ZeroDivisionError: division by zero

In [None]:
# Plot the training and validation loss series stored in `history` by the training run.
train_loss = history['training']['loss']
val_loss = history['validation']['loss']

plt.style.use('ggplot')

# Explicit figure/axes objects instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(train_loss, label='train loss')
ax.plot(val_loss, label='val loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
# One legend call is enough; it must come after both labelled curves are drawn.
ax.legend()
plt.show()

In [None]:
# Rebuild the architecture and restore the saved weights for evaluation.
model = BertForQuestionPairClassification()
model = model.to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on a
# GPU machine can still be loaded when this kernel only has a CPU.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load("/kaggle/input/model-wsls/model_wsls.pth", map_location=device)
model.load_state_dict(state_dict)

def recall_at_k(all_scores, all_targets, K, num_duplicate):
    """Compute recall@K for a single ranked list of candidates.

    Candidates are ordered by descending score; the targets of the first ``K``
    candidates are summed and divided by ``num_duplicate``.

    Args:
        all_scores: 1-D array-like of scores, one per candidate (higher ranks first).
        all_targets: 1-D array-like of relevance labels aligned with ``all_scores``.
        K: Number of top-ranked candidates to inspect. Values larger than the
            number of candidates simply inspect all of them.
        num_duplicate: Total number of relevant candidates, used as the denominator.

    Returns:
        float: ``sum(targets of the top K) / num_duplicate``, or ``0.0`` when
        ``num_duplicate`` is not positive (recall is undefined there and the
        division would otherwise be by zero).
    """
    if num_duplicate <= 0:
        return 0.0

    # Coerce to ndarrays so plain Python lists work with the fancy indexing below.
    scores = np.asarray(all_scores)
    targets = np.asarray(all_targets)

    # argsort is ascending, so reverse it before keeping the K best positions.
    top_k_indices = np.argsort(scores)[::-1][:K]
    return float(np.sum(targets[top_k_indices]) / num_duplicate)

def evaluate_ranking_model(model, data_loader, K, device):
    """Average recall@K of ``model`` over the batches of ``data_loader``.

    Each batch is scored as one ranking list: the softmax probability of
    class index 1 is the candidate score and recall@K is computed within the
    batch. Batches whose labels sum to zero are skipped.

    Args:
        model: Module called as ``model(**inputs)``; its output is soft-maxed
            over ``dim=1`` and column 1 is used as the score.
        data_loader: Iterable yielding ``(inputs, labels, ns_scores)`` where
            ``inputs`` is a dict of tensors. ``ns_scores`` is not used here.
        K: Cut-off rank passed to ``recall_at_k``.
        device: Device the model and the inputs are moved to.

    Returns:
        dict: ``{"recall_at_<K>": mean recall}``. The value is ``nan`` when no
        batch contained a positive label.
    """
    model.eval()
    model.to(device)
    recalls = []
    with torch.no_grad():
        for inputs, labels, _ns_scores in data_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            scores = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
            # Labels are only consumed by NumPy, so they never need to visit the device.
            targets = labels.cpu().numpy()
            num_duplicate = np.sum(targets)
            if num_duplicate == 0:
                # No positive in this batch: recall is undefined, leave it out of the mean.
                continue
            recalls.append(recall_at_k(scores, targets, K, num_duplicate))

    if recalls:
        avg_recall = np.mean(recalls)
    else:
        # Same value np.mean([]) would produce, without its RuntimeWarning.
        avg_recall = float("nan")
    return {
        f"recall_at_{K}": avg_recall
    }

# Score the reloaded model on the test loader with a cut-off of K=1.
metrics = evaluate_ranking_model(
    model=model,
    data_loader=test_loader,
    K=1,
    device=device,
)
print("Ranking Metrics:", metrics)