In [10]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell) so that edits to the local `model` package imported in
# the imports cell are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [11]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
from model.Train import Trainer
from model.Loss import LSmoothing
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

## Constants

In [12]:
# --- Experiment configuration -------------------------------------------
nb_labels = 2          # number of output labels
epochs = 20            # handed to Trainer.fit together with learning_rate
learning_rate = 1e-4
batch_size = 32        # mini-batch size of the train and validation DataLoaders
max_len = 128          # max_length every encoded question pair is padded/truncated to

# Run on the GPU whenever PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


## Loading Data

In [13]:
# Location of the Quora question-pairs training CSV (relative path, resolved
# against the notebook's working directory).
path = "data/quora-question-pairs/train.csv"
df = pd.read_csv(path)
# Preview the first rows; later cells read the 'question1', 'question2' and
# 'is_duplicate' columns of this frame.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [14]:
# Report the size and the number of positive (duplicate) pairs of the frame
# as it is before subsampling.
print("Number of question pairs: ", len(df))
print("Number of duplicate question pairs: ", df['is_duplicate'].sum())

# Continue with a random subset of at most 10000 pairs.
# - random_state pins the subset so the tokenized data, the split and the
#   training curves are reproducible across kernel restarts (same seed as the
#   train/validation split further down).
# - min(...) avoids the ValueError that DataFrame.sample raises when asked
#   for more rows than the frame contains.
df = df.sample(n=min(10000, len(df)), random_state=42)

Number of question pairs:  404290
Number of duplicate question pairs:  149263


In [15]:
# Simplified preprocessing: coerce both question columns to str so that the
# tokenizer only ever receives string values.
df['question1'] = df['question1'].apply(str)
df['question2'] = df['question2'].apply(str)

# Tokenization and data preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def encode_questions(question1, question2):
    """Encode a question pair with the notebook-level BERT ``tokenizer``.

    The two texts are passed to the tokenizer as a sequence pair, truncated
    and padded to ``max_len`` tokens, and returned as PyTorch tensors.

    Args:
        question1: first question, forwarded as the tokenizer's first argument.
        question2: second question, forwarded as the tokenizer's second argument.

    Returns:
        Whatever ``tokenizer(...)`` returns for these options; later cells
        read its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    encoding_options = dict(
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    return tokenizer(question1, question2, **encoding_options)

# Encode every pair in ONE batched tokenizer call. The tokenizer accepts lists
# of texts / text pairs, and because each pair is padded/truncated to max_len
# the result is already stacked along the batch dimension, so the per-row
# calls and the subsequent torch.cat passes are unnecessary.
inputs = encode_questions(df['question1'].tolist(), df['question2'].tolist())
labels = torch.tensor(df['is_duplicate'].values)

input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
# NOTE(review): for sentence pairs the BERT tokenizer normally also returns
# 'token_type_ids' (segment ids); they are not forwarded to the datasets here.
# Confirm whether model.Train.Trainer can consume them before adding them.

# 90/10 train/validation split; the three tensors are split with the same
# permutation so ids, masks and labels stay aligned.
(train_ids, val_ids,
 train_mask, val_mask,
 train_labels, val_labels) = train_test_split(
    input_ids, attention_mask, labels, test_size=0.1, random_state=42
)

train_data = TensorDataset(train_ids, train_mask, train_labels)
val_data = TensorDataset(val_ids, val_mask, val_labels)

# Only the training batches are reshuffled; validation keeps a fixed order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [16]:
# Pre-trained BERT encoder plus a classification head (the head weights are
# not part of the checkpoint and are newly initialised). The head size comes
# from the `nb_labels` constant rather than a repeated literal, so the
# constants cell stays the single source of truth.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=nb_labels)
model = model.to(device)

# The optimizer is handed over as a class, not an instance.
# NOTE(review): presumably Trainer instantiates it with the learning rate given
# to fit() — confirm in model.Train. Also, transformers.AdamW is deprecated
# upstream in favour of torch.optim.AdamW; their default hyper-parameters may
# differ, so check them before switching.
optimizer = AdamW
loss = LSmoothing()
trainer = Trainer()

# Fluent configuration of the trainer; the value returned by fit() is kept as
# `history` and read by the plotting cell.
history = (
    trainer.set_model(model)
    .set_loader(train_loader, val_loader, None)
    .set_loss_fn(loss)
    .set_optimizer(optimizer)
    .fit(learning_rate, epochs)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training the model on cuda...
Training...




In [None]:
# Loss series recorded by the trainer (assumed to hold one value per epoch,
# which is what the x-axis label below states — confirm in model.Train).
train_loss = history['training']['loss']
val_loss = history['validation']['loss']

plt.style.use('ggplot')

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(train_loss, label='train loss')
ax.plot(val_loss, label='val loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()  # one legend call is enough; a second one only redraws the same legend
plt.show()