In [20]:
import pandas as pd
import numpy as np
import os
import re
import sys
import torch

!pip install sentencepiece
!pip install transformers
!pip install wandb

from transformers import T5Tokenizer, T5ForConditionalGeneration
import wandb

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from torch.nn.utils.rnn import pad_sequence
from torch import cuda



# Mount Google Drive at /content/gdrive (Colab-only import) so the notebook can reach its files.
from google.colab import drive
drive.mount('/content/gdrive')
# Make the CQR project folder on the mounted drive the working directory.
%cd /content/gdrive/MyDrive/CQR/

# Base directory prepended to every data/output path built in the cells below.
path = "."

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/CQR


Extraction des données de CANARD sous forme de DataFrame (tableau à deux colonnes) pour les donner au modèle.

*   col 1 = Context = contexte (requête non reformulée + requêtes précédentes et réponses associées)
*   col 2 = Rewrite = requête reformulée à obtenir



In [21]:
def read_trec(year):
    """Load the TREC CSV of the given year as a DataFrame.

    Reads ``{path}/TREC/trec{year}.csv`` (``path`` is the module-level base
    directory) and calls ``reset_index()`` on the result before returning it.

    Only the years 2020 and 2021 are accepted; any other value trips the
    assertion before any file is opened.
    """
    assert year in (2020, 2021)
    frame = pd.read_csv(f'{path}/TREC/trec{year}.csv')
    return frame.reset_index()

def collectData():
    """Load the CANARD train and test splits.

    Returns:
        A ``(train, test)`` tuple of DataFrames read from
        ``{path}/CANARD/train_.csv`` and ``{path}/CANARD/test_.csv``, each
        restricted to the "Context" and "Rewrite" columns (in that order).
    """
    columns = ["Context", "Rewrite"]
    # Training split
    train_split = pd.read_csv(f'{path}/CANARD/train_.csv')
    # Test split
    test_split = pd.read_csv(f'{path}/CANARD/test_.csv')
    return train_split[columns], test_split[columns]


Classe qui encapsule les DataFrames (tableaux à deux colonnes) collectés précédemment afin de les fournir au modèle.


In [22]:
# Wraps a (Context, Rewrite) DataFrame so that its rows can be tokenized and passed to the model.
class Dataset_2(Dataset):
    """Torch dataset that tokenizes one (context, rewrite) pair per item.

    Args:
        dataframe: pandas DataFrame with a ``Context`` column (model input)
            and a ``Rewrite`` column (target reformulation).
        tokenizer: tokenizer exposing ``batch_encode_plus``.
        context_len: maximum number of tokens kept for a context.
        rewrite_len: maximum number of tokens kept for a rewrite.
    """

    def __init__(self, dataframe, tokenizer, context_len = 512, rewrite_len = 128):
        self.tokenizer = tokenizer
        self.rewrite = dataframe.Rewrite  # target questions (after reformulation)
        self.context = dataframe.Context  # contexts

        self.context_len = context_len  # max encoded length of a context
        self.rewrite_len = rewrite_len  # max encoded length of a rewrite

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.context)

    def _encode(self, text, max_length):
        """Tokenize one text and return its 1-D ``(input_ids, attention_mask)``."""
        # Collapse every run of whitespace into a single space before encoding.
        text = ' '.join(str(text).split())
        encoded = self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            padding='longest',
            return_tensors='pt',
            truncation=True,
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse a one-token sequence to a 0-d tensor, which cannot be padded
        # together with the other sequences of a batch.
        input_ids = encoded['input_ids'].squeeze(0).to(dtype=torch.long)
        attention_mask = encoded['attention_mask'].squeeze(0).to(dtype=torch.long)
        return input_ids, attention_mask

    def __getitem__(self, index):
        """Return the input ids and attention masks of row number ``index``.

        ``index`` is positional (0 .. len-1), which is what a DataLoader
        sampler produces; ``.iloc`` keeps the lookup correct even when the
        DataFrame index is not a 0..n-1 range (e.g. after filtering rows).
        """
        context_ids, context_mask = self._encode(self.context.iloc[index], self.context_len)
        rewrite_ids, rewrite_mask = self._encode(self.rewrite.iloc[index], self.rewrite_len)

        return {
            'context_ids': context_ids,
            'context_mask': context_mask,
            'rewrite_ids': rewrite_ids,
            'rewrite_mask': rewrite_mask,
        }

In [23]:
# Collects the attention masks / input ids of the contexts and rewrites of a batch.
def my_collate(batch, pad_token_id=0):
    """Pad the variable-length tensors of a batch and stack them.

    Args:
        batch: list of items as produced by the dataset, each a dict with 1-D
            tensors under 'context_ids', 'context_mask', 'rewrite_ids' and
            'rewrite_mask'.
        pad_token_id: value used to pad the id tensors. The default 0 is the
            value ``pad_sequence`` used implicitly before; it has to equal the
            tokenizer's pad token id for the padding to be masked out of the
            loss during training.

    Returns:
        A dict with the same four keys, each a tensor of shape
        (batch_size, longest_sequence_in_batch). Attention masks are always
        padded with 0.
    """
    def _pad(key, padding_value):
        return pad_sequence(
            [item[key] for item in batch],
            batch_first=True,
            padding_value=padding_value,
        )

    # pad_sequence already returns stacked (batch, length) tensors, so the
    # batch dict can be built directly. 'rewrite_mask' is now returned too;
    # it used to be padded and then silently discarded.
    return {
        'context_ids': _pad('context_ids', pad_token_id),
        'context_mask': _pad('context_mask', 0),
        'rewrite_ids': _pad('rewrite_ids', pad_token_id),
        'rewrite_mask': _pad('rewrite_mask', 0),
    }

In [24]:
# Training function for the model.
# Inspired by Nam Le Hai
def train(tokenizer, model, device, loader, optimizer):
    """Run one training epoch over ``loader``.

    Args:
        tokenizer: tokenizer whose ``pad_token_id`` marks padding in the targets.
        model: sequence-to-sequence model called with ``input_ids``,
            ``attention_mask`` and ``labels``; the loss is read from the first
            element of its output.
        device: device the batch tensors are moved to.
        loader: iterable of batches, each a dict with 'context_ids',
            'context_mask' and 'rewrite_ids' tensors.
        optimizer: optimizer stepped once per batch.

    Returns:
        The mean loss over the batches of the epoch as a float, or ``None``
        when ``loader`` yields no batch.
    """
    model.train()
    loss_sum = 0.0
    n_batches = 0
    for data in loader:
        target_ids = data['rewrite_ids'].to(device, dtype = torch.long)
        # Padding positions get the label -100 so they are ignored by the loss.
        # masked_fill returns a new tensor, leaving the loader's tensor untouched.
        labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)
        ids = data['context_ids'].to(device, dtype = torch.long)
        mask = data['context_mask'].to(device, dtype = torch.long)

        # Only `labels` is passed: the model then builds the decoder inputs
        # itself by shifting the labels right and prepending its decoder start
        # token. Shifting by hand (decoder_input_ids=y[:, :-1], labels=y[:, 1:])
        # never feeds the start token to the decoder and never trains the
        # prediction of the first target token, which does not match what
        # generate() does at inference time.
        outputs = model(input_ids = ids, attention_mask = mask, labels = labels)
        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        n_batches += 1

    return loss_sum / n_batches if n_batches else None

In [25]:
def reformulate(tokenizer, model, device, loader):
    """Generate a reformulation for every example served by ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is decoded with beam search (3 beams, at most 128 tokens).

    Returns:
        A ``(predictions, actuals)`` pair of string lists: the decoded model
        outputs and the decoded reference rewrites, in loader order.
    """
    model.eval()
    decode_options = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    predictions, actuals = [], []

    with torch.no_grad():
        for batch in loader:
            target_ids = batch['rewrite_ids'].to(device, dtype = torch.long)
            input_ids = batch['context_ids'].to(device, dtype = torch.long)
            attention_mask = batch['context_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = input_ids,
                attention_mask = attention_mask,
                max_length=128,
                num_beams=3,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )

            # Decode the generated sequences first, then the references.
            for sequence in generated_ids:
                predictions.append(tokenizer.decode(sequence, **decode_options))
            for sequence in target_ids:
                actuals.append(tokenizer.decode(sequence, **decode_options))

    return predictions, actuals


In [26]:
# Use the GPU when one is available instead of crashing on CPU-only runtimes.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device == 'cuda':
    torch.cuda.empty_cache()

#wandb.init(project="pldac")
#wandb.run.name = "CQR"

# Model hyperparameters, kept by wandb
#config = wandb.config        

# pytorch random seed
torch.manual_seed(0)

# T5 tokenizer used to encode the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Train and test dataframes
train_df, test_df = collectData()

# Train and test datasets
training_ds = Dataset_2(train_df, tokenizer)
test_ds = Dataset_2(test_df, tokenizer)

# Train and test DataLoaders
training_dl = DataLoader(training_ds, batch_size = 8, shuffle = True, collate_fn = my_collate)
test_dl = DataLoader(test_ds, batch_size = 8, collate_fn = my_collate)

# T5-base model
model = T5ForConditionalGeneration.from_pretrained("t5-base",cache_dir="/tmp").to(device)

# Optimizer that updates the network weights during training.
# lr=1e-3 is Adam's default value, written out here so it is visible and tunable.
optimizer = torch.optim.Adam(params = model.parameters(), lr = 1e-3)

# Log metrics with wandb
# wandb.watch(model, log="all")

# Create the output directory before training starts: otherwise a missing
# directory only surfaces in to_csv, after a whole epoch has been spent.
outdir = os.path.join(path, 'REFORMULATION')
os.makedirs(outdir, exist_ok=True)

for epoch in range(4):
    # Training
    print(f'Entraînement époque {epoch}')
    train(tokenizer, model, device, training_dl, optimizer)
    print(f'Fin entraînement époque {epoch}')

    # Validation: generate the reformulations and save them as a tab-separated file
    print(f'Génération et sauvegarde des reformulations pour epoch {epoch}' )
    prediction, references = reformulate(tokenizer, model, device, test_dl)
    reformulation_df = pd.DataFrame({'Prediction':prediction,'Rewrite':references})
    reformulation_df.to_csv(os.path.join(outdir, f'reformulation_{epoch}.csv'),sep='\t')
    print(f'Reformulations pour époque {epoch} générées')

print("finito")

Entraînement époque 0


KeyboardInterrupt: ignored