## Fine-Tuning Transformers on SST-2
Reference tutorial: https://www.assemblyai.com/blog/fine-tuning-transformers-for-nlp/

In [1]:
#Libraries needed
import torch
from torch.utils.data import Dataset

#PyTorch dataset class
class  SST_Dataset(Dataset):
	#Name: 		__init__
	#Purpose: 	keep a reference to the underlying dataset so samples can be looked up by index
	#Inputs: 	dataset -> indexable collection of records exposing 'sentence' and 'label' fields
	#Outputs: 	none
	def __init__(self, dataset):
		self.dataset = dataset

	#Name: 		__len__
	#Purpose: 	report how many samples the wrapped dataset holds
	#Inputs: 	none
	#Outputs: 	length -> number of samples in the dataset
	def __len__(self):
		return len(self.dataset)

	#Name: 		__getitem__
	#Purpose: 	fetch the text segment stored at the given index together with its one-hot label
	#Inputs: 	idx -> index of the sample to load
	#Outputs: 	text -> text segment
	# 			label -> 2-element tensor holding a 1 at the position given by the rounded label
	def __getitem__(self, idx):
		#Look the record up once and read both fields from it
		record = self.dataset[idx]
		text = record['sentence']

		#round() maps the stored label to position 0 or 1 of the one-hot vector
		one_hot = torch.zeros(2)
		one_hot[round(record['label'])] = 1
		return text, one_hot

In [2]:
#Name: 		get_gpu
#Purpose: 	report whether CUDA is usable and hand back the matching device
#Input: 	none
#Output: 	GPU -> torch "cuda" device when CUDA is available, None otherwise
def  get_gpu():
	#Without CUDA there is no device object to hand back, so None signals "stay on the CPU"
	if not torch.cuda.is_available():
		print("No GPU device avaliable! Using CPU")
		return None

	print("Using GPU")
	return torch.device("cuda")

#Name: 		transfer_device
#Purpose: 	move a model / tensor to the GPU device when one is present
#Inputs: 	GPU -> GPU device if applicable, None if not
# 		 	data -> object to transfer (must provide .to(device) when a GPU is given)
#Output: 	data -> the transferred object, or the untouched input when GPU is None
def  transfer_device(GPU, data):
	#Identity test: "!= None" would dispatch to the object's __ne__, which is not guaranteed to return a plain bool
	if GPU is None:
		return data
	return data.to(GPU)

#Name: 		count_correct
#Purpose: 	tally how many rows of a batch have their largest prediction score at the same position as their largest target value
#Inputs: 	predictions -> model predictions, one row of scores per sample
#		 	targets -> target labels, one row per sample
#Outputs: 	correct -> number of rows where prediction and target agree
def  count_correct(predictions, targets):
	#Position of the largest value in one row (the first position wins on ties)
	def top_index(scores):
		values = list(scores)
		return values.index(max(values))

	#Walk the batch by position and count the rows whose predicted and target positions match
	return sum(
		1
		for row in range(len(predictions))
		if top_index(predictions[row]) == top_index(targets[row])
	)

In [3]:
#Name: 		binary_cross_entropy
#Purpose: 	element-wise binary cross entropy between predicted probabilities and target labels, averaged over every element
#Inputs: 	predictions -> model predictions as a tensor of probabilities in [0, 1]
# 			targets -> target labels, broadcastable against predictions
#Outputs: 	loss -> scalar tensor holding the mean loss value
def  binary_cross_entropy(predictions, targets):
	#torch.log would promote integer inputs implicitly; do it up front so the dtype limits below can be looked up
	if not predictions.is_floating_point():
		predictions = predictions.float()

	#A probability of exactly 0 or 1 makes log() return -inf, which turns the loss into NaN (0 * -inf) or inf
	#and poisons the gradients. Flooring the log arguments at the smallest normal value of the dtype keeps
	#everything finite while leaving ordinary probabilities untouched.
	floor = torch.finfo(predictions.dtype).tiny
	log_positive = torch.log(predictions.clamp(min=floor))
	log_negative = torch.log((1 - predictions).clamp(min=floor))

	loss = -(targets * log_positive + (1 - targets) * log_negative)
	return torch.mean(loss)

In [4]:
import torch.nn.functional as F 

#Name: 		train_model
#Purpose: 	evaluate the model once before training, then alternate one training pass and one evaluation pass per epoch while printing the metrics
#Inputs: 	GPU -> GPU device to train / evaluate on
# 			train_dataloader -> training set dataloader
# 			dev_dataloader -> development set dataloader
# 			tokenizer -> text tokenizer for model
# 			model -> model to train / evaluate
# 			optimizer -> optimizer to use to update model parameters
# 			criterion -> criterion to use to compute loss values
# 			epochs -> number of passes over the training set (optional, defaults to 3)
#Outputs: 	model -> model after training
def  train_model(GPU, train_dataloader, dev_dataloader, tokenizer, model, optimizer, criterion, epochs=3):
	#train / evaluate leave a "\r"-terminated progress line behind. Overwriting it with a full row of blanks
	#(instead of a single space) keeps its tail from bleeding into the stats printed next.
	blank_line = " " * 100

	#Evaluate the performance of the model before training
	valid_loss, valid_accuracy = evaluate(GPU, dev_dataloader, tokenizer, model, criterion)
	print(blank_line, end="\r")
	print("Pre-training validation loss: "+str(valid_loss)+" --- Accuracy: "+str(valid_accuracy))
	print()

	#Train the model for the requested number of epochs, evaluating after each one
	for epoch in range(epochs):
		model, train_loss, train_accuracy = train(GPU, train_dataloader, tokenizer, model, optimizer, criterion)
		valid_loss, valid_accuracy = evaluate(GPU, dev_dataloader, tokenizer, model, criterion)

		#Print performance stats (epoch numbers are shown 1-based)
		print(blank_line, end="\r")
		print("Epoch: "+str(epoch+1))
		print("Training loss: "+str(train_loss)+" --- Accuracy: "+str(train_accuracy))
		print("Validation loss: "+str(valid_loss)+" --- Accuracy: "+str(valid_accuracy))
		print()
	return model

In [5]:
#Name: 		train
#Purpose: 	train the model over 1 epoch
#Inputs: 	GPU -> GPU device to train on
# 			dataloader -> dataloader yielding (texts, labels) batches
# 			tokenizer -> text tokenizer for model
# 			model -> model to train
# 			optimizer -> optimizer to use to update model parameters
# 			criterion -> criterion to use to compute loss values
#Outputs: 	model -> model after training over the epoch
# 			average_loss -> mean of the per-batch losses over the epoch
# 			accuracy -> correct predictions divided by the number of samples in the dataloader's dataset
def  train(GPU, dataloader, tokenizer, model, optimizer, criterion):
	#Place the network in training mode and reset the running totals
	model.train()
	total_loss = 0
	total_correct = 0
	batch_count = len(dataloader)

	#Loop through all batches in the dataloader
	for batch_number, (texts, labels) in enumerate(dataloader):
		#Tokenize the text segments (padded to the longest segment in the batch) and move the inputs to the device
		tokenized_segments = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
		input_ids = transfer_device(GPU, tokenized_segments.input_ids)
		attention_mask = transfer_device(GPU, tokenized_segments.attention_mask)

		#Turn the logits into per-class probabilities, compute the loss, and add it to the total loss
		logits = model(input_ids=input_ids, attention_mask=attention_mask)['logits']
		model_predictions = F.softmax(logits, dim=1)
		loss = criterion(model_predictions, transfer_device(GPU, labels))
		total_loss += loss.item()

		#Count the number of correct predictions by the model in the batch and add this to the total correct
		correct = count_correct(model_predictions.cpu().detach().numpy(), labels.numpy())
		total_correct += correct

		#Zero the optimizer, compute the gradients, and update the model parameters
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		#Report the number of batches finished so far, so the counter ends at batch_count / 100%
		completed = batch_number + 1
		print("Training batch index: "+str(completed)+"/"+str(batch_count)+" ( "+str(completed/batch_count*100)+"% )", end='\r')

	#Compute the average loss and accuracy across the epoch
	average_loss = total_loss / batch_count
	accuracy = total_correct / len(dataloader.dataset)
	return model, average_loss, accuracy

In [6]:
#Name: 		evaluate
#Purpose: 	evaluate the model over 1 epoch without updating its parameters
#Inputs: 	GPU -> GPU device to evaluate on
# 			dataloader -> dataloader yielding (texts, labels) batches
# 			tokenizer -> text tokenizer for model
# 			model -> model to evaluate
# 			criterion -> criterion to use to compute loss values
#Outputs: 	average_loss -> mean of the per-batch losses over the epoch
# 			accuracy -> correct predictions divided by the number of samples in the dataloader's dataset
def  evaluate(GPU, dataloader, tokenizer, model, criterion):
	#Place the network in evaluation mode and reset the running totals
	model.eval()
	total_loss = 0
	total_correct = 0
	batch_count = len(dataloader)

	#Nothing is back-propagated here, so disable gradient tracking to avoid building an autograd graph per batch
	with torch.no_grad():
		#Loop through all batches in the dataloader
		for batch_number, (texts, labels) in enumerate(dataloader):
			#Tokenize the text segments (padded to the longest segment in the batch) and move the inputs to the device
			tokenized_segments = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
			input_ids = transfer_device(GPU, tokenized_segments.input_ids)
			attention_mask = transfer_device(GPU, tokenized_segments.attention_mask)

			#Turn the logits into per-class probabilities, compute the loss, and add it to the total loss
			logits = model(input_ids=input_ids, attention_mask=attention_mask)['logits']
			model_predictions = F.softmax(logits, dim=1)
			loss = criterion(model_predictions, transfer_device(GPU, labels))
			total_loss += loss.item()

			#Count the number of correct predictions by the model in the batch and add this to the total correct
			correct = count_correct(model_predictions.cpu().detach().numpy(), labels.numpy())
			total_correct += correct

			#Report the number of batches finished so far, so the counter ends at batch_count / 100%
			completed = batch_number + 1
			print("Evaluation batch index: "+str(completed)+"/"+str(batch_count)+" ( "+str(completed/batch_count*100)+"% )", end='\r')

	#Compute the average loss and accuracy across the epoch
	average_loss = total_loss / batch_count
	accuracy = total_correct / len(dataloader.dataset)
	return average_loss, accuracy

In [7]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

#Data: pick the device, load the "sst" dataset ("default" configuration), and wrap the train / validation splits in PyTorch datasets and dataloaders
#SST_Dataset rounds each record's label into a 2-element one-hot vector, so the loaders yield (texts, one-hot labels) batches of 32
#Only the training loader is shuffled; the validation loader keeps a fixed order
GPU  = get_gpu()
sst2_dataset = load_dataset("sst", "default")
train_dataset = SST_Dataset(sst2_dataset['train'])
valid_dataset = SST_Dataset(sst2_dataset['validation'])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=4)
  
#Model: DistilBERT tokenizer and sequence classifier from the 'distilbert-base-uncased' checkpoint; the model is moved to the GPU when one was found
#The criterion is the hand-written binary_cross_entropy function defined in this notebook
#NOTE(review): AdamW comes from transformers here; it is reportedly deprecated upstream in favor of torch.optim.AdamW - confirm against the installed version
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = transfer_device(GPU, DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased'))
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = binary_cross_entropy
  
#Train and save: train_model returns the trained model; the checkpoint bundles the tokenizer object with the model's state_dict
#The file name is the first 10 characters of str(model) plus ".pt" (presumably the start of the model's class name - verify the resulting name)
model = train_model(GPU, train_dataloader, valid_dataloader, tokenizer, model, optimizer, criterion)
smodel = str(model)
torch.save({'tokenizer': tokenizer, 'model_state_dict': model.state_dict()}, smodel[:10]+".pt")

2022-11-16 13:09:34.209281: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-16 13:09:37.916037: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /share/apps/rc/software/CUDA/9.2.88-GCC-7.3.0-2.30/extras/CUPTI/lib64:/share/apps/rc/software/CUDA/9.2.88-GCC-7.3.0-2.30/lib64:/share/apps/rc/software/binutils/2.30-GCCcore-7.3.0/lib:/share/apps/rc/software/GCCcore/7.3.0/lib64:/share/apps/rc/software/GCCcore/7.3.0/lib:/cm/shared/apps/cuda92/toolkit/9.2.88/extras/CUPTI/lib64:/cm/local/apps/cuda/libs/current/lib64:/cm/shared/apps/cuda92/toolkit/9.2.88/targets

Using GPU


Found cached dataset sst (/home/rcorkil2/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

  cpuset_checked))
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.b

Pre-training validation loss: 0.6963417189461845 --- Accuracy: 0.49227974568574023

Epoch: 1on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.44068055050203414 --- Accuracy: 0.7954119850187266
Validation loss: 0.35872606933116913 --- Accuracy: 0.8356039963669392

Epoch: 2on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.27489311289921237 --- Accuracy: 0.8934925093632958
Validation loss: 0.3748814071927752 --- Accuracy: 0.8501362397820164

Epoch: 3on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.1621928099500999 --- Accuracy: 0.9400749063670412
Validation loss: 0.4955283475773675 --- Accuracy: 0.8501362397820164



In [8]:
from transformers import BertTokenizer, BertForSequenceClassification

#Data: same pipeline as the DistilBERT cell - pick the device, load the "sst" dataset ("default" configuration), and rebuild the datasets and dataloaders
#This cell relies on load_dataset, DataLoader and AdamW having been imported by an earlier cell, and it rebinds GPU, the dataloaders, tokenizer, model, optimizer and criterion
#Only the training loader is shuffled; both loaders use batches of 32
GPU  = get_gpu()
sst2_dataset = load_dataset("sst", "default")
train_dataset = SST_Dataset(sst2_dataset['train'])
valid_dataset = SST_Dataset(sst2_dataset['validation'])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=4)
  
#Model: BERT tokenizer and sequence classifier from the 'bert-large-uncased' checkpoint; the model is moved to the GPU when one was found
#The criterion is the hand-written binary_cross_entropy function defined in this notebook
#NOTE(review): AdamW comes from transformers; it is reportedly deprecated upstream in favor of torch.optim.AdamW - confirm against the installed version
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = transfer_device(GPU, BertForSequenceClassification.from_pretrained('bert-large-uncased'))
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = binary_cross_entropy
  
#Train and save: train_model returns the trained model; the checkpoint bundles the tokenizer object with the model's state_dict
#The file name is the first 10 characters of str(model) plus ".pt" (presumably the start of the model's class name - verify the resulting name)
model = train_model(GPU, train_dataloader, valid_dataloader, tokenizer, model, optimizer, criterion)
smodel = str(model)
torch.save({'tokenizer': tokenizer, 'model_state_dict': model.state_dict()}, smodel[:10]+".pt")

Using GPU


Found cached dataset sst (/home/rcorkil2/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Pre-training validation loss: 0.8877063035964966 --- Accuracy: 0.4950045413260672

Epoch: 1on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.37751751239603376 --- Accuracy: 0.8383661048689138
Validation loss: 0.32351926096848077 --- Accuracy: 0.8810172570390554

Epoch: 2on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.2250990467739016 --- Accuracy: 0.9197097378277154
Validation loss: 0.31253974565437864 --- Accuracy: 0.8819255222524978

Epoch: 3on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.12709649103564605 --- Accuracy: 0.959503745318352
Validation loss: 0.3854422716157777 --- Accuracy: 0.8773841961852861



In [11]:
from transformers import DebertaConfig, DebertaModel, DebertaTokenizer, DebertaForSequenceClassification

#Data: same pipeline as the earlier training cells - pick the device, load the "sst" dataset ("default" configuration), and rebuild the datasets and dataloaders
#This cell relies on load_dataset, DataLoader and AdamW having been imported by an earlier cell, and it rebinds GPU, the dataloaders, tokenizer, model, optimizer and criterion
#Only the training loader is shuffled; both loaders use batches of 32
GPU  = get_gpu()
sst2_dataset = load_dataset("sst", "default")
train_dataset = SST_Dataset(sst2_dataset['train'])
valid_dataset = SST_Dataset(sst2_dataset['validation'])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=4)
  
#Model: DeBERTa tokenizer and sequence classifier from the "microsoft/deberta-base" checkpoint; the model is moved to the GPU when one was found
#The criterion is the hand-written binary_cross_entropy function defined in this notebook
#NOTE(review): AdamW comes from transformers; it is reportedly deprecated upstream in favor of torch.optim.AdamW - confirm against the installed version
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
model = transfer_device(GPU, DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base"))
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = binary_cross_entropy
  
#Train and save: train_model returns the trained model; the checkpoint bundles the tokenizer object with the model's state_dict
#The file name is the first 10 characters of str(model) plus ".pt" (presumably the start of the model's class name - verify the resulting name)
model = train_model(GPU, train_dataloader, valid_dataloader, tokenizer, model, optimizer, criterion)
smodel = str(model)
torch.save({'tokenizer': tokenizer, 'model_state_dict': model.state_dict()}, smodel[:10]+".pt")

Using GPU


Found cached dataset sst (/home/rcorkil2/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.weight', 'classi

Pre-training validation loss: 0.7031167081424168 --- Accuracy: 0.49318801089918257

Epoch: 1on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.4580510814538163 --- Accuracy: 0.7814840823970037
Validation loss: 0.2872488098485129 --- Accuracy: 0.8864668483197093

Epoch: 2on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.24913639965910145 --- Accuracy: 0.9039091760299626
Validation loss: 0.28509851566382816 --- Accuracy: 0.8910081743869209

Epoch: 3on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.15288463678587688 --- Accuracy: 0.9441713483146067
Validation loss: 0.3488712762083326 --- Accuracy: 0.8837420526793823

