In [3]:
import numpy as np
import pandas as pd



from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import torch
import transformers


import warnings
import time
warnings.filterwarnings('ignore')


In [6]:
# Directory that holds the SST-2 TSV files, relative to the notebook's working directory.
sst2_path = '../../data/SST-2'

# Hugging Face checkpoint name; used further down to load the pretrained DistilBERT model and tokenizer.
pre_trained_weights_type = 'distilbert-base-uncased'

In [7]:
# Read the tab-separated SST-2 training split into a DataFrame.
sst2_train = pd.read_csv(f"{sst2_path}/train.tsv", sep="\t")

In [8]:
# Show the frame's dimensions as (rows, columns).
sst2_train.shape

(67349, 2)

In [9]:
# Preview the first ten rows of the training frame.
sst2_train.head(10)

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0
5,that 's far too tragic to merit such superfici...,0
6,demonstrates that the director of such hollywo...,1
7,of saucy,1
8,a depressed fifteen-year-old 's suicidal poetry,0
9,are more deeply thought through than in most `...,1


In [10]:
# Class balance: number of training rows for each value of the binary `label` column.
sst2_train['label'].value_counts()

1    37569
0    29780
Name: label, dtype: int64

### Import the model class, tokenizer class, and pretrained weights for fine-tuning

In [11]:
from transformers import DistilBertModel , DistilBertTokenizer

In [12]:
# Alias the configured checkpoint name so the model and the tokenizer
# below are guaranteed to come from the same checkpoint.
pre_trained_weights = pre_trained_weights_type

# Base DistilBERT encoder (no classification head).
distilbert_model = DistilBertModel.from_pretrained(pre_trained_weights)

# Tokenizer belonging to that checkpoint.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(pre_trained_weights)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

### Reference: Hugging Face fine-tuning guide (https://huggingface.co/docs/transformers/training)

In [91]:
#Libraries needed
import torch
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [92]:
# Pick the compute device once: `GPU` is a CUDA torch.device when one is available, otherwise None.
# NOTE(review): get_gpu() is defined in a cell that appears further down this notebook, so this
# line only works if that cell has already been executed - move the definition above this cell.
GPU  = get_gpu()

def get_sst_examples(input_file, test=False, drop_prob=0.6):
    '''
    Read an SST-2 style TSV file into lists of (text, label) string pairs.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line must end with a tab followed by the label; everything
    before that last tab is the sentence.

    Parameters
    ----------
    input_file : str
        Path to the TSV file.
    test : bool
        When True every row is kept and returned in the second list.
        When False rows are randomly sub-sampled and returned in the first list.
    drop_prob : float
        Probability of discarding each row when ``test`` is False. The default
        of 0.6 keeps roughly 40% of the rows. Sampling uses NumPy's global
        random state, so seed it beforehand for a reproducible subset.

    Returns
    -------
    (train_examples, test_examples) : tuple of two lists
        Only one of the two lists is populated, depending on ``test``.
    '''
    train_examples = []
    test_examples = []

    # The encoding is explicit because the corpus contains non-ASCII characters
    # and the platform default is not guaranteed to be UTF-8.
    # The `with` block closes the file, so no manual close() is needed.
    with open(input_file, 'r', encoding='utf-8') as f:
        file_as_list = f.read().splitlines()

    for line in file_as_list[1:]:
        # Tolerate blank lines (e.g. a trailing one) instead of failing on the unpack below.
        if not line.strip():
            continue

        # Randomly drop a fraction `drop_prob` of the training rows.
        if not test and np.random.binomial(1, drop_prob, 1)[0] == 1:
            continue

        # Split on the LAST tab so a tab inside the sentence cannot break the unpack.
        text, label = line.rsplit("\t", 1)
        if test:
            test_examples.append((text, label))
        else:
            train_examples.append((text, label))

    return train_examples, test_examples





Using GPU


In [93]:
# Training examples: get_sst_examples randomly drops rows when test=False, so this is a sub-sample.
labeled_examples, _ = get_sst_examples('./../../data/SST-2/train.tsv')
# Dev split: read with test=True, so every row is kept and returned in the second slot.
_, test_examples = get_sst_examples('./../../data/SST-2/dev.tsv', test=True)

In [94]:
# Peek at one example: a (sentence, label) tuple whose label is still a string.
labeled_examples[0]

('remains utterly satisfied to remain the same throughout ', '0')

In [102]:
def generate_data_loader(input_examples, label_map, batch_size=64, do_shuffle=False,
                         balance_label_examples=False, tokenizer=None, max_length=64):
    '''
    Tokenize (text, label) pairs and wrap them in a PyTorch DataLoader.

    Every batch yielded by the returned loader is a 3-tuple of tensors:
    ``(input_ids, attention_mask, label_ids)``.

    Parameters
    ----------
    input_examples : iterable
        Items whose element 0 is the sentence and element 1 is the raw label.
    label_map : dict
        Maps each raw label to its integer class id. A label that is missing
        from the map raises KeyError.
    batch_size : int
        Number of examples per batch.
    do_shuffle : bool
        True selects a RandomSampler, False a SequentialSampler.
    balance_label_examples : bool
        Currently unused; kept so existing callers keep working.
    tokenizer : object, optional
        Anything exposing ``encode(text, add_special_tokens=..., max_length=...,
        padding=..., truncation=...)`` that returns a list of token ids. When
        omitted, the notebook-level variable named ``tokenizer`` is used.
    max_length : int
        Every sentence is padded / truncated to this many token ids.

    Returns
    -------
    torch.utils.data.DataLoader
    '''
    if tokenizer is None:
        # Backward compatibility: this function used to read the notebook-global
        # `tokenizer` implicitly. Look it up explicitly and fail with a clear message.
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise NameError(
                "generate_data_loader needs a tokenizer: pass tokenizer=... "
                "or define a global `tokenizer` first"
            )

    #-----------------------------------------------
    # Generate input examples to the Transformer
    #-----------------------------------------------
    input_ids = []
    label_id_array = []

    for example in input_examples:
        # Each sentence becomes a fixed-length list of vocabulary ids
        # (token ids, not embeddings).
        encoded_sent = tokenizer.encode(
            example[0],
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        input_ids.append(encoded_sent)
        label_id_array.append(label_map[example[1]])

    # Conversion to tensors
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    label_id_array = torch.tensor(label_id_array, dtype=torch.long)

    # Attention mask: 1 for real tokens, 0 for padding. Use the tokenizer's own
    # pad id when it exposes one; fall back to 0 otherwise.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0
    input_mask_array = (input_ids != pad_id).long()

    # Building the TensorDataset
    dataset = TensorDataset(input_ids, input_mask_array, label_id_array)

    sampler_cls = RandomSampler if do_shuffle else SequentialSampler

    # Building the DataLoader
    return DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=batch_size)


def format_time(elapsed):
    '''
    Convert a duration in seconds into an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first. Durations of a day
    or more are rendered by ``datetime.timedelta`` with a leading day count,
    e.g. ``'1 day, 1:00:00'``.
    '''
    # Imported here because `datetime` is not among the notebook's visible
    # imports; without this the function raises NameError on first call.
    import datetime

    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [103]:
# Map the string labels read from the TSV files to integer class ids.
label_map = {'0': 0, '1': 1}
# Alias only: train_examples refers to the same list object as labeled_examples.
train_examples = labeled_examples

In [104]:
# Shuffled training loader with 64 examples per batch.
# NOTE(review): generate_data_loader uses the notebook-global `tokenizer`, which is only
# assigned in a cell further down - confirm that cell has been run before this one.
train_dataloader = generate_data_loader(train_examples, label_map,batch_size =64, do_shuffle = True)

In [105]:

# Dev-set loader: default batch size (64) and sequential (unshuffled) order.
test_dataloader = generate_data_loader(test_examples, label_map)

In [15]:
#Name: 		get_gpu
#Purpose: 	report whether CUDA is usable and hand back the matching device
#Input: 	none
#Output: 	torch.device("cuda") when a GPU is available, otherwise None
def get_gpu():
    #No CUDA device: tell the user and signal "CPU" with None
    if not torch.cuda.is_available():
        print("No GPU device avaliable! Using CPU")
        return None

    #CUDA device found
    print("Using GPU")
    return torch.device("cuda")

#Name: 		transfer_device
#Purpose: 	move a model / tensor onto the GPU device when one is present
#Inputs: 	GPU -> device returned by get_gpu(), or None when running on CPU
# 		 	data -> any object exposing .to(device)
#Output: 	data -> result of data.to(GPU), or the input object unchanged when GPU is None
def transfer_device(GPU, data):
    #Identity check rather than `!= None`: it cannot be affected by a custom __ne__/__eq__
    if GPU is None:
        return data
    return data.to(GPU)

#Name: 		count_correct
#Purpose: 	count the number of correct model predictions in a batch
#Inputs: 	predictions -> 2-D array-like of per-class scores, one row per example
#		 	targets -> either 2-D one-hot / score rows (same layout as predictions)
#		 	           or a 1-D array-like of integer class ids
#Outputs: 	correct -> number of rows whose highest-scoring class matches the target (Python int)
def count_correct(predictions, targets):
    predictions = np.asarray(predictions)
    targets = np.asarray(targets)

    #An empty batch has nothing to count (and argmax over axis 1 may not exist)
    if len(predictions) == 0:
        return 0

    #Index of the highest score per row; ties resolve to the first maximum
    predicted_classes = np.argmax(predictions, axis=1)

    #1-D targets are already class ids; 2-D targets are reduced the same way as predictions
    if targets.ndim == 1:
        target_classes = targets
    else:
        target_classes = np.argmax(targets, axis=1)

    return int(np.sum(predicted_classes == target_classes))

In [17]:
# Smoke-test the helper; the returned device is only displayed here, not stored.
get_gpu()

Using GPU


device(type='cuda')

In [18]:
def binary_cross_entropy(predictions, targets, eps=1e-7):
    '''
    Mean element-wise binary cross-entropy between probabilities and targets.

    Parameters
    ----------
    predictions : torch.Tensor
        Probabilities (values expected in [0, 1]).
    targets : torch.Tensor
        Target values, broadcastable against ``predictions``.
    eps : float
        Probabilities are clamped to ``[eps, 1 - eps]`` before the logarithm.
        Without this, a prediction of exactly 0 or 1 yields ``log(0) = -inf``
        and ``0 * -inf = nan``, which poisons the loss and its gradients.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the mean loss.
    '''
    predictions = torch.clamp(predictions, min=eps, max=1 - eps)
    loss = -(targets * torch.log(predictions) + (1 - targets) * torch.log(1 - predictions))
    return torch.mean(loss)

In [30]:
import torch.nn.functional as F 

#Name: 		train_model
#Purpose: 	run the full fine-tuning loop, reporting metrics before training and after every epoch
#Inputs: 	GPU -> GPU device to train / evaluate on (None for CPU)
# 			train_dataloader -> training set dataloader
# 			dev_dataloader -> development set dataloader
# 			tokenizer -> text tokenizer for model
# 			model -> model to train / evaluate
# 			optimizer -> optimizer to use to update model parameters
# 			criterion -> criterion to use to compute loss values
# 			epochs -> number of passes over the training set
#Outputs: 	model -> model after training

def train_model(GPU, train_dataloader, dev_dataloader, tokenizer, model, optimizer, criterion, epochs):
    #Baseline: score the model on the development set before any update
    valid_loss, valid_accuracy = evaluate(GPU, dev_dataloader, tokenizer, model, criterion)
    print(f"Pre-training validation loss: {valid_loss} --- Accuracy: {valid_accuracy}")
    print()

    #One training pass followed by one evaluation pass per epoch
    for epoch_index in range(epochs):
        model, train_loss, train_accuracy = train(GPU, train_dataloader, tokenizer, model, optimizer, criterion)
        valid_loss, valid_accuracy = evaluate(GPU, dev_dataloader, tokenizer, model, criterion)

        #The carriage return moves the cursor back over the in-place batch progress line
        print(" ", end="\r")
        print(f"Epoch: {epoch_index + 1}")
        print(f"Training loss: {train_loss} --- Accuracy: {train_accuracy}")
        print(f"Validation loss: {valid_loss} --- Accuracy: {valid_accuracy}")
        print()
    return model

In [31]:
def train(GPU, dataloader, tokenizer, model, optimizer, criterion):
    '''
    Run one training epoch over ``dataloader`` and update ``model`` in place.

    Two batch layouts are accepted:
      * ``(texts, labels)`` - raw sentences, tokenized here with ``tokenizer``
      * ``(input_ids, attention_mask, labels)`` - already tokenized tensors,
        the layout produced by generate_data_loader

    Labels that arrive as a 1-D tensor of class ids are expanded to one-hot
    rows so they line up with the softmax output handed to ``criterion``.

    Returns
    -------
    (model, average_loss, accuracy)
        The trained model, the mean per-batch loss, and
        correct predictions / dataset size.
    '''
    #Place the network in training mode and reset the running totals
    model.train()
    total_loss = 0
    total_correct = 0
    num_batches = len(dataloader)

    for batch_number, batch in enumerate(dataloader):
        if len(batch) == 3:
            #Pre-tokenized batch: nothing to tokenize
            input_ids, attention_mask, labels = batch
        else:
            #Raw text batch: tokenize on the fly
            texts, labels = batch
            encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
            input_ids, attention_mask = encoded.input_ids, encoded.attention_mask

        logits = model(
            input_ids=transfer_device(GPU, input_ids),
            attention_mask=transfer_device(GPU, attention_mask),
        )['logits']
        model_predictions = F.softmax(logits, dim=1)

        #Class-id labels -> one-hot rows with the same shape and dtype as the predictions
        if labels.dim() == 1:
            labels = F.one_hot(labels.long(), num_classes=model_predictions.shape[1]).to(model_predictions.dtype)

        loss = criterion(model_predictions, transfer_device(GPU, labels))
        total_loss += loss.item()

        #Count the number of correct predictions in the batch (labels are still on the CPU)
        total_correct += count_correct(model_predictions.detach().cpu().numpy(), labels.numpy())

        #Zero the optimizer, compute the gradients, and update the model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print("Training batch index: "+str(batch_number)+"/"+str(num_batches)+  " ( "+str(batch_number/num_batches*100)+"% )", end='\r')

    #Average loss per batch and accuracy per example across the epoch
    average_loss = total_loss / num_batches
    accuracy = total_correct / len(dataloader.dataset)
    return model, average_loss, accuracy

In [32]:
def evaluate(GPU, dataloader, tokenizer, model, criterion):
    '''
    Score ``model`` on every batch of ``dataloader`` without updating it.

    Two batch layouts are accepted:
      * ``(texts, labels)`` - raw sentences, tokenized here with ``tokenizer``
      * ``(input_ids, attention_mask, labels)`` - already tokenized tensors,
        the layout produced by generate_data_loader

    Labels that arrive as a 1-D tensor of class ids are expanded to one-hot
    rows so they line up with the softmax output handed to ``criterion``.

    Returns
    -------
    (average_loss, accuracy)
        The mean per-batch loss, and correct predictions / dataset size.
    '''
    #Place the network in evaluation mode and reset the running totals
    model.eval()
    total_loss = 0
    total_correct = 0
    num_batches = len(dataloader)

    #No gradients are needed for evaluation; skipping them saves memory and time
    with torch.no_grad():
        for batch_number, batch in enumerate(dataloader):
            #Every value used below comes from the current batch. The previous body
            #unpacked three tensors but then read `texts` / `labels`, which only
            #resolved to stale notebook-level variables (or raised NameError).
            if len(batch) == 3:
                input_ids, attention_mask, labels = batch
            else:
                texts, labels = batch
                encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
                input_ids, attention_mask = encoded.input_ids, encoded.attention_mask

            logits = model(
                input_ids=transfer_device(GPU, input_ids),
                attention_mask=transfer_device(GPU, attention_mask),
            )['logits']
            model_predictions = F.softmax(logits, dim=1)

            #Class-id labels -> one-hot rows with the same shape and dtype as the predictions
            if labels.dim() == 1:
                labels = F.one_hot(labels.long(), num_classes=model_predictions.shape[1]).to(model_predictions.dtype)

            loss = criterion(model_predictions, transfer_device(GPU, labels))
            total_loss += loss.item()

            #Count the number of correct predictions in the batch (labels are still on the CPU)
            total_correct += count_correct(model_predictions.detach().cpu().numpy(), labels.numpy())
            print("Evaluation batch index: "+str(batch_number)+"/"+str(num_batches)+  " ( "+str(batch_number/num_batches*100)+"% )", end='\r')

    #Average loss per batch and accuracy per example across the pass
    average_loss = total_loss / num_batches
    accuracy = total_correct / len(dataloader.dataset)
    return average_loss, accuracy

In [33]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [49]:
# NOTE(review): SST_Dataset is not defined in any cell visible in this notebook; this cell only
# echoes the name, so it relies on state left in the kernel - restore or import its definition.
SST_Dataset

type

In [47]:
# Batch both splits (32 per batch, 4 worker processes); only the training split is shuffled.
# NOTE(review): train_dataset / valid_dataset are not created in any visible cell - TODO restore
# the cell that builds them so the notebook survives Restart & Run All.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=4)


# Debug loop: prints the label tensor shape of every training batch.
# It also leaves `batch_number`, `texts` and `labels` bound at notebook level after the last iteration.
for batch_number, (texts, labels) in enumerate(train_dataloader): 
    print(labels.shape)

torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])
torch.Size([32, 2])


In [36]:
#Tokenizer and classifier are both loaded from the cased DistilBERT checkpoint
#NOTE(review): this differs from pre_trained_weights_type ('distilbert-base-uncased') set near the top - confirm which one is intended
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

#Load the sequence-classification model, then move it to the GPU when one was found
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
model = transfer_device(GPU, model)

#AdamW over all model parameters; the loss is the notebook's own binary_cross_entropy
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = binary_cross_entropy

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifi

In [38]:
#Fine-tune for 10 epochs; train_model prints loss / accuracy for both loaders after every epoch.
#This cell only trains - saving the checkpoint happens in a separate cell further down.
model = train_model(GPU, train_dataloader, valid_dataloader, tokenizer, model, optimizer, criterion,epochs =10 )


Pre-training validation loss: 0.6919103366988045 --- Accuracy: 0.5068119891008175

Epoch: 1on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.4591036279884617 --- Accuracy: 0.78125
Validation loss: 0.3870589298861367 --- Accuracy: 0.8256130790190735

Epoch: 2on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.2944409636857358 --- Accuracy: 0.8788623595505618
Validation loss: 0.3795064274753843 --- Accuracy: 0.8374205267938238

Epoch: 3on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.16472508001388905 --- Accuracy: 0.9419475655430711
Validation loss: 0.5044310918876103 --- Accuracy: 0.8292461398728429

Epoch: 4on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.08229120714896906 --- Accuracy: 0.9736657303370787
Validation loss: 0.5399618868316923 --- Accuracy: 0.8346957311534968

Epoch: 5on batch index: 34/35 ( 97.14285714285714% ))
Training loss: 0.05374773938125104 --- Accuracy: 0.9836142322097379
Validation loss: 0.6238758955683027 --- 

In [None]:
# Persist the fine-tuned weights together with the tokenizer that produced the inputs.
# torch.save needs a path, and `model` is a module object, so concatenating it with
# ".pt" raises TypeError; the file name is derived from the model's class name instead.
# (A bare `return` is not allowed at cell level, so none is used here.)
# NOTE(review): this pickles the whole tokenizer object; loading it back requires a
# compatible transformers install - consider tokenizer.save_pretrained(...) instead.
checkpoint_path = type(model).__name__ + ".pt"
torch.save(
    {
        'tokenizer': tokenizer,
        'model_state_dict': model.state_dict(),
    },
    checkpoint_path,
)