transformer model

In [None]:
!pip  install transformers==4.22.1 -q

In [None]:
import transformers

In [None]:
# Print the version of the transformers package that was actually imported.
print(transformers.__version__)

In [5]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [6]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Return the mean per-sample overlap between true and predicted label sets.

    For every row, the positions of the non-zero entries in ``y_true`` and in
    ``y_pred`` are collected into two sets. The row score is the size of their
    intersection divided by the size of their union; a row in which both sets
    are empty scores 1.

    Args:
        y_true: 2-D array-like of ground-truth label indicators, one row per
            sample.
        y_pred: 2-D array-like of predicted label indicators, laid out like
            ``y_true``.
        normalize: Accepted for signature compatibility; not used.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        The mean of the per-row scores (``nan`` when there are no rows).
    """
    # Coerce to arrays so plain nested lists work as well as ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    acc_list = []
    for i, true_row in enumerate(y_true):
        set_true = set(np.where(true_row)[0])
        set_pred = set(np.where(y_pred[i])[0])
        if not set_true and not set_pred:
            # Nothing expected and nothing predicted counts as a perfect match.
            row_score = 1
        else:
            row_score = len(set_true & set_pred) / float(len(set_true | set_pred))
        acc_list.append(row_score)
    return np.mean(acc_list)

Importing and Pre-Processing the domain data

In [8]:
# Read the CSV file at the given path into the DataFrame `data`.
data = pd.read_csv('/content/Claim_train_data.csv')

In [None]:
# Show the first 5 rows of the dataset.
data.head()

Preparing the Dataset and Dataloader

In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
# MAX_LEN is the maximum encoded sequence length (it is handed to the tokenizer
# as `max_length`, so it counts tokens, not characters).
# The training and validation batch sizes are both 4,
# the number of epochs is 1 and the learning rate is 1e-05.
# The tokenizer is a DistilBertTokenizer loaded from the pretrained
# 'distilbert-base-uncased' checkpoint, with truncation and lower-casing requested.

In [21]:
class MultiLabelDataset(Dataset):
    """Dataset that tokenizes the ``description`` column and pairs it with ``claimType``.

    Args:
        dataframe: DataFrame exposing ``description`` and ``claimType`` columns.
        tokenizer: Object providing an ``encode_plus`` method that returns
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.description = dataframe.description
        self.targets = self.data.claimType
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.description)

    def __getitem__(self, index):
        """Return the encoded text and target tensor for the row at position ``index``.

        Returns:
            A dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Look rows up by position, not by index label: a DataLoader asks for
        # positions 0..len-1, which do not match the labels of a sampled or
        # filtered frame whose index was never reset.
        text = str(self.description.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated; request padding and truncation
            # explicitly so every sample has exactly `max_len` entries.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # NOTE(review): assumes each claimType entry is already a numeric
            # vector (not its string form from the CSV) - confirm upstream.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
# Sample 80% of the rows for training (fixed seed so the split is repeatable).
train_data = data.sample(frac=train_size, random_state=200)
# The test set is the rows that were NOT sampled, so no training row is
# evaluated. NOTE(review): relies on `data` having unique index labels.
test_data = data.drop(train_data.index).reset_index(drop=True)
# Renumber the training rows 0..n-1 so positional access by the dataset lines up.
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap both splits in MultiLabelDataset so they can be fed to a DataLoader.
training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

In [23]:
# Keyword arguments for the two DataLoaders: batch size, whether to shuffle,
# and the number of worker processes used for loading.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

# The ** syntax unpacks each dict into keyword arguments of DataLoader.
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

Creating the Neural Network for Fine Tuning

In [None]:
# Custom model: a pretrained DistilBERT encoder with a dense layer, dropout and
# a final dense layer on top that produce the output logits.

class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a classification head that emits 6 logits."""

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw logits for a batch.

        ``token_type_ids`` is accepted but is not passed to the encoder.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Take position 0 of the first encoder output as the sequence summary
        # (presumably the [CLS] slot, given special tokens are added upstream).
        summary = encoder_out[0][:, 0]
        activated = torch.tanh(self.pre_classifier(summary))
        return self.classifier(self.dropout(activated))

# Build the classifier and move it to `device`, which was chosen earlier
# ('cuda' when available, otherwise 'cpu').
model = DistilBERTClass()
model.to(device)
# DistilBERTClass subclasses torch.nn.Module. It loads a pretrained DistilBertModel
# and adds two linear layers and a dropout layer.
# Its forward method takes input_ids, attention_mask and token_type_ids and returns
# the output of the final linear layer (token_type_ids is not used by the encoder call).

Loss Function and Optimizer

In [25]:
def loss_fn(outputs, targets):
    """Return the binary cross-entropy-with-logits loss between ``outputs`` and ``targets``.

    ``outputs`` are raw logits; the loss uses the default settings of
    ``torch.nn.BCEWithLogitsLoss``.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [26]:
# Adam optimizer over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


Fine Tuning the Model

In [39]:
def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    For every batch the tensors are moved to ``device``, the model output and
    loss are computed, the loss is back-propagated and the optimizer takes a
    step. The loss is printed every 5000 steps.

    Args:
        epoch: Epoch number, used only in the loss printout.
    """
    model.train()
    # Wrap the loader itself (not the enumerate generator) so tqdm can read its
    # length and show a total and ETA.
    for step, batch in enumerate(tqdm(training_loader)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [None]:
# Call train() once per epoch, for EPOCHS epochs.
for epoch in range(EPOCHS):
    train(epoch)

Validating the Model

In [None]:
def validation(testing_loader):
    """Run ``model`` over ``testing_loader`` and collect predictions and targets.

    Args:
        testing_loader: Iterable of batches, each a dict with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` tensors.

    Returns:
        A ``(fin_outputs, fin_targets)`` tuple of lists with one entry per
        sample: the sigmoid of the model output, and the target values.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    # No gradients are needed for evaluation.
    with torch.no_grad():
        # Iterate the loader directly so tqdm can show a total; the batch index
        # was never used.
        for batch in tqdm(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            # Sigmoid maps the raw outputs into the (0, 1) range.
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
# Collect the sigmoid outputs and the targets for the evaluation loader.
outputs, targets = validation(testing_loader)

# Threshold at 0.5: each element becomes True when it is >= 0.5, else False.
final_outputs = np.greater_equal(np.array(outputs), 0.5)

In [None]:
# Hamming loss comes from metrics.hamming_loss (fraction of label positions
# predicted incorrectly).
# Hamming score comes from hamming_score(): the mean, over samples, of
# |true labels & predicted labels| / |true labels | predicted labels|.
targets_array = np.array(targets)
predictions_array = np.array(final_outputs)

val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(targets_array, predictions_array)

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Saving the Trained Model for inference

In [None]:
# Saving the files for inference

# Both destinations must be non-empty paths: an empty string cannot be opened
# for writing, so neither save call could succeed.
output_model_file = 'distilbert_claim_model.bin'
output_vocab_file = 'distilbert_claim_vocab.txt'

# torch.save pickles the whole model object to output_model_file.
torch.save(model, output_model_file)
# NOTE(review): when the argument is not an existing directory, the BERT-style
# save_vocabulary is expected to write the vocabulary to that path - confirm
# for the installed transformers version.
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')