
### [Data Source](https://github.com/rohan-paul/YT_Data_Sources/blob/main/Amazon-Review-Text-NLP/amazon-review/cleaned_reviews.csv)

-----------------------

### Key parts of this Fine-Tuning Strategy

1. Mean Pooling
2. Gradient Accumulation


In [1]:
!pip install --upgrade wandb

Collecting wandb
  Downloading wandb-0.17.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Downloading wandb-0.17.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.17.7
    Uninstalling wandb-0.17.7:
      Successfully uninstalled wandb-0.17.7
Successfully installed wandb-0.17.8


In [2]:
import os
import gc
import copy
import time
import random
import string
import joblib

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold

from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from transformers import DataCollatorWithPadding

from colorama import Fore, Back, Style

b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
import wandb

# Try to log in to Weights & Biases with a token stored in Kaggle Secrets
# under the label "wandb_api". When that is not possible (e.g. the notebook
# is not running on Kaggle, or the secret is missing) fall back to
# anony = "must".
# NOTE(review): `anony` is presumably passed to wandb.init(anonymous=...)
# later in the notebook -- confirm against the cell that starts the run.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except Exception:
    # `except Exception` (rather than a bare `except:`) keeps the best-effort
    # fallback but no longer swallows KeyboardInterrupt / SystemExit.
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


In [4]:
def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
    """Return a random identifier made of ``size`` characters from ``chars``.

    Parameters:
    size (int): Number of characters in the identifier (default 12).
    chars (str): Alphabet to draw from (default: lowercase ASCII + digits).

    Returns:
    str: The generated identifier; an empty string when ``size`` is 0.

    The characters are drawn with ``random.SystemRandom``, which reads from
    the operating system's entropy source and ignores ``random.seed``. The
    identifier therefore differs between runs even when seeds are fixed.
    """
    # Build the OS-backed generator once instead of once per character.
    rng = random.SystemRandom()
    return ''.join(rng.choice(chars) for _ in range(size))

HASH_NAME = id_generator(size=12)
print(HASH_NAME)

lhnt4h406hal


The function `id_generator` is designed to create a random string with a specified length (default is 12 characters) from a set of provided characters. The whole purpose is to generate a unique, random identifier.

`string.ascii_lowercase + string.digits` => all the lowercase ASCII characters (a-z) and digits (0-9)

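`random.SystemRandom()` creates a random number generator object (it is a class, not a method) that uses sources provided by the operating system (like /dev/urandom in Unix) to generate random numbers. It's a way of generating random numbers that's suitable for cryptographic use, as it's more secure than the standard random number generator in Python. Because it reads from the operating system, it is not affected by `random.seed()`, so the generated identifier is different on every run. Its `choice(chars)` method then selects a single character randomly from the set chars.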

`''.join(... for _ in range(size))` The generator expression inside the join function runs size number of times (which is 12 by default) and in each iteration, it calls random.SystemRandom().choice(chars) to select a character. These characters are then joined together into a single string using the join function, which connects them without any additional characters in between. The result is a randomly generated string of a specified length.

In [5]:
ROOT_PATH = '/kaggle/input/cleaned-review-datasetsentiments/Cleaned_Review.csv' # Kaggle input path of the reviews CSV; change this when running on a local machine

In [6]:
# Central settings dictionary shared by the cells below.
# Keys without a comment are not read anywhere in the visible part of the
# notebook; presumably they are consumed by later cells (data loaders,
# optimizer/scheduler setup, W&B run) -- confirm there before changing them.
CONFIG = {"seed": 2022,  # read by set_seeds()
          "epochs": 3,
          "model_name": "microsoft/deberta-v3-base",  # checkpoint used for the tokenizer below
          "train_batch_size": 8,
          "valid_batch_size": 16,
          "max_length": 512,
          "learning_rate": 1e-5,
          "scheduler": 'CosineAnnealingLR',
          "min_lr": 1e-6,
          "T_max": 500,
          "weight_decay": 1e-6,
          "n_fold": 3,  # number of GroupKFold splits
          "n_accumulate": 1,  # mini-batches accumulated per optimizer step in train_one_epoch()
          "num_classes": 3,  # output size of the classification head in TextModel
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),  # first GPU if available, otherwise CPU
          "hash_name": HASH_NAME,  # random run identifier generated above
          "competition": "amazon-reviews-dataset",
          "_wandb_kernel": "react",
          }

# The tokenizer is stored in CONFIG so the dataset and collator can share it.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
CONFIG['group'] = f'{HASH_NAME}-Baseline'

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [7]:
def set_seeds(config):
    '''Seed the random number generators used by this notebook.
    This is for REPRODUCIBILITY.

    Parameters:
    config (dict): Settings dictionary; only config['seed'] (int) is read.

    Seeds Python's `random`, NumPy and PyTorch (CPU and, when available,
    every CUDA device) and switches cuDNN to deterministic mode.
    '''
    seed = config['seed']
    # Python's own RNG was previously left unseeded even though `random`
    # is imported by the notebook.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        # Seed all visible GPUs, not only the current one.
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only affects child processes started afterwards, not this kernel's
    # string hashing.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Usage
set_seeds(CONFIG)

In [8]:
df = pd.read_csv(ROOT_PATH)
df.head()

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,sentiments,cleaned_review,cleaned_review_length,review_score
1,positive,i wish would have gotten one earlier love it a...,19,5.0
2,neutral,i ve learned this lesson again open the packag...,88,1.0
3,neutral,it is so slow and lags find better option,9,2.0
4,neutral,roller ball stopped working within months of m...,12,1.0


In [9]:
df = df.drop(index=0).reset_index(drop=True)

df.head()

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5.0
1,neutral,i ve learned this lesson again open the packag...,88,1.0
2,neutral,it is so slow and lags find better option,9,2.0
3,neutral,roller ball stopped working within months of m...,12,1.0
4,neutral,i like the color and size but it few days out ...,21,1.0


In [10]:
print('df.shape ', df.shape)
nan_count = df['cleaned_review'].isna().sum()

nan_count


df.shape  (17340, 4)


3

In [11]:
# df['cleaned_review'] = df['cleaned_review'].fillna(' ')
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
# Without resetting, you will get error something like - KeyError: '[2085] not in index' 
# which indicates that the index of the DataFrame df does not include the value 2085. 
# This might be happening because the DataFrame df has fewer rows than that or rows have been dropped and the index is not reset.

print('df.shape ', df.shape)

nan_count_after = df['cleaned_review'].isna().sum()
nan_count_after

df.shape  (17337, 4)


0

<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> GroupKFold </h3>
</div>

## Whats the speciality of GroupKFold and where is it used

GroupKFold is particularly useful when dealing with data that have a group structure, and data leakage between the train and test sets may occur if the group structure is not taken into account.

### Data leakage is when information from outside the training dataset is used to train the model. This can easily happen when there's a group structure in your data, and some data from one group ends up in the training set, and some in the test set. The model might then learn about specific groups, rather than generalizing well to unseen data.

For example:

**Patient data in healthcare:** Here, multiple data points might be collected from the same patient. These data points could share certain characteristics that can influence a model. If we have data from the same patient in both the training and testing set, our model might perform overly well because it has indirectly seen the test data during training.

**Sentences or documents in NLP:** When working with text data, it is common to split documents into sentences or parts, each part as an individual data point. Here, if we have some sentences from the same document in the training set and some in the test set, our model might perform better than expected because sentences in the same document are often related.

**Time-series data:** For time-series data, the same subject (like a place or a person) can have multiple measurements at different points in time. If data from the same subject ends up in both training and testing sets, it can lead to overly optimistic performance estimates.

In these cases, using GroupKFold can help get a more accurate measure of a model's performance by making sure all data points from one group end up in either the training set or the test set, but not both. The model is then forced to learn more general patterns that apply to unseen groups, rather than specific patterns within a group.

### Now, I want to add a new column named "kfold" to your dataframe, where each row gets the fold number it belongs to.

In [12]:
# Assign group ids to unique reviews
# We create a numerical group identifier, as this dataframe does not have a unique identifier for each row
# This identifier will be created by assigning unique ids to unique reviews.
# Identical review texts therefore share one group id and end up in the same fold.
df['group'] = df['cleaned_review'].factorize()[0]


# Initialize GroupKFold
gkf = GroupKFold(n_splits=CONFIG['n_fold'])

# Apply group k-fold
# split() yields positional row indices, while df.loc is label-based: this
# only lines up because the index was reset to 0..n-1 after dropna() above.
for fold, (_, val_index) in enumerate(gkf.split(X=df, groups=df['group'])):
    df.loc[val_index, "kfold"] = int(fold)
    
# The column is filled fold by fold, so it holds NaN (float dtype) until every
# row has been assigned; cast back to int once the loop is complete.
df["kfold"] = df["kfold"].astype(int)
df.head()

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score,group,kfold
0,positive,i wish would have gotten one earlier love it a...,19,5.0,0,2
1,neutral,i ve learned this lesson again open the packag...,88,1.0,1,1
2,neutral,it is so slow and lags find better option,9,2.0,2,0
3,neutral,roller ball stopped working within months of m...,12,1.0,3,2
4,neutral,i like the color and size but it few days out ...,21,1.0,4,1



## `pandas.factorize()`

pandas.factorize() function is used to identify distinct values and assign them a numerical identifier. This can be useful in scenarios where we want to convert categorical data into a numerical format

It returns two outputs: a label array and an array with unique values. In simple terms, it assigns a unique integer to each unique value in the array.

```py

s = pd.Series(['cat', 'dog', 'cat', 'dog', 'bird', 'bird', 'cat'])

labels, uniques = pd.factorize(s)

print("Labels:", labels) # Labels: [0 1 0 1 2 2 0]
print("Uniques:", uniques) # Uniques: ['cat', 'dog', 'bird']

```

By default, factorize() does not treat NaN (missing values) as a category of its own: missing values receive the sentinel code -1 and are left out of the uniques array. If the series contains NaN values, either handle them before calling factorize() (as done above with `dropna()`), or pass `use_na_sentinel=False` so that NaN gets its own non-negative code.

In our case above, `df['group'] = df['cleaned_review'].factorize()[0]`

- This is creating a new column called 'group' in the dataframe df, and assigning to it the numerical labels returned by factorize().

- So it creates a group identifier for each unique 'cleaned_review'. These group identifiers can then be used in the GroupKFold process to ensure that all entries from a particular unique review are either in the training set or the validation set, but not both.

===================================================



## Explanation of `for fold, (_, val_index) in enumerate(gkf.split(X=df, groups=df['group'])):`

gkf.split() returns a generator that produces indices that can be used to generate dataset splits.

The split() function takes three arguments, but in this case, we only provide two: X, which is the dataset to split, and groups, which is an array-like object that defines the groups within the data. 

**`gkf.split(X=df, groups=df['group'])` will return two lists of indices for each split**: the indices of the rows in the training set (which we're not using, hence the underscore _), and the indices of the rows in the validation set (val_index).

`enumerate()` is applied to the results from `gkf.split()`. It returns tuples where the first element is the count (starting from 0), and the second element is the value from the iterable. Here, the count corresponds to the fold number and is stored in fold.

===================================================



## Explanation of `df.loc[val_index, "kfold"] = int(fold)`

df.loc[val_index, "kfold"] selects rows and the "kfold" column in the DataFrame df. Here, val_index are the row indices for the validation set for the current fold, returned by gkf.split().

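`fold` comes from `enumerate()`, so it is already a Python integer; `int(fold)` is only a defensive cast. The value is assigned to the "kfold" column of the DataFrame at the rows specified by val_index. Because the column is filled one fold at a time, rows that have not been assigned yet hold NaN and the column is temporarily of float type, which is why `df["kfold"].astype(int)` is applied after the loop.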

So, essentially, this line is labeling the rows in the DataFrame that belong to the current fold's validation set with the fold number.

The significance of these lines is that they're assigning each row in the DataFrame to a fold number (0 through n_splits-1). These fold numbers can be used for K-Fold cross-validation, where in each iteration (or "fold"), a different subset of the data is held out for validation while the model is trained on the rest of the data. The grouping ensures that all samples with the same group value end up in either the training or the validation set, but not in both, preventing potential data leakage.

In [13]:
df.groupby('kfold')['sentiments'].value_counts()

kfold  sentiments
0      positive      3166
       neutral       2077
       negative       536
1      positive      3125
       neutral       2168
       negative       486
2      positive      3212
       neutral       2055
       negative       512
Name: count, dtype: int64

In [14]:
# Convert from categorical values ('positive', 'negative', 'neutral') to numerical values
# NOTE: this cell overwrites df['sentiments'] in place, so it is not idempotent --
# re-running it would fit the encoder on the already-encoded integers.
# NOTE(review): LabelEncoder orders classes by sorted value, so presumably
# negative -> 0, neutral -> 1, positive -> 2; verify with encoder.classes_.
encoder = LabelEncoder()
df['sentiments'] = encoder.fit_transform(df['sentiments'])

""" fit_transform() does two things: it first 'fits' the encoder by learning the mapping between classes and integer labels (using the fit method), and then it 'transforms' the input data into its encoded form (using the transform method).

So, fit_transform(df['sentiments']) is learning the mapping from 'positive', 'negative', and 'neutral' to integer labels, and then immediately applying this mapping to df['sentiments'] to produce a new numpy array of encoded labels.

So this above line, replaces the original 'sentiments' column in the dataframe with the newly encoded numpy array. Now, instead of 'positive', 'negative', and 'neutral', this column contains integer labels that represent these classes. """

# Save the fitted encoder to le.pkl so the same class <-> integer mapping can
# be reloaded later (e.g. to turn predicted integers back into label names).
with open("le.pkl", "wb") as fp:
    joblib.dump(encoder, fp)

In [15]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one review per requested item.

    Parameters:
    df (pandas.DataFrame): Must provide the columns 'cleaned_review' (text)
        and 'sentiments' (already-encoded label).
    tokenizer: Object exposing `sep_token` and `encode_plus(...)`.
    max_length (int): Truncation length handed to the tokenizer.

    Items are returned unpadded; padding to a common length is left to the
    collate function used by the DataLoader.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Keep plain arrays so __getitem__ can index by position.
        self.cleaned_review = df['cleaned_review'].values
        self.targets = df['sentiments'].values

    def __len__(self):
        return len(self.df)

    # DataLoader calls __getitem__ once per sample index when it assembles a
    # batch, so everything needed for one training example is produced here.
    def __getitem__(self, index):
        review = self.cleaned_review[index]
        # The tokenizer's separator token is *prepended* to the review text
        # (separated by one space) before tokenization.
        text = " ".join((self.tokenizer.sep_token, review))

        # Tokenize with truncation to self.max_len; the tokenizer also adds
        # its own special tokens around the sequence.
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )

        sample = dict(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            target=self.targets[index],
        )
        return sample

In [16]:
collate_fn = DataCollatorWithPadding(tokenizer=CONFIG['tokenizer'])

<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> Mean Pooling </h3>
</div>

## Theory behind `MeanPooling(nn.Module)` and its calculation in the context of Transformer NLP project

The process of converting a sequence of embeddings into a sentence embedding is called “pooling”. Intuitively, this entails compressing the granular token-level representations into a single fixed-length representation that is supposed to reflect the meaning of the entire sequence.

Simply put, The embeddings go through a pooling layer to get a single fixed-length embedding for all the text. For example, mean pooling averages the embeddings generated by the model.

![](2023-05-27-16-34-51.png)

The MeanPooling class applies mean pooling to the hidden states of a transformer model. **This operation is used to create a single vector representation for an entire input sequence**, which can be used for sequence classification tasks, such as sentiment analysis or spam detection.

## So, after applying MeanPooling, each sentence in the batch is represented by a single vector that is the average of the embeddings of its actual tokens. This can be fed into a classifier to predict, for example, the sentiment of the sentence.

In [17]:
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Turns per-token hidden states into one fixed-size vector per sequence so
    that it can be fed to a classification head.
    """

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the masked mean of `last_hidden_state`.

        Parameters:
        last_hidden_state (torch.Tensor): Token embeddings, expected as
            (batch_size, sequence_length, hidden_size).
        attention_mask (torch.Tensor): Expected as (batch_size, sequence_length),
            with 1 for real tokens and 0 for padding.

        Returns:
        torch.Tensor: (batch_size, hidden_size) sentence embeddings.
        """
        # Broadcast the mask over the hidden dimension and make it float so it
        # can be multiplied with the embeddings:
        # (batch, seq) -> (batch, seq, 1) -> (batch, seq, hidden).
        token_weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()

        # Padding positions are multiplied by 0, so only real tokens contribute
        # to the per-sequence sum of embeddings.
        masked_total = (last_hidden_state * token_weights).sum(dim=1)

        # Summing the mask along the sequence axis counts the real tokens of
        # each sequence (the count is repeated across the hidden dimension).
        # The lower bound of 1e-9 avoids a division by zero for a sequence
        # whose mask is all zeros.
        token_counts = token_weights.sum(dim=1).clamp(min=1e-9)

        # Mean embedding = sum of real-token embeddings / number of real tokens.
        return masked_total / token_counts

## `input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()`


last_hidden_state represents the output of the transformer model, which is a 3D tensor of shape **(batch_size, sequence_length, hidden_size)**. This tensor contains the embeddings of all tokens in all sequences of the batch.

The attention_mask is a tensor that represents the attention mask for the input sequence. It is a 2-dimensional tensor with a shape of (batch_size, sequence_length). The unsqueeze function is used to add an extra dimension to the tensor at the specified position (-1). This results in a tensor with a shape of (batch_size, sequence_length, 1).

Then `expand(last_hidden_state.size())` expands this tensor to match the size of the last_hidden_state tensor. The resulting tensor is thus of shape (batch_size, sequence_length, hidden_size).


float() is used to ensure that the expanded mask tensor is a floating point tensor, which is necessary for the upcoming multiplication operation

<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> Create Model </h3>
</div>

In [18]:
class TextModel(nn.Module):
    """Pretrained transformer backbone with mean pooling and a linear classifier.

    Parameters:
    model_name (str): Checkpoint name handed to `AutoModel.from_pretrained`
        and `AutoConfig.from_pretrained`.

    The size of the output layer is taken from the notebook-level
    CONFIG['num_classes'].
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.pooler = MeanPooling()
        # Classification head: pooled sentence vector -> one logit per class.
        hidden_dim = self.config.hidden_size
        self.fc = nn.Linear(hidden_dim, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return raw class logits for a batch of token ids and its attention mask."""
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Collapse the token-level states into one vector per sequence,
        # regularise with dropout, then project to the class logits.
        pooled = self.pooler(backbone_out.last_hidden_state, mask)
        logits = self.fc(self.drop(pooled))
        return logits

### Why we need `nn.Linear(self.config.hidden_size, CONFIG['num_classes'])`

The purpose of this line is to create the final layer of the neural network model that will map the transformer's output vectors (of size hidden_size) to logits for each of the classes in the classification problem (of size num_classes).

Each logit can be thought of as a raw prediction score for a class. To turn these logits into probabilities, they can be passed through a softmax function. The class with the highest probability can be chosen as the model's prediction.

This linear layer is crucial because the transformer model by itself outputs feature vectors that represent the input text in a high-dimensional space, but does not perform any task-specific classification. The linear layer takes these high-dimensional representations and maps them to a space that corresponds to the classes we're trying to predict.

In summary, this line of code is defining the "decision-making" part of the model, where the "knowledge" learned and encoded by the transformer model is used to make a final prediction for the task at hand.

<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> Loss Function </h3>
</div>

In [19]:
def criterion(outputs, labels):
    """Cross-entropy loss between raw logits `outputs` and class indices `labels`.

    Uses `nn.CrossEntropyLoss` with its default settings and returns the
    resulting loss tensor.
    """
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)
""" nn.CrossEntropyLoss()(outputs, labels) is creating an instance of the nn.CrossEntropyLoss class and
immediately calling it as a function with the outputs and labels as arguments. """

' nn.CrossEntropyLoss()(outputs, labels) is creating an instance of the nn.CrossEntropyLoss class and\nimmediately calling it as a function with the outputs and labels as arguments. '

<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> Training Function </h3>
</div>

In [20]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train `model` for one pass over `dataloader` with gradient accumulation.

    Parameters:
    model (torch.nn.Module): Model called as model(ids, mask).
    optimizer (torch.optim.Optimizer): Optimizer stepped every
        CONFIG['n_accumulate'] mini-batches.
    scheduler: Learning-rate scheduler stepped after each optimizer step, or None.
    dataloader: Yields dicts with 'input_ids', 'attention_mask' and 'target' tensors.
    device: Device the batch tensors are moved to.
    epoch (int): Epoch number, only shown in the progress bar.

    Returns:
    float: Sample-weighted mean training loss of the epoch (0.0 when the
    dataloader yields no batches).
    """
    model.train()

    n_accumulate = CONFIG['n_accumulate']
    num_batches = len(dataloader)

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not hit an unbound name.
    epoch_loss = 0.0

    # Start the epoch with clean gradients.
    optimizer.zero_grad()

    # `total` tells tqdm how many batches to expect for the progress bar.
    bar = tqdm(enumerate(dataloader), total=num_batches)
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        # Gradient accumulation: backward() adds to the .grad buffers, so the
        # loss is scaled by 1/n_accumulate to make the accumulated gradient an
        # average over the group of mini-batches. The unscaled `loss` is kept
        # for reporting.
        (loss / n_accumulate).backward()

        # Update the weights once per accumulation group. The final batch also
        # triggers an update so that a trailing partial group is applied and
        # its gradients do not leak into the next epoch (that last update is
        # still scaled by 1/n_accumulate).
        if (step + 1) % n_accumulate == 0 or (step + 1) == num_batches:
            optimizer.step()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        # Report the true (unscaled) loss; using the scaled loss here would
        # under-report it by a factor of n_accumulate.
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

### Explanation for `train_one_epoch`


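The dataloader object is responsible for serving batches of data. It also handles shuffling of the data and can prepare batches in parallel using worker processes. Moving a batch onto the device (e.g., a GPU) is not done by the dataloader; it happens explicitly inside the loop with `.to(device)`.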

When enumerate is applied to dataloader, it generates pairs (step, data), where step is the count of batches that have been loaded so far and data is the batch data at the current step.

The output of **`tqdm(enumerate(dataloader), total=len(dataloader))`** is not a typical data structure like a list, dictionary, or DataFrame, which you can directly inspect or print out. Instead, it's an iterator wrapped with tqdm progress bar functionality.

When you use this iterator in a loop, like:


```
for step, data in tqdm(enumerate(dataloader), total=len(dataloader)):    

```


In each iteration, it outputs a tuple, (step, data), where:

**step**: is the index of the current batch (starts from 0 and goes up to len(dataloader) - 1).

**data**: is the actual content of the batch. This is typically a dictionary where each key is a name of a field in your dataset and the value is a tensor of batched values for that field. For instance, in your script, 'input_ids', 'attention_mask', and 'target' are expected keys in the data.

-------------------------

### Gradient accumulation - Explanation of line `if (step + 1) % CONFIG['n_accumulate'] == 0:`

To understand gradient accumulation, let's first review the standard training procedure for deep learning models. In the standard approach, a mini-batch of training samples is fed into the model, and the gradients of the model parameters with respect to the loss function are computed using backpropagation. Then, these gradients are used to update the model's parameters using an optimization algorithm, such as stochastic gradient descent (SGD) or Adam.

In gradient accumulation, instead of updating the model's parameters after each mini-batch, we accumulate gradients over multiple mini-batches before performing the weight update step.

**The main steps involved in gradient accumulation are as follows:**

Initialize the gradients: Before starting the training loop, the gradients for all model parameters are initialized to zero.

Accumulate gradients: For each mini-batch, compute the gradients ( with `loss.backward()` ) of the model parameters with respect to the loss function using backpropagation. 

BUT Instead of updating the parameters right away, add the computed gradients to the accumulated gradients. This step is repeated for a specified number of mini-batches.

Weight update step: After accumulating gradients over the desired number of mini-batches, perform the weight update step. With `optimizer.step()` This involves updating the model's parameters using the accumulated gradients. The update can be done using any optimization algorithm, such as SGD or Adam.

Reset gradients: After the weight update step, reset the accumulated gradients to zero to prepare for the next iteration. With `optimizer.zero_grad()`

------------------------------------

## In the above  `train_one_epoch()` method in which line exactly we sum these gradients over several forward passes ?

Gradient accumulation over several forward passes is achieved through the following two lines in the train_one_epoch() function:

```py
loss = loss / CONFIG['n_accumulate']
loss.backward()

```

In the first line, the loss for the current mini-batch is divided by CONFIG['n_accumulate']. This effectively scales down the gradient that will be computed in the next step. This is necessary because later we are summing (or rather, accumulating) CONFIG['n_accumulate'] of these gradients.

#### The `backward()` call on the next line calculates the gradients of the loss with respect to model parameters. Importantly, these gradients are not removed after the computation, they remain stored in the .grad attributes of the model parameters.

#### The key point is that every time backward() is called, gradients are computed and then added to whatever is currently stored in the .grad attributes of the parameters. Therefore, if we call backward() on loss / CONFIG['n_accumulate'] for CONFIG['n_accumulate'] mini-batches, the gradients stored in the .grad attributes of the parameters end up being the sum of the scaled gradients, i.e. the average of the gradients of those mini-batches.


<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> Validation Function </h3>
</div>

In [21]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:        
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        batch_size = ids.size(0)

        outputs = model(ids, mask)
        
        loss = criterion(outputs, targets)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])   
    
    gc.collect()
    
    return epoch_loss


<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> Run Training </h3>
</div>


In [22]:
def run_training(model, optimizer, scheduler, train_loader, valid_loader, device, num_epochs, fold):
    """
    Train and validate a PyTorch model for a specified number of epochs.

    After every epoch the validation loss is compared with the best one seen so
    far; on improvement (or a tie) the weights are kept in memory and written to
    ``Loss-Fold-{fold}.bin`` in the current working directory. When training
    ends the best weights are loaded back into ``model``.

    A W&B run must already be active (``wandb.init`` called) because metrics
    are sent with ``wandb.log`` and the best loss is stored in the active run's
    summary.

    Parameters:
    model (torch.nn.Module): The model to train.
    optimizer (torch.optim.Optimizer): The optimizer for the model.
    scheduler (torch.optim.lr_scheduler): The learning rate scheduler.
    train_loader (torch.utils.data.DataLoader): The DataLoader for the training data.
    valid_loader (torch.utils.data.DataLoader): The DataLoader for the validation data.
    device (str): The device to train on ("cpu" or "cuda"); forwarded to the
        per-epoch train and validation functions.
    num_epochs (int): The number of epochs to train for.
    fold (int): The fold number in k-fold cross-validation.

    Returns:
    model (torch.nn.Module): The trained model, holding the best weights.
    history (dict): A dictionary containing the training and validation loss for each epoch.
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()

    # Store the initial state of the model
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf

    # Store the loss for each epoch
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Use the `device` argument instead of reaching for the global CONFIG,
        # so the function honours what the caller asked for.
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # One log call per epoch keeps both metrics on the same W&B step.
        wandb.log({"Train Loss": train_epoch_loss, "Valid Loss": val_epoch_loss})

        # If the validation loss improved, save the model weights
        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            # `wandb.run` is the currently active run; using it avoids relying
            # on a notebook-level variable named `run`.
            wandb.run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"Loss-Fold-{fold}.bin"
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved{sr_}")

        print()

    end = time.time()

    # Print total training time and best validation loss
    time_elapsed = end - start
    hours, remainder = divmod(time_elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(hours, minutes, seconds))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [23]:
def get_dataloader(fold):
    """
    Build the training and validation DataLoaders for one cross-validation fold.

    Rows of the notebook-level ``df`` whose ``kfold`` equals ``fold`` become the
    validation set; all other rows become the training set.

    Args:
        fold (int): The fold to hold out for validation.

    Returns:
        Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]:
            ``(train_loader, valid_loader)``.
    """
    def _make_dataset(frame):
        return TextDataset(frame, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])

    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = _make_dataset(train_rows)
    valid_dataset = _make_dataset(valid_rows)

    # Settings common to both loaders.
    shared_options = dict(collate_fn=collate_fn, num_workers=2, pin_memory=True)

    # Training batches are shuffled and the trailing partial batch is dropped.
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['train_batch_size'],
        shuffle=True,
        drop_last=True,
        **shared_options,
    )
    # Validation keeps every sample in its original order.
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CONFIG['valid_batch_size'],
        shuffle=False,
        **shared_options,
    )

    return train_loader, valid_loader

In [24]:
from typing import Optional
from torch.optim import Optimizer
from torch.optim.lr_scheduler import (   
    _LRScheduler
)

def get_lr_scheduler(optimizer: Optimizer) -> Optional[_LRScheduler]:
    """
    Build the learning rate scheduler selected by ``CONFIG['scheduler']``.

    Supported values:
        'CosineAnnealingLR'           -- uses CONFIG 'T_max' and 'min_lr'
        'CosineAnnealingWarmRestarts' -- uses CONFIG 'T_0' and 'min_lr'
        None (or key absent)          -- no scheduler

    Args:
        optimizer (torch.optim.Optimizer): The optimizer the scheduler will drive.

    Returns:
        torch.optim.lr_scheduler._LRScheduler or None: The scheduler, or None when
        no scheduler is configured.

    Raises:
        ValueError: If ``CONFIG['scheduler']`` holds any other value.
    """
    scheduler_type = CONFIG.get('scheduler')

    # Nothing configured.
    if scheduler_type is None:
        return None

    if scheduler_type == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=CONFIG.get('T_max'),
            eta_min=CONFIG.get('min_lr'),
        )

    if scheduler_type == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer,
            T_0=CONFIG.get('T_0'),
            eta_min=CONFIG.get('min_lr'),
        )

    raise ValueError(f"Invalid scheduler specified: {scheduler_type}")


<div style="background: linear-gradient(45deg, #FFC300, #FF5733, #C70039, #900C3F); padding: 10px; border-radius: 5px; display: flex; align-items: center;">
    <h3 style="font-weight: bold; color: white; margin: 0 auto;"> Start Training </h3>
</div>



In [25]:
# Train one fresh model per cross-validation fold, each tracked as its own W&B run.
for fold in range(0, CONFIG['n_fold']):
    print(f"{y_}====== Fold: {fold} ======{sr_}")
    # `anony` comes from the W&B login cell: None when an API key was supplied
    # (runs go to that account), "must" when login failed (anonymous run).
    # Hard-coding 'must' here would ignore the outcome of that login.
    run = wandb.init(project='E_commerce_Review', 
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=[CONFIG['model_name'], f'{HASH_NAME}'],
                     name=f'{HASH_NAME}-fold-{fold}',
                     anonymous=anony)
    
    train_loader, valid_loader = get_dataloader(fold=fold)
    
    # A new model, optimizer and scheduler are created for every fold.
    model = TextModel(CONFIG['model_name'])
    model.to(CONFIG['device'])
    
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
    scheduler = get_lr_scheduler(optimizer)
    
    model, history = run_training(model, optimizer, scheduler, train_loader, valid_loader,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'],
                                  fold=fold, )
    
    run.finish()
    
    # Drop this fold's objects before the next fold allocates its own.
    del model, history, train_loader, valid_loader
    _ = gc.collect()
    print()



[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 1444/1444 [05:54<00:00,  4.07it/s, Epoch=1, LR=1.28e-6, Train_Loss=0.486]
100%|██████████| 362/362 [00:38<00:00,  9.46it/s, Epoch=1, LR=1.28e-6, Valid_Loss=0.355]


[34mValidation Loss Improved (inf ---> 0.3551178701034357)
Model Saved[0m



100%|██████████| 1444/1444 [05:56<00:00,  4.05it/s, Epoch=2, LR=8.93e-6, Train_Loss=0.288]
100%|██████████| 362/362 [00:39<00:00,  9.21it/s, Epoch=2, LR=8.93e-6, Valid_Loss=0.332]


[34mValidation Loss Improved (0.3551178701034357 ---> 0.3316613201805034)
Model Saved[0m



100%|██████████| 1444/1444 [06:00<00:00,  4.01it/s, Epoch=3, LR=3.28e-6, Train_Loss=0.205]
100%|██████████| 362/362 [00:39<00:00,  9.28it/s, Epoch=3, LR=3.28e-6, Valid_Loss=0.284]


[34mValidation Loss Improved (0.3316613201805034 ---> 0.2840489044441713)
Model Saved[0m

Training complete in 0h 19m 56s
Best Loss: 0.2840


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Loss,█▃▁
Valid Loss,█▆▁

0,1
Best Loss,0.28405
Train Loss,0.20526
Valid Loss,0.28405


[34m[1mwandb[0m: Currently logged in as: [33manony-mouse-611990235697473960[0m. Use [1m`wandb login --relogin`[0m to force relogin





[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 1444/1444 [05:57<00:00,  4.03it/s, Epoch=1, LR=1.28e-6, Train_Loss=0.484]
100%|██████████| 362/362 [00:40<00:00,  8.99it/s, Epoch=1, LR=1.28e-6, Valid_Loss=0.384]


[34mValidation Loss Improved (inf ---> 0.38356510712293773)
Model Saved[0m



 37%|███▋      | 529/1444 [02:11<03:38,  4.18it/s, Epoch=2, LR=9.94e-6, Train_Loss=0.295]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 1444/1444 [05:56<00:00,  4.05it/s, Epoch=2, LR=8.93e-6, Train_Loss=0.283]
100%|██████████| 362/362 [00:40<00:00,  8.96it/s, Epoch=2, LR=8.93e-6, Valid_Loss=0.331]


[34mValidation Loss Improved (0.38356510712293773 ---> 0.33085977071053346)
Model Saved[0m



100%|██████████| 1444/1444 [05:58<00:00,  4.03it/s, Epoch=3, LR=3.28e-6, Train_Loss=0.215]
100%|██████████| 362/362 [00:40<00:00,  8.97it/s, Epoch=3, LR=3.28e-6, Valid_Loss=0.309]


[34mValidation Loss Improved (0.33085977071053346 ---> 0.3089650655558536)
Model Saved[0m

Training complete in 0h 20m 3s
Best Loss: 0.3090


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Loss,█▃▁
Valid Loss,█▃▁

0,1
Best Loss,0.30897
Train Loss,0.2154
Valid Loss,0.30897





[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 1444/1444 [05:57<00:00,  4.04it/s, Epoch=1, LR=1.28e-6, Train_Loss=0.486]
100%|██████████| 362/362 [00:40<00:00,  8.96it/s, Epoch=1, LR=1.28e-6, Valid_Loss=0.358]


[34mValidation Loss Improved (inf ---> 0.35839671244370025)
Model Saved[0m



100%|██████████| 1444/1444 [05:57<00:00,  4.04it/s, Epoch=2, LR=8.93e-6, Train_Loss=0.297]
100%|██████████| 362/362 [00:40<00:00,  8.96it/s, Epoch=2, LR=8.93e-6, Valid_Loss=0.3]  


[34mValidation Loss Improved (0.35839671244370025 ---> 0.30047603046673355)
Model Saved[0m



100%|██████████| 1444/1444 [05:58<00:00,  4.02it/s, Epoch=3, LR=3.28e-6, Train_Loss=0.215]
100%|██████████| 362/362 [00:40<00:00,  9.04it/s, Epoch=3, LR=3.28e-6, Valid_Loss=0.271]


[34mValidation Loss Improved (0.30047603046673355 ---> 0.2705906348886024)
Model Saved[0m

Training complete in 0h 20m 3s
Best Loss: 0.2706


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Train Loss,█▃▁
Valid Loss,█▃▁

0,1
Best Loss,0.27059
Train Loss,0.21494
Valid Loss,0.27059





In [34]:
# Rebuild the architecture so the fold-2 checkpoint can be restored into it.
model = TextModel(CONFIG['model_name'])

# Load the state dict from the .bin file.
# map_location="cpu": the model is not moved to any device in this cell, so the
# checkpoint tensors are deserialized on the CPU instead of on the device they
# were saved from. This also lets the cell run when no GPU is attached.
model.load_state_dict(torch.load("/kaggle/working/Loss-Fold-2.bin", map_location="cpu"))

# Save the entire model as .pkl
# NOTE: this pickles the whole module object, so loading it later needs the same
# TextModel class definition to be available.
torch.save(model, "/kaggle/working/Fold2.pkl")

In [44]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

# Load the saved model
# Points at Fold2.pkl, the file the export cell above actually writes; no cell in
# this notebook produces a Fold0.pkl, so that path fails on a fresh run.
# NOTE: torch.load on a whole-model file unpickles arbitrary objects; only load
# files you created yourself.
model_path = "/kaggle/working/Fold2.pkl"
model = torch.load(model_path)
model.to(CONFIG['device'])
model.eval()

def predict_sentiment(texts):
    """
    Predict a class index for every input text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``CONFIG``
    ('max_length' for truncation, 'device' for tensor placement).

    Args:
        texts: The text(s) handed to the tokenizer.

    Returns:
        torch.Tensor: For each input, the index of the highest-scoring class
        (argmax over dimension 1 of the model output).
    """
    # Pad to the longest text in the batch; truncate anything over max_length.
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=CONFIG['max_length'],
        return_tensors="pt",
    )
    token_ids = encoded['input_ids'].to(CONFIG['device'])
    pad_mask = encoded['attention_mask'].to(CONFIG['device'])

    # Inference only: no gradient tracking.
    with torch.no_grad():
        class_scores = model(token_ids, pad_mask)
        best_class = torch.argmax(class_scores, dim=1)

    return best_class

# Demo: run the classifier on three short sentences
texts = [
    "This product is amazing!",
    "This is a bad item.",
    "This is an item",
]
predictions = predict_sentiment(texts)

# Raw output: one class index per input text
print(predictions)

# Display names, positioned by class index
class_labels = ["Negative", "Neutral", "Positive"]

def interpret_predictions(predictions):
    """Translate each class index in ``predictions`` into its ``class_labels`` entry."""
    names = []
    for idx in predictions:
        names.append(class_labels[idx])
    return names

predicted_labels = interpret_predictions(predictions)

# Show every input next to its predicted sentiment
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text}\nPredicted Sentiment: {label}\n")

tensor([2, 0, 1], device='cuda:0')
Text: This product is amazing!
Predicted Sentiment: Positive

Text: This is a bad item.
Predicted Sentiment: Negative

Text: This is an item
Predicted Sentiment: Neutral



In [39]:
import joblib

# Restore the LabelEncoder stored in "le.pkl"
with open("le.pkl", "rb") as encoder_file:
    encoder = joblib.load(encoder_file)

# The encoder's known classes; note this rebinds the `class_labels` name
class_labels = encoder.classes_

# List every class together with its position in `classes_`
print("Class index to label mapping:")
for position in range(len(class_labels)):
    print(f"Index {position}: {class_labels[position]}")


Class index to label mapping:
Index 0: negative
Index 1: neutral
Index 2: positive
