In [2]:
import numpy
import torch
import pandas as pd
import random
# Most important library for fine tuning
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from pprint import pprint

def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)

set_seed()

## Import and EDA

#### Importing Dataset

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)

In [4]:
from datasets import load_dataset

data = load_dataset("stanfordnlp/imdb")

In [5]:
complete_data = pd.DataFrame(data['unsupervised'])
train_data = pd.DataFrame(data['train'])
test_data = pd.DataFrame(data['test'])
train_data.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [6]:
train_data.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
train_data.nunique()

text     24904
label        2
dtype: int64

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [9]:
train_data.describe()

Unnamed: 0,label
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


### Lets have a look into the random row

In [10]:
random_index = random.randint(0, len(train_data)-1)
pprint(train_data.iloc[random_index,0])

('Arguably this is a very good "sequel", better than the first live action '
 'film 101 Dalmatians. It has good dogs, good actors, good jokes and all right '
 'slapstick! <br /><br />Cruella DeVil, who has had some rather major therapy, '
 'is now a lover of dogs and very kind to them. Many, including Chloe Simon, '
 'owner of one of the dogs that Cruella once tried to kill, do not believe '
 'this. Others, like Kevin Shepherd (owner of 2nd Chance Dog Shelter) believe '
 'that she has changed. <br /><br />Meanwhile, Dipstick, with his mate, have '
 'given birth to three cute dalmatian puppies! Little Dipper, Domino and '
 'Oddball...<br /><br />Starring Eric Idle as Waddlesworth (the hilarious '
 'macaw), Glenn Close as Cruella herself and Gerard Depardieu as Le Pelt '
 '(another baddie, the name should give a clue), this is a good family film '
 'with excitement and lots more!! One downfall of this film is that is has a '
 'lot of painful slapstick, but not quite as excessive as the l

## Step:1 Importing the tokenizer from the hugging face.

In [11]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [12]:
tokenized_reviews = train_data['text'].apply(lambda x: tokenizer(x, add_special_tokens= True))
review_token_length = tokenized_reviews.apply(len)

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


In [13]:
print(f"Shorest token length: {review_token_length.min()}")
print(f"Max token length: {review_token_length.max()}")
print(f"Mean token length: {review_token_length.mean()}")

Shorest token length: 2
Max token length: 2
Mean token length: 2.0


# Note: Method to convert the pandas dataset to the hugging face dataset format.

In [14]:
from datasets import Dataset
hugging_face_fromat = Dataset.from_pandas(train_data)
print(hugging_face_fromat)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


## Step:2 Next we need to tokenize the data.

##### padding="longest": This setting ensures that all sequences in a batch are padded to match the length of the longest sequence within that batch.

#### truncation=True: This parameter ensures that any sequence exceeding the model's maximum allowable length is truncated to fit within that limit. Truncation prevents errors that can occur when input sequences are too long for the model to handle.

#### batched= True: This parameter ensures that we tokenize the text in batches.

#### batch_size= 10: This parameter ensures that our batch size of 10 rows at a time.

In [15]:
def tokenize_function(example):
    return tokenizer(example['text'], padding='longest', truncation=True)

tokenized_dataset = hugging_face_fromat.map(tokenize_function, batched=True, batch_size=1500)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [16]:
tokenized_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 25000
})

## Step 3: Calling the pretrained model from hugging face.

In [17]:
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Step 4: Passing the test dataset to check the accuracy before fine-tuning the model.

#### do_train = False: This parameter ensures that model is not in the training or fine-tuning mode.

#### do_eval = True: This parameter ensures that model is in evaluation mode. We can evaluate the model with our test data.

In [18]:
test_hugg_format = Dataset.from_pandas(test_data)
tokenized_test_dataset = test_hugg_format.map(tokenize_function, batched= True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [19]:
training_args = TrainingArguments(
    output_dir = "./temp_results",  # Directory to save model checkpoints, logs, and other outputs

    # Training, Evaluation, and Prediction Flags
    do_train = False,  # Whether to perform training
    do_eval = True,  # Whether to perform evaluation on the validation set
    do_predict= False, # Indicates whether to perform predictions on the test set. If True, you can call the predict() method with your test dataset. 'predictions_output = trainer.predict(tokenized_test_dataset)'

    # Optimization Parameters
    learning_rate=5e-5, # The initial learning rate for the optimizer. Affects how quickly the model adapts to the problem.

    #Training Control Parameters
    num_train_epochs=3, # Total number of training epochs to perform. An epoch is one full pass through the training dataset.
    max_steps= 2, # If set to a positive number, training will stop after this many steps, overriding num_train_epochs.
    #per_device_train_batch_size=16, # Batch size per device (GPU/TPU/CPU) during training. Larger batch sizes can lead to faster training but require more memory.
    #per_device_eval_batch_size= 500, # Batch size per device (GPU/TPU/CPU) during evaluation. Larger batch sizes can lead to faster training but require more memory.
   
    # Logging 
    logging_dir="./logs",
    logging_steps=100,

    # Evalution 
    eval_strategy="steps",
    eval_steps= 100,
    
    seed = 40
)

trainer = Trainer(
    model = model,
    args = training_args,
    eval_dataset = tokenized_test_dataset
)

eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.6908232569694519, 'eval_model_preparation_time': 0.0022, 'eval_runtime': 345.3207, 'eval_samples_per_second': 72.396, 'eval_steps_per_second': 9.05}


# Some training configuration

# 🗂️ Output Configuration

- **`output_dir`** (`str`):  
  Specifies the directory where model checkpoints, logs, and other outputs will be saved.

- **`overwrite_output_dir`** (`bool`):  
  If set to `True`, the contents of the `output_dir` will be overwritten if it already exists.

---

# 🧪 Training, Evaluation, and Prediction Flags

- **`do_train`** (`bool`):  
  Indicates whether to perform training. If `True`, the `train()` method will execute the training loop.

- **`do_eval`** (`bool`):  
  Indicates whether to perform evaluation on the validation set. If `True`, evaluation is conducted during training at specified intervals.

- **`do_predict`** (`bool`):  
  Indicates whether to perform predictions on the test set. If `True`, you can call the `predict()` method with your test dataset.

---

# 🧮 Optimization Parameters

- **`learning_rate`** (`float`):  
  The initial learning rate for the optimizer. Affects how quickly the model adapts to the problem.

- **`weight_decay`** (`float`):  
  Weight decay (L2 penalty) to apply to the optimizer. Helps prevent overfitting by penalizing large weights.

- **`adam_beta1`** (`float`):  
  The beta1 parameter for the Adam optimizer, controlling the exponential decay rate for the first moment estimates.

- **`adam_beta2`** (`float`):  
  The beta2 parameter for the Adam optimizer, controlling the exponential decay rate for the second moment estimates.

- **`adam_epsilon`** (`float`):  
  A small constant for numerical stability in the Adam optimizer.

- **`max_grad_norm`** (`float`):  
  Maximum gradient norm for gradient clipping. Prevents exploding gradients by capping their norm.

---

# 📊 Training Control Parameters

- **`num_train_epochs`** (`float`):  
  Total number of training epochs to perform. An epoch is one full pass through the training dataset.

- **`max_steps`** (`int`):  
  If set to a positive number, training will stop after this many steps, overriding `num_train_epochs`.

- **`per_device_train_batch_size`** (`int`):  
  Batch size per device (GPU/TPU/CPU) during training. Larger batch sizes can lead to faster training but require more memory.

- **`per_device_eval_batch_size`** (`int`):  
  Batch size per device during evaluation.

- **`gradient_accumulation_steps`** (`int`):  
  Number of update steps to accumulate before performing a backward/update pass. Useful for training with large batch sizes that don't fit in memory.

---

# 📈 Evaluation and Logging

- **`eval_strategy`** (`str`):  
  Determines when to evaluate the model during training. Options:
  - `"no"`: No evaluation during training.
  - `"steps"`: Evaluate every `eval_steps`.
  - `"epoch"`: Evaluate at the end of each epoch.

- **`eval_steps`** (`int`):  
  Number of update steps between two evaluations if `evaluation_strategy` is set to `"steps"`.

- **`logging_dir`** (`str`):  
  Directory for storing logs.

- **`logging_strategy`** (`str`):  
  Determines when to log training metrics. Options:
  - `"no"`: No logging.
  - `"steps"`: Log every `logging_steps`.
  - `"epoch"`: Log at the end of each epoch.

- **`logging_steps`** (`int`):  
  Number of update steps between two logs if `logging_strategy` is set to `"steps"`.

- **`save_strategy`** (`str`):  
  Determines when to save model checkpoints. Options:
  - `"no"`: No saving.
  - `"steps"`: Save every `save_steps`.
  - `"epoch"`: Save at the end of each epoch.

- **`save_steps`** (`int`):  
  Number of update steps between two checkpoint saves if `save_strategy` is set to `"steps"`.

- **`save_total_limit`** (`int`):  
  Limits the total number of checkpoints. Older checkpoints are deleted when the limit is reached.

---

# ⚙️ Advanced Features

- **`fp16`** (`bool`):  
  Whether to use 16-bit (mixed) precision training. Can lead to faster training and reduced memory usage on compatible hardware.

- **`fp16_opt_level`** (`str`):  
  Optimization level for mixed precision training. Options: `"O0"`, `"O1"`, `"O2"`, `"O3"`.

- **`load_best_model_at_end`** (`bool`):  
  Whether to load the best model found during training at the end of training, based on the evaluation metric specified by `metric_for_best_model`.

- **`metric_for_best_model`** (`str`):  
  The metric to use to compare two different models.

- **`greater_is_better`** (`bool`):  
  Whether the `metric_for_best_model` should be maximized or minimized.

- **`report_to`** (`List[str]`):  
  The list of integrations to report the results and logs to. Supported platforms include `"tensorboard"`, `"wandb"`, `"comet_ml"`, etc.

---

# 🔧 Miscellaneous

- **`seed`** (`int`):  
  Random seed for reproducibility. Ensures that training results are consistent across runs.

- **`data_seed`** (`int`):  
  Random seed to be used with data samplers. If not set, it will use the value of `seed`.

- **`gradient_checkpointing`** (`bool`):  
  If `True`, use gradient checkpointing to save memory at the expense of a slower backward pass.

- **`label_smoothing_factor`** (`float`):  
  The label smoothing factor to use. Zero means no label smoothing.

- **`optim`** (`str`):  
  The optimizer to use. Options include `"adamw_torch"`, `"adamw_hf"`, `"adamw_torch_fused"`, etc.

- **`lr_scheduler_type`** (`str`):  
  The learning rate scheduler to use. Options include `"linear"`, `"cosine"`, `"polynomial"`, etc.

- **`warmup_steps`** (`int`):  
  Number of steps used for a linear warmup from 0 to `learning_rate`.

- **`warmup_ratio`** (`float`):  
  The ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
