In [139]:
import torch
import pandas as pd
import torch.nn.functional as F
import tiktoken
from dataclasses import dataclass
from transformers import GPT2Model, GPT2Tokenizer
import random
import torch.nn as nn

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df['sentiment'].replace('positive', 0, inplace=True)
df['sentiment'].replace('negative', 1, inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment'].replace('positive', 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment'].replace('negative', 1, inplace=True)
  df['sentiment'].replace('negative', 1, inplace=True)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,0
1,A wonderful little production. <br /><br />The...,0
2,I thought this was a wonderful way to spend ti...,0
3,Basically there's a family where a little boy ...,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",0


In [4]:
def pre_process_text(text):
    text = text.replace('<br />', "")
    text = text.strip()
    return text
df['review'] = df['review'].apply(pre_process_text)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,0
1,A wonderful little production. The filming tec...,0
2,I thought this was a wonderful way to spend ti...,0
3,Basically there's a family where a little boy ...,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",0


In [140]:
@dataclass
class Config:
    block_size: int = 1024
    batch_size: int = 16
    model_name = 'gpt2'

In [82]:
def convert_text_to_tokens(model_name, block_size):
    encoder = GPT2Tokenizer.from_pretrained(model_name)
    tokens, labels = [], []
    for indx, row in df.iterrows():
        if not isinstance(row['review'], str):
            continue
        curr_tokens = encoder.encode(row['review'], max_length=block_size, add_special_tokens=True, truncation=True)
        if len(curr_tokens) > block_size: 
            curr_tokens = curr_tokens[:block_size]
        else: 
            curr_tokens = curr_tokens + [encoder.eos_token_id] * (block_size - len(curr_tokens))
        tokens.append(curr_tokens)
        labels.append(row['sentiment'])
    return torch.tensor(tokens, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

In [83]:
config = Config()
tokens, labels = convert_text_to_tokens(Config().model_name, config.block_size)

In [86]:
print(len(tokens))
# print(labels[:2])

50000


In [112]:
def train_test_val_split(train_ratio, val_ratio):
    test_ratio = 1 - train_ratio - val_ratio
    indices_map = {}
    train_indx, val_indx, test_indx = [], [], []
    for indx, label in enumerate(labels):
        label = label.item()
        indices_map.setdefault(label, []).append(indx)
    for key, val in indices_map.items():
        shuffled_labels = val.copy()
        random.shuffle(shuffled_labels)
        train_num = int(len(shuffled_labels) * train_ratio)
        val_num = int(len(shuffled_labels) * val_ratio)

        train_indx.extend(shuffled_labels[:train_num])
        val_indx.extend(shuffled_labels[train_num: train_num + val_num])
        test_indx.extend(shuffled_labels[train_num + val_num:])

    return tokens[train_indx], labels[train_indx], tokens[val_indx], labels[val_indx], tokens[test_indx], labels[test_indx]

In [113]:
train_data, train_labels, val_data, val_labels, test_data, test_labels = train_test_val_split(0.70, 0.20)

In [128]:
def shuffle_train_data(tr_data, tr_labels):
    # shuffle train data
    train_indices = torch.randperm(len(tr_data))
    return tr_data[train_indices], tr_labels[train_indices]
train_data, train_labels = shuffle_train_data(train_data, train_labels)
count_zeros = (train_labels == 0).sum().item()
count_ones = (train_labels == 1).sum().item()

print("Count of 0:", count_zeros)
print("Count of 1:", count_ones)

Count of 0: 17500
Count of 1: 17500


In [129]:
print(train_labels[:15])

tensor([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])


In [132]:
class dataLoaderLite:
    def __init__(self, split, config):
        self.config = config
        if split in 'train':
            self.data, self.labels = train_data, train_labels
        elif split in 'val': 
            self.data, self.lables = val_data, val_labels
        else:
            self.data, self.labels = test_data, test_labels
        self.current_indx = 0
    def next_batch(self):
        nxt_data, nxt_labels = self.data[self.current_indx: self.current_indx + self.config.batch_size], self.labels[self.current_indx: self.current_indx + self.config.batch_size]
        self.current_indx += self.config.batch_size
        if self.current_indx + self.config.batch_size > len(self.data):
            self.current_indx = 0
        return nxt_data, nxt_labels
    def reset(self):
        self.current_indx = 0

In [133]:
train_loader, val_loader, test_loader = dataLoaderLite('train', Config()), dataLoaderLite('val', Config()), dataLoaderLite('test', Config())

In [138]:
train_loader.reset()
val_loader.reset()
test_loader.reset()

In [145]:
class GPT2Classifier(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.gpt2_model = GPT2Model.from_pretrained(self.config.model_name)
        for param in self.gpt2_model.parameters():
            param.requires_grad = False
        for block in self.gpt2_model.h[-3:]:
            for param in block.parameters(): 
                param.requires_grad = True
        for param in self.gpt2_model.ln_f.parameters():
            param.requires_grad = True
        self.classifier = nn.Linear(self.gpt2_model.config.n_embd, 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x, attention_mask):
        outputs = self.gpt2_model(x, attention_mask)
        hidden_states = outputs.last_hidden_state 

        # Use hidden state at the last non-padding token (EOS-style pooling)
        seq_lengths = attention_mask.sum(dim=1) - 1
        pooled = hidden_states[torch.arange(input_ids.size(0), device=input_ids.device), seq_lengths]

        pooled = self.dropout(pooled)
        return self.classifier(pooled)

        

In [146]:
torch.set_float32_matmul_precision('high')
device = 'cpu'
if torch.cuda.is_available(): 
    device = 'cuda'
print(f"using device {device}")
raw_model = GPT2Classifier(Config())
raw_model.to(device)
model = torch.compile(raw_model)

using device cpu


In [147]:
max_lr = 2e-5
min_lr = max_lr * 0.01
warmup_steps = 200
max_steps = 2500
total_grad_steps = 1 << 16
weight_decay = 0.1
grad_accum_steps = total_grad_steps // (Config().block_size * Config().batch_size)

def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < warmup_steps:
        return max_lr * (it+1) / warmup_steps
    # 2) if it > lr_decay_iters, return min learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and goes to 0
    return min_lr + coeff * (max_lr - min_lr)

In [148]:
def configure_optimizer(model, lr, weight_decay):
    param_group = {pn:p for pn, p in model.named_parameters()}
    param_group = {pn: p for pn, p in param_group.items() if p.requires_grad}

    decay_params = [p for pn, p in param_group.items() if p.dim() >= 2]
    non_decay_params = [p for pn, p in param_group.items() if p.dim() < 2]

    optim_group = [
        {"params": decay_params, "weight_decay": weight_decay}, 
        {"params": non_decay_params, "weight_decay": 0.0}
    ]
    fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    is_fused = "cuda" in device and fused_available
    optimizer = torch.optim.AdamW(optim_group, lr=lr, fused=is_fused)
    return optimizer