In [1]:
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np

np.random.seed(42)

def load_imdb_dataset():
    imdb_dataset = load_dataset("imdb")
    N = 1000
    rand_idx = np.random.randint(24999, size=N)
    
    x_train = imdb_dataset['train'][rand_idx]['text']
    y_train = imdb_dataset['train'][rand_idx]['label']
    
    x_test = imdb_dataset['test'][rand_idx]['text']
    y_test = imdb_dataset['test'][rand_idx]['label']
    
    return DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'test':Dataset.from_dict({'label':y_test,'text':x_test})})    

dataset = load_imdb_dataset()

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [3]:
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

np.float64(0.504)

## Model : TinyLlama-1.1B-Chatv1.0

In [4]:
import torch
from transformers import AutoModelForCausalLM

model_1 = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    num_labels = 2,
    torch_dtype = torch.float16,
    output_attentions=False,
    output_hidden_states=False
)

In [5]:
model_1

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [6]:
from transformers import AutoTokenizer

model_1_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

if model_1_tokenizer.pad_token is None:
    model_1_tokenizer.pad_token = model_1_tokenizer.eos_token

In [7]:
print(model_1_tokenizer.eos_token)

</s>


In [8]:
test = "Hi! How are you?"

In [9]:
encoded_text = model_1_tokenizer(test)["input_ids"]

In [10]:
encoded_text

[1, 6324, 29991, 1128, 526, 366, 29973]

In [11]:
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

import re

for i in range(len(train_texts)):
    train_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", train_texts[i])
    train_texts[i] = re.sub(r"\s+", " ", train_texts[i]).strip()
for i in range(len(test_texts)):
    test_texts[i] = re.sub(r"[^a-zA-Z0-9\s]", "", test_texts[i])
    test_texts[i] = re.sub(r"\s+", " ", test_texts[i]).strip()

In [12]:
def tokenize(texts):
    return model_1_tokenizer(
        texts, 
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# Tokenize training data
train_encodings = tokenize(train_texts)
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
train_labels = torch.tensor(train_labels)
    
# Tokenize test data
test_encodings = tokenize(test_texts)
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']
test_labels = torch.tensor(test_labels)

In [13]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=8
)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=8
)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [15]:
model_1.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [16]:
len(train_dataloader)

125

In [17]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 1
optimizer = AdamW(model_1.parameters(), lr=5e-5)

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [18]:
from tqdm.notebook import tqdm
progress_bar = tqdm(range(total_steps))

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
            
    # Training
    model_1.train()
    total_train_loss = 0
    train_accuracy = 0
    nb_train_steps = 0
            
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
                
        model_1.zero_grad()
                
        outputs = model_1(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels
        )
                
        loss = outputs.loss
        total_train_loss += loss.item()
                
        # Calculate accuracy
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        correct = (preds == b_labels).sum().item()
        train_accuracy += correct / len(b_labels)
        nb_train_steps += 1
                
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_1.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
            
        # Calculate average loss and accuracy
        avg_train_loss = total_train_loss / len(train_dataloader)
        avg_train_accuracy = train_accuracy / nb_train_steps
            
        print(f"\n  Training loss: {avg_train_loss:.4f}")
        print(f"  Training accuracy: {avg_train_accuracy:.4f}")
        progress_bar.update(1)

  0%|          | 0/125 [00:00<?, ?it/s]




OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.46 GiB is allocated by PyTorch, and 9.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)