In [14]:
import urllib.request
import ssl
import zipfile
import os
import torch


from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f'{data_file_path} already exists, skipping download and unzipping.')
        return

    # Download the file WITHOUT custom SSL context
    with urllib.request.urlopen(url) as response:
        with open(zip_path, 'wb') as out_file:
            out_file.write(response.read())

    # Unzip the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Rename the file to have .tsv extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f'File downloaded and saved as {data_file_path}')


# Run the function
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


sms_spam_collection\SMSSpamCollection.tsv already exists, skipping download and unzipping.


In [15]:
import pandas as pd

# Update the path if necessary
data_file_path = "sms_spam_collection/SMSSpamCollection.tsv"

# Load it into a DataFrame
df = pd.read_csv(data_file_path, sep='\t', header=None, names=["label", "message"])

# Show first 5 rows
print(df.head())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [16]:
#creating a value counts to see the classifications
print(df["label"].value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


In [17]:
#we then proceed to make the dataset balance
# Load the dataset (adjust the path if needed)

data_file_path = "sms_spam_collection/SMSSpamCollection.tsv"

df = pd.read_csv(data_file_path, sep='\t', header=None, names=["label", "message"])

def create_balanced_dataset(df):
    # Count the number of spam messages
    num_spam = df[df["label"] == 'spam'].shape[0]

    # Randomly sample ham messages to match the number of spam messages
    ham_subset = df[df["label"] == "ham"].sample(num_spam, random_state=123)

    # Combine the ham subset with all spam messages
    balanced_df = pd.concat([ham_subset, df[df["label"] == "spam"]])

    return balanced_df

# Now create the balanced dataset
balanced_df = create_balanced_dataset(df)

# Check the class distribution
print(balanced_df['label'].value_counts())


label
ham     747
spam    747
Name: count, dtype: int64


In [18]:
#we then convert the string class labels ie ham and spam into 1 and 0 respectively
balanced_df['label'] = balanced_df['label'].map({'ham':0, 'spam':1})

In [19]:
#create a random split function to split the dataset into 3
#the 3 are training data, validation data and test data
#the ratio is usually 7.1.2
#or 70% to train, 10% to validate and 20% to test
def random_split(df, train_frac, validation_frac):
    #shuffle the entire dataset first
    df = df.sample(frac = 1, random_state = 123).reset_index(drop = True)

    #calc split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    #split the dataframe
    train_df = df[:train_end]
    validation_df = df[train_end : validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)


In [20]:
print(len(train_df))
print(len(validation_df))
print(len(test_df))

1045
149
300


In [21]:
#save the dataframes to csv file to reuse later
train_df.to_csv('train.csv', index = None)
validation_df.to_csv('validation.csv', index = None)
test_df.to_csv('test.csv', index = None)



In [22]:
class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        # Drop rows where 'label' or 'message' is NaN
        self.data = self.data.dropna(subset=['label', 'message'])

        self.tokenizer = tokenizer
        self.pad_token_id = pad_token_id

        self.label2id = {'ham': 0, 'spam': 1}

        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data['message']
        ]

        if max_length is None:
            self.max_length = max(len(x) for x in self.encoded_texts)
        else:
            self.max_length = max_length

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = int(self.data.iloc[index]['label'])  # Use directly if already 0/1
        return (
        torch.tensor(encoded, dtype=torch.long),
        torch.tensor(label, dtype=torch.long)
    )

    def __len__(self):
        return len(self.data)


In [76]:
train_dataset = SpamDataset(csv_file='train.csv', tokenizer=tokenizer)

print(train_dataset.data['label'].isna().sum())     # Check how many missing labels
print(train_dataset.data['message'].isna().sum())   # Check how many missing messages
print(train_dataset.data['label'].unique())         # See label values (e.g., 'ham', 'spam')



0
0
[0 1]


In [27]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # since GPT-2 has no official padding token


#padding the train dataset
train_dataset = SpamDataset(
    csv_file = "train.csv",
    max_length = None,
    tokenizer = tokenizer
)

#padding the validation dataset
validation_dataset = SpamDataset(
    csv_file = "validation.csv",
    max_length = train_dataset.max_length,
    tokenizer = tokenizer
)

#padding the test dataset
test_dataset = SpamDataset(
    csv_file = "test.csv",
    max_length = train_dataset.max_length,
    tokenizer = tokenizer
)
print(train_dataset.max_length)
print(validation_dataset.max_length)
print(test_dataset.max_length)

120
120
120


In [28]:
import os
print(os.listdir())


['.bash_history', '.cache', '.conda', '.git', '.gitattributes', '.gitconfig', '.ipynb_checkpoints', '.ipython', '.jupyter', '.keras', '.ms-ad', '.viminfo', '.vscode', '4.66', 'App', 'AppData', 'Application Data', 'Contacts', 'Cookies', 'dataloaders.ipynb', 'Desktop', 'Documents', 'downloading_and_processing_dataset.ipynb', 'Downloads', 'Evaluating LLM Performance on Real Dataset.ipynb', 'Favorites', 'get_download.py', 'gpt2 weights saving and loading.ipynb', 'gpt2.ipynb.txt', 'gpt2model.pth', 'gpt2_dummy_weights.pth', 'gpt2_local_backup', 'gpt_download3.py', 'initializing_model_for_finetuning.ipynb', 'IntelGraphicsProfiles', 'Links', 'llm-1', 'loading_and_saving_openai_weights.ipynb', 'Local Settings', 'local_backup_loading_and_saving.ipynb', 'miniconda3', 'model_and_optimizer.pth', 'model_and_optimizer2.pth', 'Music', 'My Documents', 'NetHood', 'New folder', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{7376fb3e-2479-11f0-801d-80b6559a5fec}.TM.blf', 'NTUSER.DAT{7376f

In [29]:
import pandas as pd

df = pd.read_csv("train.csv")
print(df.columns)



Index(['label', 'message'], dtype='object')


In [30]:
#instantiating the dataloaders
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    inputs = [item[0] for item in batch]
    labels = torch.stack([item[1] for item in batch])
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    return inputs_padded, labels
    
num_workers = 0
batch_size = 8

train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn=collate_fn,
)

validation_loader = DataLoader(
    dataset = validation_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn=collate_fn,
)

test_loader = DataLoader(
    dataset = test_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn=collate_fn,
)


In [88]:
#to ensure that the dataloaders are working and are indeed returning batches of the expected size, we iterate over the training
#loader and the print the tensor dimensions of the last batch
print("Train Loader:")
for input_batch, target_batch in train_loader:
    pass

print("Input batch dimensions: ", input_batch.shape)
print("Label batch dimensions: ", target_batch.shape)

Train Loader:
Input batch dimensions:  torch.Size([5, 50])
Label batch dimensions:  torch.Size([5])


In [89]:
#print the no of batches in each dataset
print(f"{len(train_loader)} training batches")
print(f"{len(validation_loader)} validation batches")
print(f"{len(test_loader)} test batches")

131 training batches
19 validation batches
38 test batches


In [90]:
print(131+19+38)

188


In [54]:
#initializing a model with pretained weights for finetuning
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

BASIC_CONFIG = {
    "vocab_size": 50257,
    "max_seq_len": 1024,
    "dropout": 0.0,
    "embed_dim": 768,
    "num_layers": 12,
    "num_heads": 12
}




model_configs = {
    "gpt2-small (124M)" : {"embed_dim": 768, "num_layers": 12, "num_heads": 12},
    "gpt2-medium (355M)" : {"embed_dim": 1024, "num_layers": 24, "num_heads": 16},
    "gpt2-large (774M)" : {"embed_dim": 1280, "num_layers": 36, "num_heads": 20},
    "gpt2-xl (1558M)" : {"embed_dim": 1600, "num_layers": 48, "num_heads": 25},
}

BASIC_CONFIG = {
    "vocab_size": 50257,
    "max_seq_len": 1024,
    "dropout": 0.0
}

BASIC_CONFIG.update(model_configs[CHOOSE_MODEL])

assert train_dataset.max_length <= BASIC_CONFIG["max_seq_len"], (
    f"Dataset length {train_dataset.max_length} exceeds model's context "
    f"length {BASIC_CONFIG['max_seq_len']}. Reinitialize datasets with "
    f"max_length={BASIC_CONFIG['max_seq_len']}"
)



In [55]:
#downloading and loading the model
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip(" (").rstrip(")")
from gpt_download3 import download_and_load_gpt2



In [56]:

import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# ========== GPT2 Model Definition ==========
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.qkv_proj(x).view(B, T, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(dim=2)
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        attn_scores = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_scores = attn_scores.masked_fill(torch.tril(torch.ones(T, T, device=x.device)) == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        out = attn_weights @ v
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.o_proj(out)

class FeedForward(nn.Module):
    def __init__(self, embed_dim, dropout=0.1):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.GELU(),
            nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.layers(x)

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.ffn = FeedForward(embed_dim, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x + self.dropout(self.attn(self.norm1(x)))
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x

class GPTModel(nn.Module):
    def __init__(self, vocab_size, max_seq_len, embed_dim, num_heads, num_layers, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_seq_len, embed_dim)
        self.trf_block = nn.Sequential(*[
            TransformerBlock(embed_dim, num_heads, dropout)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.final_norm = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        B, T = x.size()
        tok_embed = self.token_emb(x)
        pos_ids = torch.arange(T, device=x.device).unsqueeze(0)
        pos_embed = self.pos_emb(pos_ids)
        x = tok_embed + pos_embed
        x = self.trf_block(x)
        x = self.dropout(x)
        x = self.final_norm(x)
        logits = self.lm_head(x)
        return logits


In [60]:
#loading the pretrained weights
settings, params =  download_and_load_gpt2(model_size = model_size, models_dir = "gpt2")
model = GPTModel(**BASIC_CONFIG)
def load_weights_into_gpt(model, params):
    model_dict = model.state_dict()
    for name, param in params.items():
        if name in model_dict:
            try:
                model_dict[name].copy_(param)
            except Exception as e:
                print(f"Error loading {name}: {e}")


load_weights_into_gpt(model, params)
model.eval();



File already exists and is up-to-date: gpt2\124M\checkpoint




File already exists and is up-to-date: gpt2\124M\encoder.json




File already exists and is up-to-date: gpt2\124M\hparams.json




File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2\124M\model.ckpt.index




File already exists and is up-to-date: gpt2\124M\model.ckpt.meta




File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [80]:
#testing whether the model was loaded successfully
import torch

# Input text
text_1 = "Every effort moves you"

# Tokenizer utilities
def text_to_token_ids(text, tokenizer):
    return torch.tensor(tokenizer.encode(text), dtype=torch.long)

def token_ids_to_text(token_ids, tokenizer):
    return tokenizer.decode(token_ids.tolist(), skip_special_tokens=True)

# Text generation
@torch.no_grad()
@torch.no_grad()
def generate_text_simple(model, idx, max_new_tokens, context_size):
    model.eval()
    idx = idx.clone().detach().unsqueeze(0)  # (1, sequence_length)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]

        outputs = model(idx_cond)
        logits = outputs.logits  # likely shape (1, vocab_size)

        # If logits is 2D, skip indexing; otherwise take last token
        if logits.dim() == 3:
            logits = logits[:, -1, :]
        
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.argmax(probs, dim=-1, keepdim=True)
        idx = torch.cat((idx, next_token), dim=1)

    return idx[0]

# Generate text
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_1, tokenizer),
    max_new_tokens=15,
    context_size=BASIC_CONFIG["max_seq_len"]
)

# Decode and print generated text
print(token_ids_to_text(token_ids, tokenizer))


every effort moves you [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0] [unused0]


In [66]:
#from huggingface_hub import login

#login(token="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")


In [68]:
#from huggingface_hub import whoami

#user_info = whoami()
#print(user_info)


{'type': 'user', 'id': '683cb0c3a69d5f276bbb4d97', 'name': 'Okoth67', 'fullname': 'Brian Okoth', 'email': 'bbollo386@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/4af57a6209ab5488520bc60485f444df.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'llm-1', 'role': 'read', 'createdAt': '2025-06-07T09:51:00.695Z'}}}


In [73]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

#tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
#hg_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

#print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!


In [74]:
print(model)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [81]:
#freezing the model before classification finetuning
for param in model.parameters():
    param.requires_grad = False

In [82]:
#then we replace the output layer (model.out_head) 
torch.manual_seed(123)
num_classes = 2
model.out_head =  torch.nn.Linear(in_features = BASIC_CONFIG["embed_dim"], out_features = num_classes)

In [84]:
#lastly we configure the last transformer block and LayerNorm module
# Unfreeze the last transformer block
for param in model.distilbert.transformer.layer[-1].parameters():
    param.requires_grad = True

# Unfreeze the final LayerNorm (called output_layer_norm in the last transformer block)
for param in model.distilbert.transformer.layer[-1].output_layer_norm.parameters():
    param.requires_grad = True


In [85]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs :", inputs)
print("Inputs dimensions: ", inputs.shape)

Inputs : tensor([[ 101, 2079, 2017, 2031, 2051,  102]])
Inputs dimensions:  torch.Size([1, 6])


In [87]:
#then we can pass encoded token IDs to the model as usual
with torch.no_grad():
    outputs = model(inputs)
print("Outputs: ", outputs)
print("Output dimensions: ", outputs.logits.shape)

Outputs:  SequenceClassifierOutput(loss=None, logits=tensor([[ 1.7597, -1.4899]]), hidden_states=None, attentions=None)
Output dimensions:  torch.Size([1, 2])


In [90]:
#to extract the last output token from the output tensor
print("Logits:", outputs.logits)
print("Shape:", outputs.logits.shape)  # Should be (batch_size, num_labels)


Logits: tensor([[ 1.7597, -1.4899]])
Shape: torch.Size([1, 2])
