In [48]:
import urllib.request
import ssl
import zipfile
import os


from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f'{data_file_path} already exists, skipping download and unzipping.')
        return

    # Download the file WITHOUT custom SSL context
    with urllib.request.urlopen(url) as response:
        with open(zip_path, 'wb') as out_file:
            out_file.write(response.read())

    # Unzip the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Rename the file to have .tsv extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f'File downloaded and saved as {data_file_path}')


# Run the function
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


sms_spam_collection\SMSSpamCollection.tsv already exists, skipping download and unzipping.


In [49]:
import pandas as pd

# Update the path if necessary
data_file_path = "sms_spam_collection/SMSSpamCollection.tsv"

# Load it into a DataFrame
df = pd.read_csv(data_file_path, sep='\t', header=None, names=["label", "message"])

# Show first 5 rows
print(df.head())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [50]:
#creating a value counts to see the classifications
print(df["label"].value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


In [51]:
#we then proceed to make the dataset balance
# Load the dataset (adjust the path if needed)

data_file_path = "sms_spam_collection/SMSSpamCollection.tsv"

df = pd.read_csv(data_file_path, sep='\t', header=None, names=["label", "message"])

def create_balanced_dataset(df):
    # Count the number of spam messages
    num_spam = df[df["label"] == 'spam'].shape[0]

    # Randomly sample ham messages to match the number of spam messages
    ham_subset = df[df["label"] == "ham"].sample(num_spam, random_state=123)

    # Combine the ham subset with all spam messages
    balanced_df = pd.concat([ham_subset, df[df["label"] == "spam"]])

    return balanced_df

# Now create the balanced dataset
balanced_df = create_balanced_dataset(df)

# Check the class distribution
print(balanced_df['label'].value_counts())


label
ham     747
spam    747
Name: count, dtype: int64


In [52]:
#we then convert the string class labels ie ham and spam into 1 and 0 respectively
balanced_df['label'] = balanced_df['label'].map({'ham':0, 'spam':1})

In [53]:
#create a random split function to split the dataset into 3
#the 3 are training data, validation data and test data
#the ratio is usually 7.1.2
#or 70% to train, 10% to validate and 20% to test
def random_split(df, train_frac, validation_frac):
    #shuffle the entire dataset first
    df = df.sample(frac = 1, random_state = 123).reset_index(drop = True)

    #calc split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    #split the dataframe
    train_df = df[:train_end]
    validation_df = df[train_end : validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)


In [54]:
print(len(train_df))
print(len(validation_df))
print(len(test_df))

1045
149
300


In [64]:
#save the dataframes to csv file to reuse later
train_df.to_csv('train.csv', index = None)
validation_df.to_csv('validation.csv', index = None)
test_df.to_csv('test.csv', index = None)



In [75]:
class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        # Drop rows where 'label' or 'message' is NaN
        self.data = self.data.dropna(subset=['label', 'message'])

        self.tokenizer = tokenizer
        self.pad_token_id = pad_token_id

        self.label2id = {'ham': 0, 'spam': 1}

        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data['message']
        ]

        if max_length is None:
            self.max_length = max(len(x) for x in self.encoded_texts)
        else:
            self.max_length = max_length

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = int(self.data.iloc[index]['label'])  # Use directly if already 0/1
        return (
        torch.tensor(encoded, dtype=torch.long),
        torch.tensor(label, dtype=torch.long)
    )

    def __len__(self):
        return len(self.data)


In [76]:
train_dataset = SpamDataset(csv_file='train.csv', tokenizer=tokenizer)

print(train_dataset.data['label'].isna().sum())     # Check how many missing labels
print(train_dataset.data['message'].isna().sum())   # Check how many missing messages
print(train_dataset.data['label'].unique())         # See label values (e.g., 'ham', 'spam')



0
0
[0 1]


In [77]:


#padding the train dataset
train_dataset = SpamDataset(
    csv_file = "train.csv",
    max_length = None,
    tokenizer = tokenizer
)

#padding the validation dataset
validation_dataset = SpamDataset(
    csv_file = "validation.csv",
    max_length = train_dataset.max_length,
    tokenizer = tokenizer
)

#padding the test dataset
test_dataset = SpamDataset(
    csv_file = "test.csv",
    max_length = train_dataset.max_length,
    tokenizer = tokenizer
)
print(train_dataset.max_length)
print(validation_dataset.max_length)
print(test_dataset.max_length)

120
120
120


In [12]:
import os
print(os.listdir())


['.bash_history', '.cache', '.conda', '.git', '.gitattributes', '.gitconfig', '.ipynb_checkpoints', '.ipython', '.jupyter', '.keras', '.ms-ad', '.viminfo', '.vscode', '4.66', 'App', 'AppData', 'Application Data', 'Contacts', 'Cookies', 'Desktop', 'Documents', 'downloading_and_processing_dataset.ipynb', 'Downloads', 'Evaluating LLM Performance on Real Dataset.ipynb', 'Favorites', 'get_download.py', 'gpt2', 'gpt2 weights saving and loading.ipynb', 'gpt2.ipynb.txt', 'gpt2model.pth', 'gpt2_dummy_weights.pth', 'gpt_download3.py', 'IntelGraphicsProfiles', 'Links', 'llm-1', 'loading_and_saving_openai_weights.ipynb', 'Local Settings', 'local_backup_loading_and_saving.ipynb', 'miniconda3', 'model_and_optimizer.pth', 'model_and_optimizer2.pth', 'Music', 'My Documents', 'NetHood', 'New folder', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{7376fb3e-2479-11f0-801d-80b6559a5fec}.TM.blf', 'NTUSER.DAT{7376fb3e-2479-11f0-801d-80b6559a5fec}.TMContainer00000000000000000001.regtrans-ms'

In [78]:
import pandas as pd

df = pd.read_csv("train.csv")
print(df.columns)



Index(['label', 'message'], dtype='object')


In [87]:
#instantiating the dataloaders
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    inputs = [item[0] for item in batch]
    labels = torch.stack([item[1] for item in batch])
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    return inputs_padded, labels
    
num_workers = 0
batch_size = 8

train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn=collate_fn,
)

validation_loader = DataLoader(
    dataset = validation_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn=collate_fn,
)

test_loader = DataLoader(
    dataset = test_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn=collate_fn,
)


In [88]:
#to ensure that the dataloaders are working and are indeed returning batches of the expected size, we iterate over the training
#loader and the print the tensor dimensions of the last batch
print("Train Loader:")
for input_batch, target_batch in train_loader:
    pass

print("Input batch dimensions: ", input_batch.shape)
print("Label batch dimensions: ", target_batch.shape)

Train Loader:
Input batch dimensions:  torch.Size([5, 50])
Label batch dimensions:  torch.Size([5])


In [89]:
#print the no of batches in each dataset
print(f"{len(train_loader)} training batches")
print(f"{len(validation_loader)} validation batches")
print(f"{len(test_loader)} test batches")

131 training batches
19 validation batches
38 test batches


In [90]:
print(131+19+38)

188
