## Imports

In [1]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

## Preprocessing

My hypothesis is that tokens with special characters or capitalization, like "FREE!!!", are more likely to indicate spam. Stripping punctuation or lowercasing the text might remove valuable features that help distinguish spam from ham. Instead, I apply only stopword removal and lemmatization, as these steps help normalize the text while preserving its meaningful content.

In [2]:
def preprocess_text(text: str) -> str:
    """Tokenize a message, drop English stopwords, and lemmatize what remains.

    Case and punctuation of the surviving tokens are left untouched; lowercasing
    is applied only for the stopword comparison.

    Requires the NLTK tokenizer, stopwords and WordNet data to be available
    locally (e.g. fetched beforehand with ``nltk.download``).

    Args:
        text: Raw message text.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    # Build the stopword set once per call. Calling stopwords.words('english')
    # inside the comprehension re-fetched the whole list for every token and
    # then searched it linearly.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word.lower() not in english_stopwords]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

In [3]:
# Relative path of the raw input file that the next cell reads.
file_path = "sms+spam+collection/SMSSpamCollection"

# Per-class accumulators for the preprocessed message texts.
ham_messages, spam_messages = [], []

In [4]:
# Read File
# Start from empty lists so that re-running this cell does not append every
# message a second time (the loop below only ever appends).
ham_messages = []
spam_messages = []

with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Each line starts with its label ("ham" or "spam") followed by a single
# separator character; the slice offsets skip the label plus that separator.
# Lines that start with neither label are ignored.
for line in tqdm(lines, desc="reading file", unit="line"):
    if line.startswith("ham"):
        ham_messages.append(preprocess_text(line[4:].strip()))
    elif line.startswith("spam"):
        spam_messages.append(preprocess_text(line[5:].strip()))

reading file: 100%|█████████████████████| 5574/5574 [00:03<00:00, 1498.19line/s]


In [5]:
# Combine both classes into a single frame: ham rows first, then spam rows.
data = pd.DataFrame({
    "text": ham_messages + spam_messages,
    "label": [0] * len(ham_messages) + [1] * len(spam_messages)  # 1 for spam, 0 otw
})

# Fixed seed so the shuffled split, and therefore the CSV files written below,
# is identical on every Restart & Run All.
SEED = 42

# First halve the data into train / remainder, then halve the remainder into
# validation / test.
train, temp_df = train_test_split(data, test_size=0.5, random_state=SEED)
validation, test = train_test_split(temp_df, test_size=0.5, random_state=SEED)
# Not stratifying, to avoid adding bias.
# NOTE(review): if the labels are imbalanced, passing stratify=<label column>
# would keep the spam ratio equal across the splits - confirm this choice.

print(len(train),len(validation),len(test)) # 50-25-25 split
train.to_csv("train.csv", index=False)
validation.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)

2787 1393 1394


In [6]:
# Display the combined frame as this cell's output (ham rows first, then spam).
data

Unnamed: 0,text,label
0,"Go jurong point , crazy .. Available bugis n g...",0
1,Ok lar ... Joking wif u oni ...,0
2,U dun say early hor ... U c already say ...,0
3,"Nah n't think go usf , life around though",0
4,Even brother like speak . treat like aid patent .,0
...,...,...
5569,Want explicit SEX 30 sec ? Ring 02073162414 ! ...,1
5570,ASKED 3MOBILE 0870 CHATLINES INCLU FREE MINS ....,1
5571,"contract mobile 11 Mnths ? Latest Motorola , N...",1
5572,REMINDER O2 : get 2.50 pound free call credit ...,1


In [7]:
# Display the training split; its index holds the shuffled row labels carried over from `data`.
train

Unnamed: 0,text,label
2031,"money issue weigh thanks , breathe easier . I....",0
3954,buy blackberry bold 2 torch . buy new used . L...,0
2284,friend use call .,0
3679,Aah ! cuddle would lush ! 'd need lot tea soup...,0
2077,Lol please . Actually send pic right . wan na ...,0
...,...,...
257,"Awesome , remember last time got somebody high...",0
2174,Ok . ask abt e movie . U wan ktv oso ?,0
5077,Call FREEPHONE 0800 542 0578 !,1
2119,Nope . Meanwhile talk say make greet .,0
