In [5]:
import urllib.request
import ssl
import zipfile
import os


from pathlib import Path

# Source archive on the UCI server and the local paths used while fetching it
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"      # local file the archive is written to
extracted_path = "sms_spam_collection"    # directory the archive is unpacked into
# Path of the dataset file inside the extraction directory (with .tsv extension)
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the SMS spam archive, unpack it, and rename the data file.

    Args:
        url: Location of the zip archive; opened with ``urllib.request.urlopen``.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final path of the dataset file (``str`` or ``Path``).
            If this path already exists, nothing is downloaded or extracted.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Accept plain strings as well as Path objects: other cells in this
    # notebook rebind ``data_file_path`` to a str, which has no ``.exists()``.
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f'{data_file_path} already exists, skipping download and unzipping.')
        return

    # Download the archive using the default SSL settings (no custom context)
    with urllib.request.urlopen(url) as response:
        with open(zip_path, 'wb') as out_file:
            out_file.write(response.read())

    # Unpack every member of the archive into the extraction directory
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The extracted file has no extension; rename it to the .tsv target path
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f'File downloaded and saved as {data_file_path}')


# Fetch the dataset using the paths configured above
download_and_unzip_spam_data(
    url=url,
    zip_path=zip_path,
    extracted_path=extracted_path,
    data_file_path=data_file_path,
)


sms_spam_collection\SMSSpamCollection.tsv already exists, skipping download and unzipping.


In [6]:
import pandas as pd

# Location of the extracted dataset; change this if the file lives elsewhere
data_file_path = "sms_spam_collection/SMSSpamCollection.tsv"

# The file is tab-separated and has no header row, so column names are given here.
# NOTE(review): read_csv's default quoting treats '"' as a quote character; if a
# message contains an unbalanced quote, adjacent lines could be merged into one
# row. Verify the row count against the dataset's documentation.
df = pd.read_csv(
    data_file_path,
    sep='\t',
    header=None,
    names=["label", "message"],
)

# Preview the first 5 rows
print(df.head(5))


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
# Count how many messages fall into each class
label_counts = df["label"].value_counts()
print(label_counts)

label
ham     4825
spam     747
Name: count, dtype: int64


In [8]:
# Next step: balance the dataset.
# Reload the raw data first (adjust the path if needed).
data_file_path = "sms_spam_collection/SMSSpamCollection.tsv"
df = pd.read_csv(
    data_file_path,
    sep='\t',
    header=None,
    names=["label", "message"],
)

def create_balanced_dataset(df, random_state=123):
    """Return a copy of ``df`` with equal numbers of "ham" and "spam" rows.

    The larger class is randomly downsampled to the size of the smaller one.
    Rows whose ``label`` is neither "ham" nor "spam" are dropped.

    Args:
        df: DataFrame with a ``label`` column containing "ham" / "spam".
        random_state: Seed passed to ``DataFrame.sample`` so the selection is
            reproducible. Defaults to 123.

    Returns:
        A new DataFrame with the ham rows first, followed by the spam rows.
        The original index values are kept.
    """
    ham_rows = df[df["label"] == "ham"]
    spam_rows = df[df["label"] == "spam"]

    # Size of the smaller class; sampling more than a class contains would raise
    target_size = min(len(ham_rows), len(spam_rows))

    # Ham is always drawn through .sample(); spam is only sampled when it is the
    # larger class, so when spam is the minority all spam rows are kept in
    # their original order.
    ham_subset = ham_rows.sample(target_size, random_state=random_state)
    if len(spam_rows) > target_size:
        spam_rows = spam_rows.sample(target_size, random_state=random_state)

    balanced_df = pd.concat([ham_subset, spam_rows])

    return balanced_df

# Build the balanced dataset, then print the per-class row counts as a check
balanced_df = create_balanced_dataset(df)
balanced_counts = balanced_df['label'].value_counts()
print(balanced_counts)


label
ham     747
spam    747
Name: count, dtype: int64


In [9]:
# Convert the string class labels to integers: 'ham' -> 0 and 'spam' -> 1.
# Values that are not keys of the mapping are passed through unchanged, so
# running this cell a second time leaves the already-encoded 0/1 labels intact
# instead of turning every label into NaN.
label_to_id = {'ham': 0, 'spam': 1}
balanced_df['label'] = balanced_df['label'].map(lambda value: label_to_id.get(value, value))

In [10]:
# Split the dataset into three parts: training, validation, and test data.
# A common ratio is 7:1:2, i.e. 70% to train, 10% to validate and 20% to test.
def random_split(df, train_frac, validation_frac, random_state=123):
    """Shuffle ``df`` and split it into train, validation, and test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows (0..1) assigned to the training set.
        validation_frac: Fraction of rows (0..1) assigned to the validation set.
        random_state: Seed for the shuffle, so the split is reproducible.
            Defaults to 123.

    Returns:
        A tuple ``(train_df, validation_df, test_df)``. The test set receives
        every row left over after the first two parts. All three are slices of
        the shuffled frame, whose index is reset to 0..len(df)-1.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so sums such as 0.9 + 0.1 are not rejected due to
    # floating-point rounding.
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and validation_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, validation_frac={validation_frac}"
        )

    # Shuffle the entire dataset first
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Calculate the split indices (fractions are truncated to whole rows)
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the shuffled DataFrame into three consecutive slices
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

# 70% of the rows for training, 10% for validation; the rest form the test set
train_df, validation_df, test_df = random_split(
    balanced_df, train_frac=0.7, validation_frac=0.1
)


In [12]:
# Print the number of rows in each split (train, validation, test)
for split_df in (train_df, validation_df, test_df):
    print(len(split_df))

1045
149
300


In [13]:
# Save the three splits as CSV files so they can be reused later.
# index=False is the documented boolean form for omitting the row index
# column (the previous index=None only worked because None is falsy).
train_df.to_csv('train.csv', index=False)
validation_df.to_csv('validation.csv', index=False)
test_df.to_csv('test.csv', index=False)
