Build a prototype for sms spam classification 
- in prepare.ipynb write the functions to 
    - load the data from a given file path
    - preprocess the data (if needed)
    - split the data into train/validation/test 
    - store the splits at train.csv/validation.csv/test.csv
- in train.ipynb write the functions to
    - fit a model on train data
    - score a model on given data
    - evaluate the model predictions
    - validate the model
        - fit on train
        - score on train and validation
        - evaluate on train and validation
        - fine-tune hyper-params using train and validation (if necessary)
- score three benchmark models on test data and select the best one


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data

In [2]:
FILE_PATH = 'sms_spam_collection/SMSSpamCollection'

In [3]:
df = pd.read_csv(FILE_PATH, sep='\t', names=['label', 'message'])

In [4]:
df.sample(5)


Unnamed: 0,label,message
4036,spam,YOU ARE CHOSEN TO RECEIVE A £350 AWARD! Pls ca...
1717,ham,Sorry about earlier. Putting out fires.Are you...
664,ham,Leave it de:-). Start Prepare for next:-)..
1626,ham,Hi Dear Call me its urgnt. I don't know whats ...
38,ham,Anything lor... U decide...


# Preprocess the data
* lowercase the text
* tokenize the text
* remove stop words


In [5]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [6]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
pd.set_option('display.max_colwidth', None)

In [8]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)


In [9]:
df['preprocessed_message'] = df['message'].apply(preprocess_text)

In [10]:
df.sample(10)

Unnamed: 0,label,message,preprocessed_message
5378,spam,"Free entry to the gr8prizes wkly comp 4 a chance to win the latest Nokia 8800, PSP or £250 cash every wk.TXT GREAT to 80878 http//www.gr8prizes.com 08715705022",free entry grprizes wkly comp chance win latest nokia psp £ cash every wktxt great httpwwwgrprizescom
3318,ham,"No worries, hope photo shoot went well. have a spiffing fun at workage.",worry hope photo shoot went well spiffing fun workage
1247,ham,"I do know what u mean, is the king of not havin credit! I'm goin2bed now. Night night sweet! Only1more sleep!",know u mean king havin credit im goinbed night night sweet onlymore sleep
4942,ham,Omg you can make a wedding chapel in frontierville? Why do they get all the good stuff?,omg make wedding chapel frontierville get good stuff
643,ham,Probably gonna swing by in a wee bit,probably gon na swing wee bit
3347,ham,U're welcome... Caught u using broken english again...,ure welcome caught u using broken english
2772,ham,Then ur sis how?,ur si
2958,ham,I would but I'm still cozy. And exhausted from last night.nobody went to school or work. Everything is closed.,would im still cozy exhausted last nightnobody went school work everything closed
3058,ham,Just woke up. Yeesh its late. But I didn't fall asleep til &lt;#&gt; am :/,woke yeesh late didnt fall asleep til ltgt
2769,ham,I am on the way to ur home,way ur home


In [11]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [12]:
df['label'].value_counts()

label
0    4825
1     747
Name: count, dtype: int64

In [13]:
# split the data into train/validation/test
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

In [14]:
# print percentage of each label
print("Train : ", train_df['label'].value_counts() / len(train_df))
print("Validation : ", val_df['label'].value_counts() / len(val_df))
print("Test : ", test_df['label'].value_counts() / len(test_df))

Train :  label
0    0.866547
1    0.133453
Name: count, dtype: float64
Validation :  label
0    0.863677
1    0.136323
Name: count, dtype: float64
Test :  label
0    0.866368
1    0.133632
Name: count, dtype: float64


In [17]:
# store the splits at train.csv/validation.csv/test.csv
train_df.to_csv('processed_data/train.csv', index=False, sep='\t')
val_df.to_csv('processed_data/validation.csv', index=False, sep='\t')
test_df.to_csv('processed_data/test.csv', index=False, sep='\t')
