### Read In & Clean Text

In [1]:
# Read in and clean data
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import string

stopwords = nltk.corpus.stopwords.words('english')

messages = pd.read_csv('data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages['label'] = np.where(messages['label']=='spam', 1, 0)

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

messages['clean_text'] = messages['text'].apply(lambda x: clean_text(x))
messages.head()
messages_clean = messages.drop(['text'], axis=1)
messages_clean.head()

Unnamed: 0,label,clean_text
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, dont, think, goes, usf, lives, around, t..."


In [2]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(messages['clean_text'],
                                                    messages['label'], test_size=0.2)
print(X_train)
print(y_train)

2053    [oh, thkin, goin, yogasana, 10, den, nd, go, 3...
3308    [okie, ì, wan, meet, bishan, cos, bishan, im, ...
1380                              [dnt, wnt, tlk, wid, u]
1654    [wishing, family, merry, x, mas, happy, new, y...
4206                  [lets, use, next, week, princess, ]
                              ...                        
5482    [urgent, trying, contact, last, weekends, draw...
1519           [check, wid, corect, speling, ie, sarcasm]
70                    [wah, lucky, man, save, money, hee]
1837                                      [hows, husband]
611     [valentine, game, send, dis, msg, ur, friends,...
Name: clean_text, Length: 4457, dtype: object
2053    0
3308    0
1380    0
1654    0
4206    0
       ..
5482    1
1519    0
70      0
1837    0
611     0
Name: label, Length: 4457, dtype: int64


In [3]:
# Let's save the training and test sets to ensure we are using the same data for each model
X_train.to_csv('data/X_train.csv', index=False, header=True)
X_test.to_csv('data/X_test.csv', index=False, header=True)
y_train.to_csv('data/y_train.csv', index=False, header=True)
y_test.to_csv('data/y_test.csv', index=False, header=True)