# Compare NLP Techniques: Prep The Data For Modeling

### Read In & Clean Text

In [1]:
# Read in and clean data
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import string

stopwords = nltk.corpus.stopwords.words('english')

messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')


In [2]:
messages

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
messages = messages.drop(columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)

In [4]:
messages

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
messages.columns = ["label", "text"]

In [6]:

messages['label'] = np.where(messages['label']=='spam', 1, 0)

In [7]:



def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

messages['clean_text'] = messages['text'].apply(lambda x: clean_text(x))
messages.head()

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [8]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(messages['clean_text'],
                                                    messages['label'], test_size=0.2)

In [9]:
# What do the first ten messages in the training set look like?
X_train[:]

5484    [, picking, various, points, going, 2, yeovil,...
3804    [dude, makin, weirdy, brownies, sister, made, ...
1105                           [hen, night, going, swing]
1764    [hi, 07734396839, ibh, customer, loyalty, offe...
3964                            [love, aathilove, u, lot]
                              ...                        
1675    [painful, words, thought, happy, toughest, thi...
3557                   [da, vijay, going, talk, jaya, tv]
777     [hi, kate, lovely, see, tonight, ill, phone, t...
3735                   [hows, street, end, library, walk]
3441    [save, money, wedding, lingerie, wwwbridalpett...
Name: clean_text, Length: 4457, dtype: object

In [10]:
# What do the labels look like?
y_train[:10]

5484    0
3804    0
1105    0
1764    1
3964    0
5554    0
3160    0
3229    0
2528    0
1995    0
Name: label, dtype: int32

In [11]:
# Let's save the training and test sets to ensure we are using the same data for each model
X_train.to_csv('../../../data/X_train.csv', index=False, header=True)
X_test.to_csv('../../../data/X_test.csv', index=False, header=True)
y_train.to_csv('../../../data/y_train.csv', index=False, header=True)
y_test.to_csv('../../../data/y_test.csv', index=False, header=True)