In [25]:
import pandas as pd
import numpy as np
import nltk

In [26]:
df = pd.read_csv("spam.csv", encoding="latin-1")
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
# Features and Labels
df['label'] = df['class'].map({'ham': 0, 'spam': 1})


In [27]:
#Lower casing
df['message'] = df['message'].str.lower()

In [29]:
df.head()

Unnamed: 0,class,message,label
0,ham,"go until jurong point, crazy.. available only ...",0
1,ham,ok lar... joking wif u oni...,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,u dun say so early hor... u c already then say...,0
4,ham,"nah i don't think he goes to usf, he lives aro...",0


In [30]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

#nltk.download()

#Tokenize the words
lines = df.message
tokenize = []
for word in lines:
    tokenize.append(word_tokenize(word))

In [33]:
df.insert(1,"tokenized",tokenize)
df.head()

Unnamed: 0,class,tokenized,message,label
0,ham,"[go, until, jurong, point, ,, crazy.., availab...","go until jurong point, crazy.. available only ...",0
1,ham,"[ok, lar, ..., joking, wif, u, oni, ...]",ok lar... joking wif u oni...,0
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,"[u, dun, say, so, early, hor, ..., u, c, alrea...",u dun say so early hor... u c already then say...,0
4,ham,"[nah, i, do, n't, think, he, goes, to, usf, ,,...","nah i don't think he goes to usf, he lives aro...",0


In [8]:
#removed '.' and ',' completely
clean = [[word for word in lines if word != ',' if word !='.'] for lines in df['tokenized']]
df.insert(2,"clean",clean)

In [10]:
#Stem the words
porter_stemmer = PorterStemmer()
#Stem the words
stemmed_words = [[porter_stemmer.stem(word = word) for word in lines] for lines in df['clean']]
df.insert(3,"stemmed",stemmed_words)

In [12]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
lemmatized = [[lemmatizer.lemmatize(word=word,pos='v') for word in lines] for lines in df['stemmed']]
df.insert(4,"lemmatized",lemmatized)

In [17]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

In [18]:
stop_remove = [[lemmatizer.lemmatize(word=word,pos='v') for word in lines if word not in stop_words] for lines in df['stemmed']]

In [19]:
df.insert(5,"rem_stop",stop_remove)

In [21]:
df['rem_stop'] = df['rem_stop'].apply(', '.join)


In [23]:
X = df['rem_stop']
y = df['label']
X.head()

0    go, jurong, point, crazy.., avail, onli, bugi,...
1                 ok, lar, ..., joke, wif, u, oni, ...
2    free, entri, 2, wkli, comp, win, fa, cup, fina...
3    u, dun, say, earli, hor, ..., u, c, alreadi, s...
4      nah, n't, think, goe, usf, live, around, though
Name: rem_stop, dtype: object