In [51]:
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# First run only: uncomment and run the line below to download the NLTK stop-word corpus.
#nltk.download('stopwords')

In [36]:
# Read the raw SMS dataset and preview its first rows.
message_data = pd.read_csv(
    filepath_or_buffer="spam.csv",
    encoding="latin",
)
message_data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [37]:
# Discard the three unnamed columns, keeping only the label and text columns.
message_data = message_data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
)

In [38]:
# Give the two remaining columns descriptive names.
message_data = message_data.rename(
    columns={
        'v1': 'Spam/Not_Spam',
        'v2': 'message',
    },
)

In [39]:
# Summarise the messages separately for each label value.
(
    message_data
    .groupby(by='Spam/Not_Spam')
    .describe()
)

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
Spam/Not_Spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [47]:
# Work on an independent copy of the text column so message_data keeps the raw text.
message_data_copy = message_data['message'].copy(deep=True)

In [48]:
def text_preprocess(text, stop_words=None):
    """Strip ASCII punctuation and stop words from a message.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached on the function so it is
        not rebuilt for every word of every message.

    Returns
    -------
    str
        The remaining words, in their original casing, joined by single
        spaces.
    """
    if stop_words is None:
        stop_words = getattr(text_preprocess, "_english_stop_words", None)
        if stop_words is None:
            # A frozenset gives constant-time membership tests instead of a
            # linear scan of the word list for each word.
            stop_words = frozenset(stopwords.words('english'))
            text_preprocess._english_stop_words = stop_words
    # Punctuation is stripped before the stop-word check, so a contraction
    # such as "don't" is compared as "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))
    kept_words = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(kept_words)

In [49]:
# Clean every message element-wise (punctuation and stop words removed).
message_data_copy = message_data_copy.map(text_preprocess)

In [50]:
# Display the preprocessed messages to check the result of text_preprocess.
message_data_copy

0       Go jurong point crazy Available bugis n great ...
1                                 Ok lar Joking wif u oni
2       Free entry 2 wkly comp win FA Cup final tkts 2...
3                     U dun say early hor U c already say
4             Nah dont think goes usf lives around though
5       FreeMsg Hey darling 3 weeks word back Id like ...
6          Even brother like speak treat like aids patent
7       per request Melle Melle Oru Minnaminunginte Nu...
8       WINNER valued network customer selected receiv...
9       mobile 11 months U R entitled Update latest co...
10      Im gonna home soon dont want talk stuff anymor...
11      SIX chances win CASH 100 20000 pounds txt CSH1...
12      URGENT 1 week FREE membership å£100000 Prize J...
13      Ive searching right words thank breather promi...
14                                            DATE SUNDAY
15      XXXMobileMovieClub use credit click WAP link n...
16                                        Oh kim watching
17      Eh u r

In [57]:
# Do not pass "english" positionally: TfidfVectorizer's first parameter is
# `input`, not `stop_words`, and recent scikit-learn releases reject that call.
# Stop words have already been removed by text_preprocess, so the default
# configuration is used.
vectorizer = TfidfVectorizer()

In [58]:
# Learn the vocabulary from the preprocessed messages and build the TF-IDF
# document-term matrix (one row per message).
message_mat = vectorizer.fit_transform(message_data_copy)
# Show the matrix summary (shape, dtype, storage format).
message_mat

<5572x9376 sparse matrix of type '<class 'numpy.float64'>'
	with 47254 stored elements in Compressed Sparse Row format>

In [60]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
(
    message_train,
    message_test,
    spam_nospam_train,
    spam_nospam_test,
) = train_test_split(
    message_mat,
    message_data['Spam/Not_Spam'],
    test_size=0.3,
    random_state=20,
)

In [62]:
# Fit an L1-penalised logistic regression on the training split and report
# accuracy on the held-out split. LogisticRegression and accuracy_score are
# imported in the notebook's top import cell.
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(message_train, spam_nospam_train)
pred = Spam_model.predict(message_test)
accuracy_score(spam_nospam_test, pred)

0.9383971291866029

Let's try stemming the words and then adding the length of each message as an extra feature.

In [63]:
def stemmer(text, stem_word=None):
    """Stem every whitespace-separated word of ``text``.

    Parameters
    ----------
    text : str
        Text to stem; it is split on whitespace.
    stem_word : callable, optional
        Function mapping a single word to its stem. When omitted, the
        ``stem`` method of an English ``SnowballStemmer`` is used.

    Returns
    -------
    str
        The stemmed words joined by single spaces, with no trailing space.
        Empty or whitespace-only input yields an empty string.
    """
    if stem_word is None:
        # Build the stemmer once per call rather than once per word.
        stem_word = SnowballStemmer("english").stem
    return " ".join(stem_word(word) for word in text.split())

In [64]:
# Stem every word of the preprocessed messages, then rebuild the TF-IDF matrix
# from the stemmed text.
message_data_copy = message_data_copy.apply(stemmer)
# Do not pass "english" positionally: TfidfVectorizer's first parameter is
# `input`, not `stop_words`, and recent scikit-learn releases reject that call.
# Stop words have already been removed by text_preprocess.
vectorizer = TfidfVectorizer()
message_mat = vectorizer.fit_transform(message_data_copy)

In [65]:
# Same 70/30 split and seed as before, now over the stemmed TF-IDF matrix.
(
    message_train,
    message_test,
    spam_nospam_train,
    spam_nospam_test,
) = train_test_split(
    message_mat,
    message_data['Spam/Not_Spam'],
    test_size=0.3,
    random_state=20,
)

In [66]:
# Refit the same L1-penalised logistic regression on the stemmed features and
# report held-out accuracy. LogisticRegression and accuracy_score are imported
# in the notebook's top import cell, so they are not re-imported here.
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(message_train, spam_nospam_train)
pred = Spam_model.predict(message_test)
accuracy_score(spam_nospam_test, pred)

0.9461722488038278

The accuracy score improved. Next, let's add the length of each message as an extra feature.

In [68]:
# Record the character count of each original (unprocessed) message.
message_data['length'] = message_data['message'].map(len)
message_data.head(n=5)

Unnamed: 0,Spam/Not_Spam,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [69]:
# Series.as_matrix() no longer exists in current pandas; to_numpy() is the
# supported way to get the underlying array.
length = message_data['length'].to_numpy()
# Append the message length as one extra feature column. Stacking sparsely
# avoids materialising the whole TF-IDF matrix as a dense np.matrix, which is
# memory-hungry and not accepted by recent scikit-learn estimators.
new_mat = sparse.hstack(
    [message_mat, sparse.csr_matrix(length[:, None])],
    format='csr',
)

In [70]:
# Same 70/30 split and seed, now over the matrix that includes the length column.
(
    message_train,
    message_test,
    spam_nospam_train,
    spam_nospam_test,
) = train_test_split(
    new_mat,
    message_data['Spam/Not_Spam'],
    test_size=0.3,
    random_state=20,
)

In [71]:
# Refit the same L1-penalised logistic regression on the features that include
# message length and report held-out accuracy. LogisticRegression and
# accuracy_score are imported in the notebook's top import cell, so they are
# not re-imported here.
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(message_train, spam_nospam_train)
pred = Spam_model.predict(message_test)
accuracy_score(spam_nospam_test, pred)

0.9467703349282297