In [1]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.stem.porter import PorterStemmer
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv("/content/spam.csv", encoding = 'latin-1')
df.drop(columns = ['Unnamed: 2','Unnamed: 3', 'Unnamed: 4'], inplace = True)
df.rename(columns = {'v1': 'Target', 'v2': 'Message'}, inplace = True)
df.head()

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df['Target'] = df['Target'].apply(lambda x: 0 if x == 'ham'else 1)
df['Target'].value_counts()

0    4825
1     747
Name: Target, dtype: int64

In [4]:
ham_df = df[df['Target']==0]
spam_df = df[df['Target']==1]
ham_downsampled = ham_df.sample(spam_df.shape[0])
df_balanced = pd.concat([ham_downsampled, spam_df])
df_balanced.reset_index(drop = True, inplace = True)
df_balanced.isna().sum()

Target     0
Message    0
dtype: int64

In [5]:
print(df_balanced['Target'].value_counts())
df_balanced.head()

0    747
1    747
Name: Target, dtype: int64


Unnamed: 0,Target,Message
0,0,A pure hearted person can have a wonderful smi...
1,0,Mmm thats better now i got a roast down me! iå...
2,0,"Call me when you/carlos is/are here, my phone'..."
3,0,Tell me whos this pls:-)
4,0,"Hey, a guy I know is breathing down my neck to..."


In [6]:
txt = []
corpus = list(df_balanced['Message'])
for i in range(len(corpus)):
    r = re.sub('[^a-zA-Z]', ' ', corpus[i])
    r = r.lower()
    r = r.split()
    r = ' '.join(r)
    txt.append(r)
df_balanced['Message'] = txt
print(df_balanced['Message'][:3])

0    a pure hearted person can have a wonderful smi...
1    mmm thats better now i got a roast down me i d...
2    call me when you carlos is are here my phone s...
Name: Message, dtype: object


In [8]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
df_balanced['Message'] = df_balanced['Message'].apply(lambda x: word_tokenize(x))
print(df_balanced['Message'][:3])

0    [a, pure, hearted, person, can, have, a, wonde...
1    [mmm, thats, better, now, i, got, a, roast, do...
2    [call, me, when, you, carlos, is, are, here, m...
Name: Message, dtype: object


In [11]:
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_list = []
for i in range(len(df_balanced)):
        msg = df_balanced['Message'][i]
        msg = [word for word in msg if word not in stop_words]
        stop_list.append(msg)
df_balanced['Message'] = stop_list
print(df_balanced['Message'][:3])

0    [pure, hearted, person, wonderful, smile, make...
1    [mmm, thats, better, got, roast, b, better, dr...
2    [call, carlos, phone, vibrate, acting, might, ...
Name: Message, dtype: object


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
ps = PorterStemmer()
stem_list = []
for i in range(len(df_balanced)):
    txt = df_balanced['Message'][i]
    txt = [ps.stem(word) for word in txt]
    stem_list.append(txt)
df_balanced['Message'] = stem_list
print(df_balanced['Message'][:3])

0    [pure, heart, person, wonder, smile, make, eve...
1    [mmm, that, better, got, roast, b, better, dri...
2    [call, carlo, phone, vibrat, act, might, hear,...
Name: Message, dtype: object


In [13]:
corpus = []
for i in df_balanced['Message']:
    msg = ' '.join(row for row in i)
    corpus.append(msg)
df_balanced['Message'] = corpus
print(df_balanced['Message'][:3])

0    pure heart person wonder smile make even enemi...
1    mmm that better got roast b better drink good ...
2          call carlo phone vibrat act might hear text
Name: Message, dtype: object


In [14]:
x_train, x_test, y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['Target'], test_size = 0.2, random_state = 42)

In [15]:
tv = TfidfVectorizer()
x_train_tv = tv.fit_transform(x_train)
x_test_tv = tv.transform(x_test)

In [16]:
nb_model = MultinomialNB()
nb_model.fit(x_train_tv, y_train)
nb_predict = nb_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, nb_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, nb_predict), 2))

Precision   0.94
Accuracy   0.95


In [17]:
cv_score = cross_val_score(nb_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.94


In [18]:
lr_model = LogisticRegression()
lr_model.fit(x_train_tv, y_train)
lr_predict = lr_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, lr_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, lr_predict), 2))

Precision   0.97
Accuracy   0.95


In [19]:
cv_score = cross_val_score(lr_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.95


In [20]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train_tv, y_train)
rf_predict = rf_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, rf_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, rf_predict), 2))

Precision   0.95
Accuracy   0.93


In [21]:
cv_score = cross_val_score(rf_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.96


In [22]:
svm_model = svm.SVC()
svm_model.fit(x_train_tv, y_train)
svm_predict = svm_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, svm_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, svm_predict), 2))

Precision   0.97
Accuracy   0.94


In [23]:
cv_score = cross_val_score(svm_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.95


In [24]:
k_model = KNeighborsClassifier()
k_model.fit(x_train_tv,y_train)
k_predict = k_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, k_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, k_predict), 2))

Precision   0.98
Accuracy   0.84


In [25]:
cv_score = cross_val_score(k_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.9
