In [1]:
# import libraries

import numpy as np
import pandas as pd

In [2]:
# Loading the dataset

df = pd.read_csv("C:/Users/narai/Downloads/spam.csv",encoding='latin-1')

In [3]:
# to get a short preview of the dataset

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# To drop the unecessary columns

df.drop(columns = ['Unnamed: 2','Unnamed: 3', 'Unnamed: 4'], inplace = True)

In [5]:
# Renaming the columns to suitable names

df.rename(columns = {'v1': 'Target', 'v2': 'Message'}, inplace = True)
df.head()

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Terming 'Spam' as 1 and 'Ham' as 0 in Target column

df['Target'] = df['Target'].apply(lambda x: 0 if x == 'ham'else 1)
df['Target'].value_counts()

0    4825
1     747
Name: Target, dtype: int64

In [7]:
# to get a short preview of the dataset

df.head()

Unnamed: 0,Target,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


##  Handling Class Imbalance

In [8]:
ham_df = df[df['Target']==0]
spam_df = df[df['Target']==1]

In [9]:
# Undersampling 

ham_under_sampled = ham_df.sample(spam_df.shape[0])
df_balanced = pd.concat([ham_under_sampled, spam_df])
df_balanced.reset_index(drop = True, inplace = True)
df_balanced.isna().sum()

Target     0
Message    0
dtype: int64

In [10]:
# balanced data valuecounts

print(df_balanced['Target'].value_counts())
df_balanced.head()

1    747
0    747
Name: Target, dtype: int64


Unnamed: 0,Target,Message
0,0,LOL ... No just was busy
1,0,"K, want us to come by now?"
2,0,"Like &lt;#&gt; , same question"
3,0,I accidentally deleted the message. Resend ple...
4,0,Re your call; You didn't see my facebook huh?


In [11]:
# importing regular expression

import re 

In [12]:
txt = []
corpus = list(df_balanced['Message'])
for i in range(len(corpus)):
    r = re.sub('[^a-zA-Z]', ' ', corpus[i])
    r = r.lower()
    r = r.split()
    r = ' '.join(r)
    txt.append(r)
df_balanced['Message'] = txt
print(df_balanced['Message'][:3])

0        lol no just was busy
1    k want us to come by now
2    like lt gt same question
Name: Message, dtype: object


In [13]:
# Importing nltk libraries

from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [14]:
# word tokeniser

df_balanced['Message'] = df_balanced.apply(lambda row: word_tokenize(row['Message']), axis = 1)
print(df_balanced['Message'][:3])

0          [lol, no, just, was, busy]
1    [k, want, us, to, come, by, now]
2      [like, lt, gt, same, question]
Name: Message, dtype: object


In [15]:
# Removing Stop words

stop_words = set(stopwords.words('english'))
stop_list = []
for i in range(len(df_balanced)):
        msg = df_balanced['Message'][i]
        msg = [word for word in msg if word not in stop_words]
        stop_list.append(msg)
df_balanced['Message'] = stop_list
print(df_balanced['Message'][:3])

0                 [lol, busy]
1         [k, want, us, come]
2    [like, lt, gt, question]
Name: Message, dtype: object


In [16]:
# To remove Suffixes from words using porter stemmer 

ps = PorterStemmer()
stem_list = []
for i in range(len(df_balanced)):
    txt = df_balanced['Message'][i]
    txt = [ps.stem(word) for word in txt]
    stem_list.append(txt)
df_balanced['Message'] = stem_list
print(df_balanced['Message'][:3])

0                 [lol, busi]
1         [k, want, us, come]
2    [like, lt, gt, question]
Name: Message, dtype: object


In [17]:
corpus = []
for i in df_balanced['Message']:
    msg = ' '.join(row for row in i)
    corpus.append(msg)
df_balanced['Message'] = corpus
print(df_balanced['Message'][:3])

0               lol busi
1         k want us come
2    like lt gt question
Name: Message, dtype: object


In [18]:
# Train test split 

from sklearn.model_selection import train_test_split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['Target'], test_size = 0.3, random_state = 100)


In [20]:
# Importing Tfidvectoriser and countVectoriser from sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
# to fit_tranform using TfidfVectoriser 

tv = TfidfVectorizer()
x_train_tv = tv.fit_transform(x_train)
x_test_tv = tv.transform(x_test)

## Model Building 

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [23]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [24]:
# Logistic regression model 

lr_model = LogisticRegression()
lr_model.fit(x_train_tv, y_train)
lr_predict = lr_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, lr_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, lr_predict), 2))

Precision   0.99
Accuracy   0.96


In [25]:
# cross validation score

cv_score = cross_val_score(lr_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.94


In [26]:

nb_model = MultinomialNB()
nb_model.fit(x_train_tv, y_train)
nb_predict = nb_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, nb_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, nb_predict), 2))

Precision   0.93
Accuracy   0.95


In [27]:
cv_score = cross_val_score(nb_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.94


In [28]:
# Randomforest classifer model 

rf_model = RandomForestClassifier()
rf_model.fit(x_train_tv, y_train)
rf_predict = rf_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, rf_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, rf_predict), 2))

Precision   1.0
Accuracy   0.98


In [29]:
 # Cross validation score 
    
cv_score = cross_val_score(rf_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.95


In [30]:
# importing KNN classifer from sklearn

from sklearn.neighbors import KNeighborsClassifier

In [31]:
# KNN classifer model 

k_model = KNeighborsClassifier()
k_model.fit(x_train_tv,y_train)
k_predict = k_model.predict(x_test_tv)
print('Precision', ' ', round(metrics.precision_score(y_test, k_predict), 2))
print('Accuracy', ' ', round(metrics.accuracy_score(y_test, k_predict), 2))

Precision   0.95
Accuracy   0.93


In [32]:
# Cross validation score 

cv_score = cross_val_score(k_model, x_train_tv, y_train, scoring='accuracy', cv=10)
print('Cross Validated Accuracy:', round(cv_score.mean(),2))

Cross Validated Accuracy: 0.91
