# GMAIL_SPAM_DETECTION

By - Sakshi Verma

Mounting the google drive

In [None]:
#mounting the google drive to load the dataset
# Colab-only step: the dataset path used by later cells lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importing Libraries

In [None]:
#importing required libraries
import pandas as pd
import numpy as np
import io
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

Loading the dataset

In [None]:
#loading the gmail spam dataset
# Guess the file's text encoding from its first 100000 bytes before parsing it.
import chardet

with open('/content/drive/MyDrive/spam.csv', 'rb') as rawdata:
    leading_bytes = rawdata.read(100000)

result = chardet.detect(leading_bytes)
result

{'confidence': 0.7270322499829184, 'encoding': 'Windows-1252', 'language': ''}

In [None]:
# Load the CSV into a DataFrame.
# NOTE(review): the detection cell above reported 'Windows-1252', but the file is
# decoded here as 'ISO-8859-1' — confirm which encoding is intended.
data = pd.read_csv('/content/drive/MyDrive/spam.csv',encoding='ISO-8859-1')

Processing the dataset

In [None]:
#Looking into the dataset
# Preview the first rows to see which columns the raw file provides.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
#dropping the columns which are not required
# Columns at positions 2, 3 and 4 are removed by label; only the first two remain.
unused_columns = data.columns[2:5]
data.drop(columns=unused_columns, inplace=True)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#Renaming the columns as type and text
# v1 carries the label and v2 the message body.
column_names = {'v1':'type','v2':'text'}
data.rename(columns=column_names, inplace=True)
data.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# making spam as 1 and ham as 0
# Numeric target column derived from the textual label in 'type'.
label_codes = {'spam':1,'ham':0}
spam_flags = data['type'].map(label_codes)
data['spam'] = spam_flags.astype(int)
data.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
#looking at columns
# Print a heading followed by one column name per line.
print("Cloumns of the given data")
for column_name in data.columns:
  print(column_name)

Cloumns of the given data
type
text
spam


In [None]:
#checking on number of rows
# Report the length of the 'type' and 'text' columns. The messages now name the
# columns that are actually measured (they previously mentioned "review" and
# "liked", which do not exist in this frame).
a = len(data['type'])
print("Number of rows in type column : ",a)
a = len(data['text'])
print("Number of rows in text column : ",a)

Number of rows in review column :  5572
Number of rows in liked column :  5572


In [None]:
#Tokenization
# Show one raw message (row label 1) before it is split into tokens.
data['text'][1]

'Ok lar... Joking wif u oni...'

In [None]:
def tokenizer(text):
  """Split `text` on runs of whitespace and return the pieces as a list.

  Punctuation stays attached to the neighbouring word because only
  whitespace acts as a separator.
  """
  tokens = text.split()
  return tokens

In [None]:
# Replace every message string with its list of whitespace-separated tokens.
tokenized_text = data['text'].apply(tokenizer)
data['text'] = tokenized_text

# Same row as displayed before tokenization, now as a list.
data['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [None]:
# Re-display row 1; nothing has modified the column since the previous cell.
data['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [None]:
# English Snowball stemmer (the variable is called `porter`, but the class used
# is SnowballStemmer).
porter = SnowballStemmer("english", ignore_stopwords=False)

def stem_it(text):
  """Return a new list holding the stem of every token in `text`."""
  return list(map(porter.stem, text))

# Stem every token list in place of the unstemmed one, then inspect row 1.
stemmed_text = data['text'].apply(stem_it)
data['text'] = stemmed_text
data['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [None]:
# Lemmatization
# Show row 150 before the lemmatizer runs, for a before/after comparison.
data['text'][150]

['the', 'wine', 'is', 'flow', 'and', "i'm", 'i', 'have', 'nevering..']

In [None]:
# Fetch the WordNet corpus; the WordNetLemmatizer used in the next cell
# presumably needs it at run time.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmit_it(text):
  """Lemmatize each token in `text` with pos='a' and return the results as a list."""
  # NOTE(review): pos='a' is applied to every token regardless of its actual
  # part of speech — confirm this is intended.
  lemmas = []
  for word in text:
    lemmas.append(lemmatizer.lemmatize(word, pos='a'))
  return lemmas

lemmatized_text = data['text'].apply(lemmit_it)
data['text'] = lemmatized_text

In [None]:
# Row 150 after lemmatization, to compare with the output shown before the step.
data['text'][150]

['the', 'wine', 'is', 'flow', 'and', "i'm", 'i', 'have', 'nevering..']

In [None]:
#stop word removal
# Show row 217 before stop words are filtered out.
data['text'][217]

['easi', 'ah?sen', 'got', 'select', 'mean', 'it', 'good..']

In [None]:
# Fetch the stop-word lists read by stopwords.words('english') in the next cell.
# Relies on `nltk` having been imported by the earlier download cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stop-word list, kept as a list under its original name.
stop_word = stopwords.words('english')
# Frozen copy used for membership tests: hashing is much cheaper than scanning
# the list once per token. Later changes to `stop_word` are not reflected here.
stop_word_lookup = frozenset(stop_word)

def stop_it(text):
  """Return the tokens of `text` that are not English stop words, in order.

  `text` is an iterable of token strings; the result is a new list.
  """
  return [word for word in text if word not in stop_word_lookup]

# Filter every token list, then inspect row 150.
data['text'] = data['text'].apply(stop_it)
data['text'][150]

['wine', 'flow', "i'm", 'nevering..']

In [None]:
# Preview the frame after tokenizing, stemming, lemmatizing and stop-word removal;
# the 'text' column holds token lists at this point.
data.head()

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0


In [None]:
# Collapse each token list back into a single space-separated string, the input
# form handed to the vectorizer in the next cell.
data['text'] = data['text'].apply(lambda tokens: ' '.join(tokens))
data.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


In [None]:
# Target vector: the 0/1 spam flag as an array.
y = data['spam'].values

# Feature matrix: TF-IDF weights fitted on, and computed for, every message.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# in the next cell, so test rows contribute to what it learns. Consider fitting
# on the training rows only.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(data['text'])

In [None]:
#Splitting the dataset for training and testing
# Unshuffled split with 20% of the rows held out for testing.
# `x_text` holds the *test* features; the name is kept because the model cells
# below refer to it.
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm.
x_train, x_text, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=1,
)

In [None]:
#LOGISTIC REGRESSION
# Train a default-configured logistic regression on the training split.
clf = LogisticRegression().fit(x_train, y_train)
# Predict labels for the held-out rows.
y_pred = clf.predict(x_text)
# Accuracy as a percentage (true labels first, predictions second).
acc_log = 100 * accuracy_score(y_test, y_pred)

In [None]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Fit a 10-tree, entropy-criterion forest on the training split.
# random_state is pinned (same value as the split cell) so that the forest's
# internal randomness, and therefore the reported accuracy, is reproducible
# across runs; it was previously unseeded.
classifier = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=1)
classifier.fit(x_train, y_train)
# Predict labels for the held-out rows.
y_pred = classifier.predict(x_text)
# Accuracy as a percentage (true labels first, predictions second).
acc_Rf = accuracy_score(y_test, y_pred)*100

In [None]:
#KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
# Five nearest neighbours under the Minkowski metric with p=2.
classifier1 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(x_train, y_train)
# Predict labels for the held-out rows.
y_pred = classifier1.predict(x_text)
# Accuracy as a percentage (true labels first, predictions second).
acc_knn = 100 * accuracy_score(y_test, y_pred)

In [None]:
#NAIVE_BAYES
# `svm` is imported here as well because the SVM cell below uses it.
from sklearn import model_selection, naive_bayes, svm
# Train a multinomial Naive Bayes model on the training split.
Naive = naive_bayes.MultinomialNB().fit(x_train, y_train)
# Predict labels for the held-out rows.
predictions_NB = Naive.predict(x_text)
# Accuracy as a percentage (true labels first, predictions second).
acc_nb = 100 * accuracy_score(y_test, predictions_NB)

In [None]:
#SVM
# Train a linear-kernel support vector classifier on the training split.
# Relies on `svm` imported in the Naive Bayes cell above.
# NOTE(review): degree and gamma look irrelevant to a linear kernel — confirm
# against the SVC documentation before removing them.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(x_train, y_train)
# Predict labels for the held-out rows.
predictions_SVM = SVM.predict(x_text)
# Accuracy as a percentage (true labels first, predictions second).
acc_svm = 100 * accuracy_score(y_test, predictions_SVM)

In [None]:
#Looking into all the accuracies and comparing them
# One line per model: padded label followed by the accuracy percentage.
accuracy_report = (
  ("Logistic Accuracy Score               ", acc_log),
  ("RandomForestClassifier Accuracy Score ", acc_Rf),
  ("KNeighborsClassifier Accuracy Score   ", acc_knn),
  ("Naive Bayes Accuracy Score            ", acc_nb),
  ("SVM Accuracy Score                    ", acc_svm),
)
for model_label, model_accuracy in accuracy_report:
  print(model_label, model_accuracy)

Logistic Accuracy Score                96.05381165919282
RandomForestClassifier Accuracy Score  97.13004484304932
KNeighborsClassifier Accuracy Score    90.85201793721973
Naive Bayes Accuracy Score             96.7713004484305
SVM Accuracy Score                     97.66816143497758


SVM performed the best in the run shown above, with a test accuracy of about 97.67%.