<a href="https://colab.research.google.com/github/Saikiran-git/Spam-Email-Detection/blob/main/Email_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine Learning algorithm for Email Spam Detection**

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv("spam.csv")

## **Data Cleaning**

In [3]:
dataset.sample(5)

Unnamed: 0,Category,Message
4387,ham,", im .. On the snowboarding trip. I was wonder..."
2995,ham,"No idea, I guess we'll work that out an hour a..."
3651,ham,"We are hoping to get away by 7, from Langport...."
2776,ham,We confirm eating at esplanade?
1466,spam,YOU 07801543489 are guaranteed the latests Nok...


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
dataset.shape

(5572, 2)

In [6]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()

In [7]:
dataset['Category'] = labelEncoder.fit_transform(dataset['Category'])
dataset.sample(10)

Unnamed: 0,Category,Message
1345,0,Were somewhere on Fredericksburg
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5560,0,Anything lor. Juz both of us lor.
990,0,Ugh. Gotta drive back to sd from la. My butt i...
4993,0,I'm hungry buy smth home...
1898,0,"Aight, sounds good. When do you want me to com..."
1055,0,HIYA COMIN 2 BRISTOL 1 ST WEEK IN APRIL. LES G...
2410,0,Aww that's the first time u said u missed me w...
405,0,"Yep, the great loxahatchee xmas tree burning o..."
1749,0,Feel Yourself That You Are Always Happy.. Slow...


In [8]:
dataset.duplicated().sum()

415

In [9]:
dataset = dataset.drop_duplicates()

In [10]:
dataset.duplicated().sum()

0

In [11]:
print(dataset['Category'].value_counts())
print("\nValues in percentage:")
print(dataset['Category'].value_counts(normalize=True)*100)

0    4516
1     641
Name: Category, dtype: int64

Values in percentage:
0    87.570293
1    12.429707
Name: Category, dtype: float64


## **Data Preprocessing**

In [12]:
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
porterStemmer = PorterStemmer()
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def preprocessing(line):
  line = line.lower()
  line = nltk.word_tokenize(line)
  temp = []
  for iter in line:
    if iter.isalnum():
      temp.append(iter)
  line = temp[:]
  temp.clear()
  for iter in line:
    if iter not in stopwords.words('english') and iter not in string.punctuation:
      temp.append(iter)
  line = temp[:]
  temp.clear()
  for iter in line:
    temp.append(porterStemmer.stem(iter))

  return str(" ").join(temp)

In [14]:
dataset['Processed_Message'] = dataset['Message'].apply(preprocessing)

In [15]:
dataset.sample(10)

Unnamed: 0,Category,Message,Processed_Message
3199,0,7 lor... Change 2 suntec... Wat time u coming?,7 lor chang 2 suntec wat time u come
4940,0,Tomarrow i want to got to court. At &lt;DECIM...,tomarrow want got court lt decim gt come bu st...
74,0,U can call me now...,u call
2218,0,* Will have two more cartons off u and is very...,two carton u pleas shelv
1637,0,"No shit, but I wasn't that surprised, so I wen...",shit surpris went spent even french guy met to...
2446,0,The guy (kadeem) hasn't been selling since the...,guy kadeem sell sinc break know one guy parano...
4408,0,"Awesome, plan to get here any time after like ...",awesom plan get time like lt gt text detail we...
2485,0,Only if you promise your getting out as SOON a...,promis get soon text morn let know made ok
3643,0,My house here e sky quite dark liao... If rain...,hous e sky quit dark liao rain got excus 2 run...
18,0,Fine if thats the way u feel. Thats the way ...,fine way u feel way gota b


In [16]:
from collections import Counter

In [17]:
spam_corpus = []
for msg in dataset[dataset['Category'] == 1]['Processed_Message'].tolist():
    for word in msg.split():
        spam_corpus.append(word)

In [18]:
len(spam_corpus)

9726

In [19]:
pd.DataFrame(Counter(spam_corpus).most_common(30))

Unnamed: 0,0,1
0,call,311
1,free,184
2,2,154
3,txt,139
4,text,122
5,ur,119
6,u,115
7,mobil,110
8,stop,108
9,repli,103


In [20]:
ham_corpus = []
for msg in dataset[dataset['Category'] == 0]['Processed_Message'].tolist():
    for word in msg.split():
        ham_corpus.append(word)

In [21]:
pd.DataFrame(Counter(ham_corpus).most_common(30))

Unnamed: 0,0,1
0,u,885
1,go,404
2,get,351
3,2,288
4,gt,288
5,lt,287
6,come,275
7,got,239
8,like,236
9,know,235


## **Model**

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [23]:
X = vectorizer.fit_transform(dataset['Processed_Message']).toarray()
X.shape

(5157, 6702)

In [24]:
  y = dataset['Category'].values

In [25]:
print(X.shape)
print(y.shape)
print(X)
print(y)

(5157, 6702)
(5157,)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0 0 1 ... 0 0 0]


In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [43]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import time

In [44]:
gaussianNB = GaussianNB()
multinomialNB = MultinomialNB()
bernoulliNB = BernoulliNB()
knc = KNeighborsClassifier()
svc = SVC(kernel='sigmoid', gamma=1.0)

In [45]:
accuracy_scores = []
precision_scores = []
time_taken = []
classifiers = ['Gaussian NB', 'Multinomial NB', 'Bernoulli NB', 'SVC', 'KNC']

In [46]:
def train_test_classifier(classification):
  start = time.process_time()
  classification.fit(X_train, y_train)
  y_pred = classification.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  confusion = confusion_matrix(y_test, y_pred)
  time_taken.append(time.process_time() - start)
  accuracy_scores.append(accuracy)
  precision_scores.append(precision)
  return accuracy, precision, confusion

In [47]:
train_test_classifier(gaussianNB)

(0.8701550387596899, 0.5061728395061729, array([[775, 120],
        [ 14, 123]]))

In [48]:
train_test_classifier(multinomialNB)

(0.9544573643410853, 1.0, array([[895,   0],
        [ 47,  90]]))

In [49]:
train_test_classifier(bernoulliNB)

(0.9689922480620154, 0.9411764705882353, array([[888,   7],
        [ 25, 112]]))

In [50]:
train_test_classifier(svc)

(0.9709302325581395, 0.9908256880733946, array([[894,   1],
        [ 29, 108]]))

In [51]:
train_test_classifier(knc)

(0.9031007751937985, 1.0, array([[895,   0],
        [100,  37]]))

In [53]:
pd.DataFrame({'Classifier':classifiers, 'Accuracy':accuracy_scores, 'Precision':precision_scores, 'Process Time':time_taken}).sort_values(by=['Precision','Accuracy', 'Process Time'], ascending=False)

Unnamed: 0,Classifier,Accuracy,Precision,Process Time
1,Multinomial NB,0.954457,1.0,0.195075
4,KNC,0.903101,1.0,70.77678
3,SVC,0.97093,0.990826,50.093299
2,Bernoulli NB,0.968992,0.941176,0.459153
0,Gaussian NB,0.870155,0.506173,0.415425
