## Name : Pratik Sontakke
## College : Ajeenkya DY Patil University
## Task 4 : Email Spam Detection

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Load the data
# Reads spam.csv relative to the working directory; the encoding is passed
# explicitly as ISO-8859-1 rather than relying on the read_csv default.
ps=  pd.read_csv('spam.csv', encoding = "ISO-8859-1")
# Bare last expression: renders the loaded frame as the cell output.
ps

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Keep only the label (v1) and text (v2) columns, then give them readable names.
ps1 = ps[['v1', 'v2']]
ems = ps1.rename(columns=dict(v1='Output', v2='Mail'))
ems

Unnamed: 0,Output,Mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
#basic check
# Prints the index range, per-column non-null counts, dtypes and memory usage.
ems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Output  5572 non-null   object
 1   Mail    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Summary of both object columns: count, number of unique values, most
# frequent value (top) and how often it occurs (freq).
ems.describe()

Unnamed: 0,Output,Mail
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
#preprocessing
# Count the non-null entries in each column.
print(ems.notnull().sum()) 

Output    5572
Mail      5572
dtype: int64


In [7]:
# Number of rows that repeat an earlier row (first occurrences are not counted).
ems.duplicated().sum()

403

In [8]:
# Drop repeated rows, keeping the first occurrence of each. The result is
# rebound to `ems` instead of mutating the frame in place, so the cell is an
# explicit assignment and is safe to re-run. The original index labels are kept.
ems = ems.drop_duplicates()

In [9]:
#new shape
# (rows, columns) of the frame after duplicate rows were dropped.
ems.shape

(5169, 2)

In [10]:
# Count the missing values in each column.
ems.isnull().sum()

Output    0
Mail      0
dtype: int64

In [11]:
# Fetch the NLTK stop-word corpus that process_text reads through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per word.
_STOPWORD_CACHE = {}

# Translation table that deletes every character found in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _load_stopwords(language='english'):
    """Return NLTK's stop words for `language` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def process_text(Mail, stop_words=None):
    """Remove punctuation and stop words from a message and return its tokens.

    Parameters
    ----------
    Mail : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached.

    Returns
    -------
    list of str
        The whitespace-separated tokens of `Mail` with punctuation removed,
        in their original order and case, excluding every token whose
        lower-case form is a stop word.
    """
    if stop_words is None:
        stop_words = _load_stopwords('english')

    # remove punctuation
    nopun = Mail.translate(_PUNCTUATION_TABLE)

    #clean words: the comparison is case-insensitive, the kept token is not lowered
    return [word for word in nopun.split() if word.lower() not in stop_words]

In [13]:
# Sanity check: tokenise the first five messages with process_text.
ems['Mail'].head().apply(process_text)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: Mail, dtype: object

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Build a bag-of-words count matrix, using process_text as the analyzer so
# each message is tokenised by that function.
# NOTE(review): the vocabulary is fitted on every message before the
# train/test split, so words that occur only in test rows still become
# features; consider fitting on the training messages only.
msg_bag= CountVectorizer(analyzer=process_text).fit_transform(ems['Mail'])

In [15]:
# split data
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state is fixed so the split is
# the same on every run. No `stratify` argument is passed.
xtrain , xtest , ytrain , ytest = train_test_split(msg_bag,ems['Output'],test_size=0.30,random_state=0)

In [16]:
# (number of messages, vocabulary size) of the count matrix.
msg_bag.shape

(5169, 11304)

## Naive Bayes Classifier

In [17]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model (default hyper-parameters) on the
# training counts and labels; fit() returns the fitted estimator.
classified=MultinomialNB().fit(xtrain,ytrain)

In [18]:
#prediction
# Predicted labels for the training rows ...
print(classified.predict(xtrain))

# ... followed by the true training labels for an eyeball comparison.
print(ytrain.values)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [19]:
# Evaluation on the training split (rows the model was fitted on)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
pred = classified.predict(xtrain)
train_report = classification_report(ytrain, pred)
print(train_report)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3174
        spam       0.99      0.98      0.98       444

    accuracy                           1.00      3618
   macro avg       0.99      0.99      0.99      3618
weighted avg       1.00      1.00      1.00      3618



In [20]:
# Confusion matrix for the current `pred` against the training labels
# (rows: true class, columns: predicted class).
print('Confusion Matrix:\n', confusion_matrix(ytrain,pred))

Confusion Matrix:
 [[3168    6]
 [   9  435]]


In [21]:
# Fraction of training rows whose predicted label matches the true label.
print('Accuracy:\n',accuracy_score(ytrain,pred))

Accuracy:
 0.9958540630182421


In [22]:
# Predicted labels for the held-out test rows ...
print(classified.predict(xtest))

# ... followed by the true test labels for an eyeball comparison.
print(ytest.values)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'spam']
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [23]:
# Evaluation on the held-out test split; `pred` now holds test predictions.
pred = classified.predict(xtest)
test_report = classification_report(ytest, pred)
print(test_report)

              precision    recall  f1-score   support

         ham       0.99      0.96      0.97      1342
        spam       0.79      0.92      0.85       209

    accuracy                           0.96      1551
   macro avg       0.89      0.94      0.91      1551
weighted avg       0.96      0.96      0.96      1551



In [24]:
# Confusion matrix for the current `pred` against the test labels
# (rows: true class, columns: predicted class).
print('Confusion Matrix:\n', confusion_matrix(ytest,pred))

Confusion Matrix:
 [[1291   51]
 [  17  192]]


In [25]:
# Fraction of test rows whose predicted label matches the true label.
print('Accuracy:\n',accuracy_score(ytest,pred))

Accuracy:
 0.9561573178594455
