**Nithin Rosarieo**

**Oasis Internship**

**Project : EMail Spam Detection**

**Importing Libraries**

In [266]:
import numpy as np
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

The dataset contains characters that are not encoded in UTF-8, which is the
 default encoding pandas assumes when reading the CSV files. So specifying a different encoding when reading the CSV file

In [267]:
try:
    data = pd.read_csv('spam.csv', encoding='utf-8')
except UnicodeDecodeError:
    try:
        data = pd.read_csv('spam.csv', encoding='latin1')
    except UnicodeDecodeError:
        data = pd.read_csv('spam.csv', encoding='ISO-8859-1')

data.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [268]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [269]:
mail_data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis = 1)
mail_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Spam mail - 1

Ham mail (not spam) - 0

In [270]:
 mail_data.columns = ['Spam','Mail']
 mail_data.head()

Unnamed: 0,Spam,Mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [271]:
mail_data.isnull().sum()

Spam    0
Mail    0
dtype: int64

**Label Encoding**

In [272]:
le=LabelEncoder()
mail_data['Spam']=le.fit_transform(mail_data['Spam'])
mail_data.head()

Unnamed: 0,Spam,Mail
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [273]:
print(mail_data)

      Spam                                               Mail
0        0  Go until jurong point, crazy.. Available only ...
1        0                      Ok lar... Joking wif u oni...
2        1  Free entry in 2 a wkly comp to win FA Cup fina...
3        0  U dun say so early hor... U c already then say...
4        0  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567     1  This is the 2nd time we have tried 2 contact u...
5568     0              Will Ì_ b going to esplanade fr home?
5569     0  Pity, * was in mood for that. So...any other s...
5570     0  The guy did some bitching but I acted like i'd...
5571     0                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [275]:
mail_data.shape

(5572, 2)

**Preprocessing Data**

In [276]:
mail_data['Mail'] = mail_data['Mail'].str.lower()
mail_data['Mail'] = mail_data['Mail'].fillna('').apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
mail_data['Mail'] = mail_data['Mail'].apply(lambda x: re.sub(r'\s+', ' ', x))
mail_data.head()

Unnamed: 0,Spam,Mail
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [277]:
stopwords = stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [278]:
mail_data['Mail'] = mail_data['Mail'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
mail_data.head()

Unnamed: 0,Spam,Mail
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though


**Features and Targets**

In [279]:
x = mail_data['Mail']
y = mail_data['Spam']

In [280]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [281]:
X_train

1642                             sleeping nt feeling well
2899                 come aftr ltdecimalgt cleaning house
480                                      almost see u sec
3485                                yeah probably earlier
157     hello love get interview today happy good boy ...
                              ...                        
905                    hey whats charles sorry late reply
5192       oh oh den muz change plan liao go back yan jiu
3980                        huh cant thk oredi many pages
235                         printed oh ltgt come upstairs
5157                                         k k sms chat
Name: Mail, Length: 4457, dtype: object

**Logistic Regression**

In [282]:
pipeline = Pipeline([
    ('countvect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression())])

pipeline.fit(X_train, Y_train)
y_pred = pipeline.predict(X_test)
accuracy = pipeline.score(X_test, Y_test)
print('Accuracy:', accuracy * 100)
print(classification_report(Y_test, y_pred))

Accuracy: 96.8609865470852
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       976
           1       0.97      0.77      0.86       139

    accuracy                           0.97      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



**Decision Tree Classifier**

In [283]:
pipeline = Pipeline([
    ('countvect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', DecisionTreeClassifier())])

pipeline.fit(X_train, Y_train)
y_pred = pipeline.predict(X_test)
accuracy = pipeline.score(X_test, Y_test)
print('Accuracy:', accuracy * 100)
print(classification_report(Y_test, y_pred))

Accuracy: 95.87443946188341
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       976
           1       0.84      0.82      0.83       139

    accuracy                           0.96      1115
   macro avg       0.91      0.90      0.90      1115
weighted avg       0.96      0.96      0.96      1115



**Random Forest Classifier**

In [284]:
pipeline = Pipeline([
    ('countvect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())])

pipeline.fit(X_train, Y_train)
y_pred = pipeline.predict(X_test)
accuracy = pipeline.score(X_test, Y_test)
print('Accuracy:', accuracy * 100)
print(classification_report(Y_test, y_pred))

Accuracy: 98.20627802690582
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       976
           1       1.00      0.86      0.92       139

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



**Performance of models**

In [288]:
import plotly.express as px

models = ['LR', 'DTR', 'RFR']
accuracies = [0.968, 0.958, 0.982]
df = {'Model': models, 'Accuracy': accuracies}
df = pd.DataFrame(df)
fig = px.bar(df, x='Model', y='Accuracy', color='Model', title='Accuracy of Models',template='plotly_dark')
fig.update_layout(width=1000, height=500)
fig.show()

**Samples**

In [285]:
out = pd.DataFrame({"Actual output" :  Y_test, "Predicted output" : y_pred})
Result = mail_data.merge(out, left_index =True, right_index = True)
Result[['Mail','Actual output','Predicted output']].sample(20)

Unnamed: 0,Mail,Actual output,Predicted output
575,1000 cash 2000 prize claim call09050000327,1,1
453,ok tell stay yeah tough optimistic things impr...,0,0
3705,reading gud habit nan bari hudgi yorge pataist...,0,0
4008,ha must walk everywhere cannot take tram cousi...,0,0
3529,tyler getting 8th leave long 9 get like hour,0,0
1296,sure driving reach destination soon,0,0
4530,wish things different wonder able show much va...,0,0
3343,oh great ill disturb talk,0,0
2457,kkhow sister kids,0,0
4081,check rooms befor activities,0,0
