# Email Spam Detection ML

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import Data

In [2]:
# Read the raw CSV, decoding its bytes as ISO-8859-1 (Latin-1).
df = pd.read_csv('Email_spam.csv', encoding='ISO-8859-1')
df

Unnamed: 0,v1,v2,Unnamed: 2
0,ham,"Go until jurong point, crazy.. Available only ...",
1,ham,Ok lar... Joking wif u oni...,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,
3,ham,U dun say so early hor... U c already then say...,
4,ham,"Nah I don't think he goes to usf, he lives aro...",
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,
5568,ham,Will Ì_ b going to esplanade fr home?,
5569,ham,"Pity, * was in mood for that. So...any other s...",
5570,ham,The guy did some bitching but I acted like i'd...,


# Dropping Column

In [3]:
# Remove the 'Unnamed: 2' column, keeping only the label and message columns.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to `df`,
# so without it a second run would raise KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 2'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Renaming Columns

In [4]:
# Replace the raw 'v1'/'v2' headers with descriptive column names.
df = df.rename(
    columns={
        'v1': 'Category',
        'v2': 'Message',
    }
)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Basic EDA

In [5]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# List the distinct label values present in the Category column.
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
# Count the distinct message texts (a value below the row count means duplicates exist).
df['Message'].nunique()

5169

In [8]:
# Summary statistics per column (count, unique, most frequent value and its frequency).
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


# Feature Encoding

In [9]:
# Encode the text labels as integers: ham -> 1, spam -> 0.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded column
# through the same dict would match no keys and silently turn every label into NaN.
if not pd.api.types.is_integer_dtype(df['Category']):
    df['Category'] = df['Category'].map({'ham': 1, 'spam': 0})

# Selecting Independent (X) and Dependent (y) Variables

In [10]:
# Independent variable: the raw message text.
X = df['Message']
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [11]:
# Dependent variable: the encoded label column.
y = df['Category']
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64

# Splitting data into Train and Test

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Training portion of the message text returned by train_test_split.
X_train

1978    No I'm in the same boat. Still here at my moms...
3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935       They r giving a second chance to rahul dengra.
4078       O i played smash bros  &lt;#&gt;  religiously.
4086    PRIVATE! Your 2003 Account Statement for 07973...
                              ...                        
3772    I came hostel. I m going to sleep. Plz call me...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860                   In work now. Going have in few min.
Name: Message, Length: 4457, dtype: object

In [14]:
# Held-out portion of the message text (test_size=0.2).
X_test

3245    Funny fact Nobody teaches volcanoes 2 erupt, t...
944     I sent my scores to sophas and i had to do sec...
1044    We know someone who you know that fancies you....
2484    Only if you promise your getting out as SOON a...
812     Congratulations ur awarded either å£500 of CD ...
                              ...                        
4264     &lt;DECIMAL&gt; m but its not a common car he...
2439    Rightio. 11.48 it is then. Well arent we all u...
5556    Yes i have. So that's why u texted. Pshew...mi...
4205                               Get the door, I'm here
4293    Kit Strip - you have been billed 150p. Netcoll...
Name: Message, Length: 1115, dtype: object

In [15]:
# Encoded labels for the training messages.
y_train

1978    1
3989    0
3935    1
4078    1
4086    0
       ..
3772    1
5191    1
5226    1
5390    1
860     1
Name: Category, Length: 4457, dtype: int64

In [16]:
# Encoded labels for the held-out messages.
y_test

3245    1
944     1
1044    0
2484    1
812     0
       ..
4264    1
2439    1
5556    1
4205    1
4293    0
Name: Category, Length: 1115, dtype: int64

# Feature Extraction, Logistic Regression (Modelling) & Classification Report

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Pipeline: turn the raw text into TF-IDF features, then classify with logistic regression.
clf = Pipeline(
    steps=[
        ('vectorizer_tfidf', TfidfVectorizer()),
        ('logistic_reg', LogisticRegression()),
    ]
)

# Fit on the training split (fit returns the pipeline itself) and predict the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out data.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.75      0.86       150
           1       0.96      1.00      0.98       965

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [18]:
# First five held-out messages, for a side-by-side look at labels and predictions.
X_test[:5]

3245    Funny fact Nobody teaches volcanoes 2 erupt, t...
944     I sent my scores to sophas and i had to do sec...
1044    We know someone who you know that fancies you....
2484    Only if you promise your getting out as SOON a...
812     Congratulations ur awarded either å£500 of CD ...
Name: Message, dtype: object

In [20]:
# True labels of the first five held-out messages.
y_test[:5]

3245    1
944     1
1044    0
2484    1
812     0
Name: Category, dtype: int64

# Prediction

In [21]:
# First five predictions currently stored in y_pred.
# NOTE: y_pred is reassigned by the random-forest cell further down, so in top-to-bottom
# order this shows the logistic-regression predictions.
y_pred[:5]

array([1, 1, 1, 1, 0], dtype=int64)

# Random Forest & Classification Report

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Pipeline: turn the raw text into TF-IDF features, then classify with a random forest.
# random_state is fixed (same seed as the train/test split) because the forest is
# randomised; without it the reported metrics change on every run.
clf1 = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('random_forest', RandomForestClassifier(random_state=42)),
])

# Fit with X_train and y_train.
clf1.fit(X_train, y_train)

# Predict the test split. NOTE: this overwrites the logistic-regression predictions
# previously stored in y_pred.
y_pred = clf1.predict(X_test)

# Per-class precision / recall / F1 on the held-out data.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90       150
           1       0.97      1.00      0.99       965

    accuracy                           0.98      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115



# Email Detection Based on the Random Forest Model

In [24]:
# Messages to classify; add more strings to this list to score several at once.
emails = [
    "Congratulations!You've won a $1000 Walmart gift card.Go to http://bit.ly/123456 tp claim now"
]

# Labels follow the encoding used for training: 1 = ham, 0 = spam.
prediction = clf1.predict(emails)
print(prediction)

# Report a verdict for every message, not just the first one.
for label in prediction:
    if label == 1:
        print("Ham Email")
    else:
        print("Spam Email")

# Final model: the random forest classifier, chosen because its classification report
# shows higher test accuracy (0.98 vs 0.97) and higher recall on the spam class
# (0.83 vs 0.75) than the logistic-regression pipeline.

[0]
Spam Email
