# Email Spam Detection with Machine Learning

__Importing libraries__

In [2]:
import numpy as np
import pandas as pd

__Loading Dataset__

In [7]:
df = pd.read_csv("spam.csv", encoding='Latin-1')

In [9]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [10]:
df.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [13]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [18]:
df.value_counts

<bound method DataFrame.value_counts of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN   

In [19]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

__Replacing Null values__

In [20]:
df = df.where((pd.notnull(df)), '')

In [21]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [22]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [23]:
df = df.rename(columns={'v1' : 'Category', 'v2' : 'Messages'})

In [24]:
df.head()

Unnamed: 0,Category,Messages,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


__Label Encoding__

In [26]:
from sklearn.preprocessing import LabelEncoder

In [28]:
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

__Spltting Data__

In [29]:
x = df['Messages']
y = df['Category']

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=50, test_size=0.2)

__Feature Extraction__

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
features_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
x_train_features = features_extraction.fit_transform(x_train)
x_test_features = features_extraction.transform(x_test)

In [38]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

__Model Training__

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
lo = LogisticRegression()
lo.fit(x_train_features,y_train)

__Evaluating__

In [42]:
from sklearn.metrics import accuracy_score

In [43]:
prediction = lo.predict(x_train_features)
accuracy = accuracy_score(y_train,prediction)

In [44]:
accuracy

0.9690374691496523

__Predicting New Value__

In [49]:
mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take help for granted and will fulfill my promise. "]

In [51]:
features = features_extraction.transform(mail)

In [54]:
predictions = lo.predict(features)
print(predictions)

if (predictions[0]==1):
    print('Not Spam')
else:
    print('Spam')

[0]
Spam
