In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('emails.csv')

In [3]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
df['spam'].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df['spam'].value_counts()

0    4327
1    1368
Name: spam, dtype: int64

In [7]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [8]:
df.isnull().sum()

text    0
spam    0
dtype: int64

# separate x and y

In [9]:
x = df.text.values
y = df.spam.values

# Split dataset

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
xtrain,xtest,ytrain, ytest=train_test_split(x,y,test_size=0.2)

# Data Preprocessing

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
cv = CountVectorizer()
x_train = cv.fit_transform(xtrain)

In [14]:
x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# ML algorithm

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
model = MultinomialNB()
model.fit(x_train,ytrain)

In [18]:
x_test = cv.transform(xtest)

In [19]:
x_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
model.score(x_test,ytest)

0.9912203687445127

In [21]:
emails = ['hey i am lokking for machine learning tutorial in bengali language', 'hey you win an iphone x giveaway for free please do the survey']

In [22]:
cv_emails = cv.transform(emails)

In [23]:
model.predict(cv_emails)

array([0, 1], dtype=int64)


# Preformance

In [24]:
pred = model.predict(x_test)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
accuracy_score(ytest,pred)

0.9912203687445127

In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
confusion_matrix(ytest,pred)

array([[858,   7],
       [  3, 271]], dtype=int64)

In [29]:
from sklearn.metrics import classification_report

In [31]:
print(classification_report(ytest,pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       865
           1       0.97      0.99      0.98       274

    accuracy                           0.99      1139
   macro avg       0.99      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139

