In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the pipe-delimited email file into a DataFrame.
df = pd.read_csv("email.csv", delimiter="|")

In [3]:
# Peek at the first five raw rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Email Subject,Email Body,Label,Unnamed: 4
0,,------------------------------------------------,----------------------------------------------...,----------,
1,,Urgent: Claim Your Prize Now!,Congratulations! You've won a million dollars...,Spam,
2,,Invitation to the Annual Conference,"Dear valued member, You're invited to attend ...",Not Spam,
3,,Exclusive Offer: 50% Off Today Only,Limited time offer! Get 50% off on selected i...,Spam,
4,,Reminder: Your Appointment Tomorrow,Just a friendly reminder that you have an app...,Not Spam,


In [4]:
# Summarise column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 5 columns):
 #   Column                                                                             Non-Null Count  Dtype  
---  ------                                                                             --------------  -----  
 0   Unnamed: 0                                                                         0 non-null      float64
 1    Email Subject                                                                     22 non-null     object 
 2    Email Body                                                                        22 non-null     object 
 3    Label                                                                             22 non-null     object 
 4   Unnamed: 4                                                                         0 non-null      float64
dtypes: float64(2), object(3)
memory usage: 1012.0+ bytes


In [5]:
# Replace the five raw headers with short placeholder names A..E.
df.columns = list("ABCDE")

In [6]:
# Confirm the placeholder column names took effect.
df.head(n=5)

Unnamed: 0,A,B,C,D,E
0,,------------------------------------------------,----------------------------------------------...,----------,
1,,Urgent: Claim Your Prize Now!,Congratulations! You've won a million dollars...,Spam,
2,,Invitation to the Annual Conference,"Dear valued member, You're invited to attend ...",Not Spam,
3,,Exclusive Offer: 50% Off Today Only,Limited time offer! Get 50% off on selected i...,Spam,
4,,Reminder: Your Appointment Tomorrow,Just a friendly reminder that you have an app...,Not Spam,


In [7]:
# Discard columns A, B and E in place; only C and D are used from here on.
df.drop(columns=["A", "B", "E"], inplace=True)

In [8]:
# Check that only columns C and D remain.
df.head(n=5)

Unnamed: 0,C,D
0,----------------------------------------------...,----------
1,Congratulations! You've won a million dollars...,Spam
2,"Dear valued member, You're invited to attend ...",Not Spam
3,Limited time offer! Get 50% off on selected i...,Spam
4,Just a friendly reminder that you have an app...,Not Spam


In [9]:
# Re-check dtypes and non-null counts after dropping columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   C       22 non-null     object
 1   D       22 non-null     object
dtypes: object(2)
memory usage: 484.0+ bytes


In [10]:
# Inspect the last five rows as well.
df.tail(n=5)

Unnamed: 0,C,D
17,Your account needs verification. Click here t...,Spam
18,Help us celebrate Jane's birthday this weeken...,Not Spam
19,You've been selected as our grand prize winne...,Spam
20,Enjoy exclusive discounts at our sale event t...,Not Spam
21,Your account security may be compromised. Tak...,Not Spam


In [11]:
# Clean the text column: cast every value to str and trim leading/trailing
# whitespace. The vectorised .str accessor replaces the earlier Python loop
# over df["C"][i], which looked rows up by index label and therefore only
# worked while the index was exactly 0..n-1.
# Assigning the result back to df["C"] is what stores the cleaned values.
df["C"] = df["C"].astype(str).str.strip()

In [12]:
# Clean the label column the same way: cast to str and trim surrounding
# whitespace. Vectorised, so it does not depend on a 0..n-1 index the way
# the earlier df["D"][i] loop did.
df["D"] = df["D"].astype(str).str.strip()

In [13]:
# Give the two remaining columns descriptive names.
df.columns = [
    "Email",
    "Label",
]

In [14]:
# Remove the row labelled 0 in place (the earlier previews show it holds
# only dashes, not an email).
df.drop(index=0, inplace=True)

In [15]:
# Renumber the rows 0..n-1 after the row removal, discarding the old labels.
df.index = pd.RangeIndex(len(df))

In [16]:
# Character count of the first email after stripping.
len(df.loc[0, "Email"])

68

In [17]:
# Character count of the first label after stripping.
len(df.loc[0, "Label"])

4

In [18]:
# Display the cleaned frame (a bare last expression renders it).
df

Unnamed: 0,Email,Label
0,Congratulations! You've won a million dollars!...,Spam
1,"Dear valued member, You're invited to attend o...",Not Spam
2,Limited time offer! Get 50% off on selected it...,Spam
3,Just a friendly reminder that you have an appo...,Not Spam
4,Make thousands of dollars from home with our r...,Spam
5,"Join us for a delightful evening of food, musi...",Not Spam
6,Your account information needs to be updated. ...,Not Spam
7,You're our lucky winner! Click here to claim y...,Spam
8,This is your last chance to avail of our speci...,Spam
9,Help us make a difference. Join our charity ev...,Not Spam


In [19]:
# Features are the email texts; the target is the Spam / Not Spam label.
X, y = df["Email"], df["Label"]

In [20]:
# Display the feature Series (email texts).
X

0     Congratulations! You've won a million dollars!...
1     Dear valued member, You're invited to attend o...
2     Limited time offer! Get 50% off on selected it...
3     Just a friendly reminder that you have an appo...
4     Make thousands of dollars from home with our r...
5     Join us for a delightful evening of food, musi...
6     Your account information needs to be updated. ...
7     You're our lucky winner! Click here to claim y...
8     This is your last chance to avail of our speci...
9     Help us make a difference. Join our charity ev...
10    Learn the secrets to increasing your income wi...
11    Stay updated with our weekly newsletter featur...
12    Congratulations! You've won a free vacation to...
13    We've updated our policies. Please review the ...
14    Buy one item, get the second one free! Hurry, ...
15    Join our upcoming webinar on the future of tec...
16    Your account needs verification. Click here to...
17    Help us celebrate Jane's birthday this wee

In [21]:
# Display the target Series (labels).
y

0         Spam
1     Not Spam
2         Spam
3     Not Spam
4         Spam
5     Not Spam
6     Not Spam
7         Spam
8         Spam
9     Not Spam
10        Spam
11    Not Spam
12        Spam
13    Not Spam
14        Spam
15    Not Spam
16        Spam
17    Not Spam
18        Spam
19    Not Spam
20    Not Spam
Name: Label, dtype: object

In [22]:
# Class balance: number of rows for each label value.
df.loc[:, "Label"].value_counts()

Label
Not Spam    11
Spam        10
Name: count, dtype: int64

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=35,
)

In [24]:
# Create a CountVectorizer with its default settings.
vectors = CountVectorizer()

In [25]:
# Fit the vectorizer on the training emails only and encode them in one step.
X_train = vectors.fit_transform(X_tr)

In [26]:
# Encode the held-out emails with the already-fitted vectorizer
# (transform only, no refit).
X_test = vectors.transform(X_te)

In [27]:
# Show the encoded test emails as a dense array for inspection.
X_test.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [28]:
# Show the encoded training emails as a dense array for inspection.
X_train.toarray()

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
# Multinomial Naive Bayes classifier; alpha=1.0 is scikit-learn's documented
# default smoothing value, spelled out here so it is visible.
model = MultinomialNB(alpha=1.0)

In [30]:
# Train the classifier on the encoded training emails and their labels.
model.fit(X_train, y_tr)

In [31]:
# Accuracy on the training split itself (not a measure of generalisation).
accuracy_score(y_tr, model.predict(X_train))

1.0

In [32]:
# Predict labels for the held-out emails.
y_pred = model.predict(X_test)

In [33]:
# Held-out accuracy. scikit-learn metrics take (y_true, y_pred) in that
# order; accuracy is symmetric, but the documented order is kept so this
# call matches the other metric calls.
accuracy_score(y_te, y_pred)

1.0

In [34]:
# Per-class precision / recall / F1 on the held-out split.
# The arguments must be (y_true, y_pred): with them swapped, precision and
# recall trade places and "support" counts predictions, not true labels.
print(classification_report(y_te, y_pred))

              precision    recall  f1-score   support

    Not Spam       1.00      1.00      1.00         3
        Spam       1.00      1.00      1.00         2

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



In [35]:
# A single hand-written message, wrapped in a 1-element array, to try the
# trained model on.
text = np.asarray(
    ["Hello, there congratulation! you have won the new mahindra Thar 4X4"]
)

In [36]:
# Encode the sample message with the already-fitted vectorizer.
text_vec = vectors.transform(text)

In [37]:
# Show the encoded sample message as a dense array for inspection.
text_vec.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]],
      dtype=int64)

In [38]:
# Classify the sample message with the trained model.
model.predict(text_vec)

array(['Spam'], dtype='<U8')

<!-- Naive Bayes has 3 Types

* Gaussian Naive Bayes - For continuous-valued features (for example, word frequencies
computed from NLTK-preprocessed text) that are assumed to be normally distributed within each class.

* Multinomial Naive Bayes - A general-purpose model for text; use it when the features
are word counts or count vectors.

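* Bernoulli Naive Bayes - For binary features, where each feature is 0 or 1 (for example,
whether a word is present or absent). It is the features, not the output labels, that must be
binary; Spam / Not Spam labels work with any of the three variants.
-->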

Naive Bayes has 3 Types

* Gaussian Naive Bayes - For continuous-valued features (for example, word frequencies computed from NLTK-preprocessed text) that are assumed to be normally distributed within each class.
  
* Multinomial Naive Bayes - A general-purpose model for text; use it when the features are word counts or count vectors.
  
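* Bernoulli Naive Bayes - For binary features, where each feature is 0 or 1 (for example, whether a word is present or absent). It is the features, not the output labels, that must be binary; Spam / Not Spam labels work with any of the three variants.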