# **Import libraries**

In [294]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# **1. Read data**

In [295]:
# Load the training split from train.csv and preview the first rows.
train_data = pd.read_csv('train.csv')
train_data.head()
# TODO: not yet clear whether the split, Message ID and Subject columns matter

Unnamed: 0.1,Unnamed: 0,Message ID,Subject,Message,Spam/Ham,split
0,0,0,christmas tree farm pictures,,ham,0.038415
1,1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,0.696509
2,2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,0.587792
3,3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,-0.055438
4,5,5,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham,-0.419658


In [296]:
# Number of rows in the raw training split.
len(train_data)

27284

In [297]:
# Load the validation split from val.csv and preview the first rows.
val_data = pd.read_csv('val.csv')
val_data.head()

Unnamed: 0.1,Unnamed: 0,Message ID,Subject,Message,Spam/Ham,split
0,23,23,miscellaneous,- - - - - - - - - - - - - - - - - - - - - - fo...,ham,-0.351998
1,24,24,re : purge of old contract _ event _ status,fyi - what do you all think ?\n- - - - - - - -...,ham,0.257704
2,32,32,valero 8018 and 1394,it is my understanding the outages valero incu...,ham,0.0912
3,37,37,01 / 00 natural gas nomination,enron methanol company nominates the following...,ham,-1.745133
4,43,43,re : misc . questions,- - - - - - - - - - - - - - - - - - - - - - fo...,ham,-1.911987


In [298]:
# Number of rows in the raw validation split.
len(val_data)

3084

# **2. Pre-processing**

### 1. Kiểm tra kiểu dữ liệu của đặc trưng

In [299]:
# Inspect the dtype of every column before deciding what to keep.
train_data.dtypes

Unnamed: 0      int64
Message ID      int64
Subject        object
Message        object
Spam/Ham       object
split         float64
dtype: object

### 2. Loại bỏ những cột dữ liệu không liên quan

In [300]:
# Drop the bookkeeping columns; only Subject, Message and Spam/Ham are kept.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed.
unused_columns = ['Unnamed: 0', 'split', 'Message ID']
train_data = train_data.drop(columns=unused_columns, errors='ignore')
val_data = val_data.drop(columns=unused_columns, errors='ignore')
train_data.head()

Unnamed: 0,Subject,Message,Spam/Ham
0,christmas tree farm pictures,,ham
1,"vastar resources , inc .","gary , production from the high island larger ...",ham
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham
3,re : issue,fyi - see note below - already done .\nstella\...,ham
4,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham


### 3. Loại bỏ giá trị trùng lặp nhau và chứa giá trị không xác định 

In [301]:
def preprocess(data):
    """Deduplicate, drop incomplete rows and build the model-ready columns.

    Steps, in order:
      1. keep the first row for each distinct ``Message``;
      2. keep the first row for each distinct ``Subject``;
      3. drop every row that still has a missing value in any column;
      4. add ``Text`` (``Subject`` + one space + ``Message``) and ``spam``
         (1 when ``Spam/Ham`` equals ``'spam'``, otherwise 0), then remove
         ``Subject`` and ``Message``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Subject``, ``Message`` and ``Spam/Ham``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is never modified. The original index
        labels of the surviving rows are preserved.
    """
    # NOTE(review): de-duplicating on Subject alone also removes distinct
    # emails that merely share a subject line, and because dropna() runs last
    # a row with a missing Message can shadow a complete row with the same
    # Subject. Behaviour is kept as-is here -- confirm this is intended.
    deduped = (
        data
        .drop_duplicates(subset='Message')
        .drop_duplicates(subset='Subject')
        .dropna()
    )
    # assign() builds a fresh frame, so nothing is written into a frame that
    # may be a view/copy of the caller's data (no SettingWithCopyWarning), and
    # the vectorised comparison gives an int64 column even when no rows remain.
    return (
        deduped
        .assign(
            Text=deduped['Subject'] + ' ' + deduped['Message'],
            spam=deduped['Spam/Ham'].eq('spam').astype('int64'),
        )
        .drop(columns=['Subject', 'Message'])
    )


In [302]:
# Clean both splits with the same routine, then report how many rows survive.
train_data = preprocess(train_data)
val_data = preprocess(val_data)
for cleaned in (train_data, val_data):
    print(len(cleaned))

19719
2749


In [303]:
# Preview the validation split after preprocessing.
val_data.head()

Unnamed: 0,Spam/Ham,Text,spam
0,ham,miscellaneous - - - - - - - - - - - - - - - - ...,0
1,ham,re : purge of old contract _ event _ status fy...,0
2,ham,valero 8018 and 1394 it is my understanding th...,0
3,ham,01 / 00 natural gas nomination enron methanol ...,0
4,ham,re : misc . questions - - - - - - - - - - - - ...,0


In [304]:
# Per-class summary of Text (count, unique, top, freq) for ham and spam.
train_data[['Spam/Ham', 'Text']].groupby('Spam/Ham').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Spam/Ham,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,10617,10617,"vastar resources , inc . gary , production fro...",1
spam,9102,9102,dobmeos with hgh my energy level has gone up !...,1


# **3. Train model**

In [305]:
# Hold out 25% of the preprocessed training emails for evaluation.
# random_state is fixed so the split -- and every accuracy computed from it --
# is the same after Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    train_data.Text,
    train_data.spam,
    test_size=0.25,
    random_state=42,
)

In [306]:
# Build bag-of-words count vectors from the email text.
# The vocabulary is learned from x_train only (fit_transform); the held-out
# split and the validation set are encoded with that same fitted vectorizer.
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)
val_cv = cv.transform(val_data.Text)

In [307]:
# Fit a multinomial Naive Bayes classifier on the training count vectors.
model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [308]:
# Score the model on the split held out from the training data
test_pred = model.predict(x_test_cv)
test_accuracy = accuracy_score(y_test, test_pred) * 100
print("Accuracy = %.2f%%" % (test_accuracy,))

Accuracy = 98.76%


In [309]:
# Score the model on the emails loaded from val.csv
val_pred = model.predict(val_cv)
val_accuracy = accuracy_score(val_data.spam, val_pred) * 100
print("Accuracy = %.2f%%" % (val_accuracy,))

Accuracy = 98.91%


Note (số liệu ghi lại từ một lần chạy trước nên có thể lệch nhẹ so với output của các cell phía trên):
- Mô hình BernoulliNB đạt accuracy = 93.16% trên test set và 92.72% trên dev set.
- Mô hình MultinomialNB đạt accuracy = 98.87% trên test set và 98.72% trên dev set.

# **4. Experiments**

In [310]:
# Two hand-written samples: an ordinary meeting reminder and a prize scam.
email = ["Hello, I hope you are doing well. This is a reminder for our meeting tomorrow at 10 AM. Please let me know if you need to reschedule. Best, John"]
spam_email = ["Congratulations! You've won a $1,000 gift card. Click here to claim your prize now!"]

# Encode each sample with the fitted vectorizer and print the predicted class.
for sample in (email, spam_email):
    predicted = model.predict(cv.transform(sample))[0]
    print("spam" if predicted == 1 else "ham")

ham
spam
