# **Import libraries**

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import pandas as pd
import joblib


# **1. Read data**

In [18]:
# Load the training split from the working directory; the bare last expression
# renders a preview of the first rows.
train_data = pd.read_csv('train.csv')
train_data.head()


Unnamed: 0.1,Unnamed: 0,Message ID,Subject,Message,Spam/Ham,split
0,0,0,christmas tree farm pictures,,ham,0.038415
1,1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,0.696509
2,2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,0.587792
3,3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,-0.055438
4,5,5,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham,-0.419658


In [19]:
# Load the validation split from the working directory and preview its first rows.
val_data = pd.read_csv('val.csv')
val_data.head()

Unnamed: 0.1,Unnamed: 0,Message ID,Subject,Message,Spam/Ham,split
0,23,23,miscellaneous,- - - - - - - - - - - - - - - - - - - - - - fo...,ham,-0.351998
1,24,24,re : purge of old contract _ event _ status,fyi - what do you all think ?\n- - - - - - - -...,ham,0.257704
2,32,32,valero 8018 and 1394,it is my understanding the outages valero incu...,ham,0.0912
3,37,37,01 / 00 natural gas nomination,enron methanol company nominates the following...,ham,-1.745133
4,43,43,re : misc . questions,- - - - - - - - - - - - - - - - - - - - - - fo...,ham,-1.911987


In [20]:
# (rows, columns) of the raw frames, before any preprocessing.
print(train_data.shape)
print(val_data.shape)

(27284, 6)
(3084, 6)


# **2. Pre-processing**

### 1. Kiểm tra kiểu dữ liệu của đặc trưng

In [21]:
# Column dtypes of the raw training frame.
train_data.dtypes

Unnamed: 0      int64
Message ID      int64
Subject        object
Message        object
Spam/Ham       object
split         float64
dtype: object

### 2. Loại cột dữ liệu không liên quan, loại bỏ giá trị trùng lặp nhau, giá trị không xác định

In [22]:
def preprocess(data):
    """Clean a raw email frame and derive the text and label columns for the model.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Subject', 'Message' and 'Spam/Ham'. Any other
        columns are ignored. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'Spam/Ham', 'Text' (subject, a single
        space, then message) and 'spam' (1 where 'Spam/Ham' equals 'spam',
        otherwise 0). Surviving rows keep their original index labels.
    """
    cleaned = data[['Subject', 'Message', 'Spam/Ham']]

    # NOTE(review): the two de-duplications run one after the other, so a row is
    # dropped when EITHER its message OR its subject was already seen (missing
    # values count as equal to each other). This is stricter than de-duplicating
    # on the (Subject, Message) pair - confirm that this is intended.
    cleaned = cleaned.drop_duplicates(subset=['Message'])
    cleaned = cleaned.drop_duplicates(subset=['Subject'])

    # Discard rows that carry no text at all, then blank out the remaining gaps
    # so the string concatenation below never produces NaN.
    both_missing = cleaned['Subject'].isnull() & cleaned['Message'].isnull()
    cleaned = cleaned[~both_missing].fillna('')

    # Build the derived columns without in-place mutation; the vectorized
    # comparison yields an int64 label column even when no rows remain.
    return cleaned.assign(
        Text=cleaned['Subject'] + ' ' + cleaned['Message'],
        spam=(cleaned['Spam/Ham'] == 'spam').astype('int64'),
    ).drop(columns=['Subject', 'Message'])

In [23]:
# Replace the raw frames with their preprocessed versions and report the new shapes.
# NOTE: this rebinding makes the cell non-re-runnable: preprocess() selects the
# 'Subject' and 'Message' columns and then drops them, so a second run on the
# already-processed frames fails. Re-run the loading cells first.
train_data = preprocess(train_data)
print(train_data.shape)
val_data = preprocess(val_data)
print(val_data.shape)

(19721, 3)
(2751, 3)


In [24]:
# Preview the preprocessed training rows.
train_data.head()

Unnamed: 0,Spam/Ham,Text,spam
0,ham,christmas tree farm pictures,0
1,ham,"vastar resources , inc . gary , production fro...",0
2,ham,calpine daily gas nomination - calpine daily g...,0
3,ham,re : issue fyi - see note below - already done...,0
4,ham,"mcmullen gas for 11 / 99 jackie ,\nsince the i...",0


In [25]:
# Preview the preprocessed validation rows.
val_data.head()

Unnamed: 0,Spam/Ham,Text,spam
0,ham,miscellaneous - - - - - - - - - - - - - - - - ...,0
1,ham,re : purge of old contract _ event _ status fy...,0
2,ham,valero 8018 and 1394 it is my understanding th...,0
3,ham,01 / 00 natural gas nomination enron methanol ...,0
4,ham,re : misc . questions - - - - - - - - - - - - ...,0


### 3. Kiểm tra các sample đã unique hay chưa

In [26]:
# Per-class summary of the text column: when `count` equals `unique` for a class,
# every remaining sample in that class has distinct text.
train_data[['Spam/Ham', 'Text']].groupby('Spam/Ham').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Spam/Ham,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,10618,10618,christmas tree farm pictures,1
spam,9103,9103,dobmeos with hgh my energy level has gone up !...,1


# **3. Train model**

### 1. Vector hoá dữ liệu

In [27]:
# Inputs are the merged email text; targets are the binary spam flag.
x_train = train_data['Text']
y_train = train_data['spam']
x_val = val_data['Text']
y_val = val_data['spam']

In [28]:
# Build bag-of-words feature vectors (unigrams + bigrams) from the emails.
# The vocabulary is learned from the training text only and then reused to
# encode the validation text. In the cells shown here these matrices are only
# inspected for their shape; the model below vectorizes inside its own pipeline.
cv = CountVectorizer(ngram_range=(1,2))
x_train_cv = cv.fit_transform(x_train)
x_val_cv = cv.transform(x_val)

In [14]:
# (samples, vocabulary size) of the encoded matrices; both share the same
# column count because they were produced by the same fitted vectorizer.
print(x_train_cv.shape)
print(x_val_cv.shape)

(19721, 1233930)
(2751, 1233930)


### 2. Chọn siêu tham số

In [16]:
# Many different settings were tried beforehand; the grid below keeps only the
# best-performing values that came out of those experiments.
parameters = [
    {
        'mnb__alpha': [0.1],
        'vect__ngram_range': [(1, 2)],
    },
]

### 3. Tạo pipeline

In [29]:
# Chain the vectorizer and the classifier so that, during cross-validation, the
# vocabulary is learned only from the training portion of each fold.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

# Only construct the search here. Fitting is left to the training cell that
# follows, so the cross-validated search is not executed twice.
grid_search = GridSearchCV(pipeline, parameters)

### 3. Huấn luyện mô hình

In [30]:
# Run the cross-validated search on the training text. GridSearchCV.fit returns
# the fitted search object itself, so `complete_pipeline` is the GridSearchCV
# instance; its predict() delegates to the refitted best pipeline.
complete_pipeline = grid_search.fit(x_train, y_train)

### 4. Dự đoán trên tập val

In [31]:
# Score the fitted model on the held-out validation emails.
y_val_pred = complete_pipeline.predict(x_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Accuracy = %.2f%%" % (val_accuracy * 100))
print(classification_report(y_val, y_val_pred, digits=4))

Accuracy = 99.45%
              precision    recall  f1-score   support

           0     0.9951    0.9944    0.9948      1433
           1     0.9939    0.9947    0.9943      1318

    accuracy                         0.9945      2751
   macro avg     0.9945    0.9946    0.9945      2751
weighted avg     0.9945    0.9945    0.9945      2751



### 5. Lưu model

In [32]:
# Serialize the fitted object to disk so it can be reloaded outside this notebook.
# Only load this file from a trusted location: joblib files are pickle-based.
joblib.dump(complete_pipeline, 'spam_pipeline.pkl')

['spam_pipeline.pkl']

# **4. Experiments**

### 1. Thử nghiệm với vài ví dụ cơ bản

In [33]:
# Two hand-written messages: one ordinary work email and one obvious spam lure.
email = ["Hello, I hope you are doing well. This is a reminder for our meeting tomorrow at 10 AM. Please let me know if you need to reschedule. Best, John"]
spam_email = ["Congratulations! You've won a $1,000 gift card. Click here to claim your prize now!"]

# Each sample is a one-element list, so the first prediction is the label.
for sample in (email, spam_email):
    predicted = complete_pipeline.predict(sample)[0]
    print("spam" if predicted == 1 else "ham")

ham
spam


### 2. Tạo chức năng cho người dùng nhập mail trực tiếp

Chức năng này được cài đặt ở file app.py