In [1]:
import pandas as pd
import sklearn
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("data/spam.csv", encoding="latin1")
print(data.head(4))

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  


In [3]:
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

# 1. Train test split

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['v2'], data['v1'], test_size=0.3, random_state=42)

In [5]:
print(X_train.shape)
print(X_test.shape)

(3900,)
(1672,)


# 2. Data Transformation

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_X = vectorizer.fit_transform(X_train).toarray() # transform from text data to count matrix
test_X = vectorizer.transform(X_test).toarray()

In [11]:
print(test_X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [12]:
y_train = y_train.map({'ham':0,'spam':1})
y_test = y_test.map({'ham':0,'spam':1})

# 3. Building a NB model

In [13]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(force_alpha=True)
# training phase
clf.fit(train_X,y_train)
clf.score(train_X,y_train) # accuracy of model on the training set

0.9948717948717949

In [14]:
# testing phase
preds = clf.predict(test_X)

In [15]:
preds

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

# 4. Model performance on the test set

In [17]:
report = classification_report(preds, y_test, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9966    0.9830    0.9897      1473
           1     0.8858    0.9749    0.9282       199

    accuracy                         0.9821      1672
   macro avg     0.9412    0.9790    0.9590      1672
weighted avg     0.9834    0.9821    0.9824      1672



In [7]:
# train accuracy : 99.48% - test accuracy : 98.21%

In [19]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEMO_ACCOUNT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DEMO_ACCOUNT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DEMO_ACCOUNT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DEMO_ACCOUNT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  
stop_words = stopwords.words('english')
import re
lemma = WordNetLemmatizer()
from nltk.tokenize import word_tokenize
def pre_process(text):
    ## tokenize
    tokenized = word_tokenize(text)
    ## stopwords removal
    stw = [word for word in tokenized if word not in stop_words]
    ## lower
    lower = [word.lower() for word in stw]
    ## lemma
    result=[lemma.lemmatize(word) for word in lower]
    #### join to make a sentence
    results = " ".join(result)
    
    return results

In [22]:
X_train = X_train.map(pre_process)
X_test = X_test.map(pre_process) # preprocess train and test set
vectorizer = CountVectorizer() # create count matrix
train_X = vectorizer.fit_transform(X_train).toarray() # gồm 2 step: fit (tìm số từ trong dataset), transform (đếm tuần suất xuất hiện)
test_X = vectorizer.transform(X_test).toarray()
# refit the model
clf = MultinomialNB(force_alpha=True)
clf.fit(train_X,y_train)
print("Train acc: ",clf.score(train_X,y_train))
preds = clf.predict(test_X)
print(classification_report(y_test,preds,digits=4))

Train acc:  0.9951282051282051
              precision    recall  f1-score   support

           0     0.9857    0.9938    0.9897      1453
           1     0.9565    0.9041    0.9296       219

    accuracy                         0.9821      1672
   macro avg     0.9711    0.9490    0.9596      1672
weighted avg     0.9818    0.9821    0.9818      1672

