In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the 'spam_clean_csv' file from the working directory into a DataFrame.
df_new = pd.read_csv(filepath_or_buffer='spam_clean_csv')

In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df_new.shape

(5169, 6)

In [4]:
# Drop the 'Unnamed: 0' column that came in with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df_new = df_new.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first five rows of the frame.
df_new.head()

Unnamed: 0,target,num_characters,num_words,num_sentences,transformed_text
0,0,111,24,2,go jurong point crazy available bugis n great ...
1,0,29,8,2,ok lar joking wif u oni
2,1,155,37,2,free entry 2 wkly comp win fa cup final tkts 2...
3,0,49,13,1,u dun say early hor u c already say
4,0,61,15,1,nah think go usf life around though


### Implementing Gaussian NB, Multinomial NB and Bernoulli NB

In [6]:
# Text featurizers: bag-of-words (CountVectorizer) and TF-IDF (TfidfVectorizer)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF featurizer with the vocabulary limited to 3000 features.
tfidf = TfidfVectorizer(max_features=3000)
# Plain bag-of-words featurizer with default settings.
cv = CountVectorizer()

In [7]:
# Count missing values in each column.
df_new.isna().sum()

target              0
num_characters      0
num_words           0
num_sentences       0
transformed_text    9
dtype: int64

In [8]:
# Replace missing transformed_text entries with a single space so every row
# holds a string before the column is passed to the vectorizer.
text_col = 'transformed_text'
df_new[text_col] = df_new[text_col].fillna(value=' ')

In [9]:
#X = cv.fit_transform(df_new['transformed_text']).toarray()

In [10]:
# Fit the TF-IDF vectorizer on the text column, then densify the result.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split later in the notebook, so its vocabulary and IDF weights
# are learned from rows that end up in the test set. Consider fitting on the
# training rows only.
tfidf_matrix = tfidf.fit_transform(df_new['transformed_text'])
X = tfidf_matrix.toarray()

In [11]:
# Show the (samples, features) dimensions of the feature matrix.
X.shape

(5169, 3000)

In [12]:
# Extract the 'target' column as a NumPy array to use as the label vector.
y = df_new['target'].to_numpy()

In [13]:
# Display the label array.
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [16]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [17]:
# One default-configured estimator per Naive Bayes variant being compared.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [18]:
# Gaussian NB: fit on the training split, predict the test split, then print
# accuracy, confusion matrix and precision (one per line, in that order).
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

0.8829787234042553
[[800  96]
 [ 25 113]]
0.5406698564593302


In [19]:
# Multinomial NB: fit on the training split, predict the test split, then
# print accuracy, confusion matrix and precision on separate lines.
mnb.fit(X_train, y_train)
y_pred2 = mnb.predict(X_test)
print(
    accuracy_score(y_test, y_pred2),
    confusion_matrix(y_test, y_pred2),
    precision_score(y_test, y_pred2),
    sep='\n',
)

0.9738878143133463
[[896   0]
 [ 27 111]]
1.0


In [20]:
# Bernoulli NB: fit on the training split, predict the test split, then print
# accuracy, confusion matrix and precision (one per line, in that order).
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred3))

0.9845261121856866
[[896   0]
 [ 16 122]]
1.0


In [21]:
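# Using TF-IDF with Multinomial NB gives us better precision: 1.0 (no false positives)
# versus 0.54 for Gaussian NB. Bernoulli NB also reaches 1.0, with higher accuracy.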

In [22]:
# Model optimization / hyperparameter tuning: use TF-IDF with max_features=3000

In [23]:
import pickle

In [24]:
# Persist the fitted TF-IDF vectorizer. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [25]:
# Persist the fitted MultinomialNB model. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)

# Handling Imbalanced Dataset

In [26]:
from imblearn.combine import SMOTEENN

In [27]:
# Resample the full feature matrix and labels with SMOTEENN.
# random_state is fixed (same seed as the train/test splits) so the resampled
# data, and every metric computed from it, is reproducible across runs.
# NOTE(review): resampling happens before the train/test split that follows,
# so the evaluation split is drawn from resampled data rather than the
# original distribution. Consider resampling the training rows only.
sm = SMOTEENN(random_state=2)
X_resampled, y_resampled = sm.fit_resample(X, y)

In [28]:
# Split the resampled data 80/20 with the same fixed seed as the first split.
X_trainb, X_testb, y_trainb, y_testb = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=2,
)

In [29]:
# Multinomial NB on the resampled data: refit the existing estimator, predict
# the resampled test split, then print accuracy, confusion matrix, precision.
y_pred2b = mnb.fit(X_trainb, y_trainb).predict(X_testb)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_testb, y_pred2b))

0.9468186134852802
[[114  56]
 [  0 883]]
0.9403620873269436


In [30]:
# Gaussian NB on the resampled data: refit the existing estimator, predict the
# resampled test split, then print accuracy, confusion matrix and precision.
gnb.fit(X_trainb, y_trainb)
y_pred1b = gnb.predict(X_testb)
print(
    accuracy_score(y_testb, y_pred1b),
    confusion_matrix(y_testb, y_pred1b),
    precision_score(y_testb, y_pred1b),
    sep='\n',
)

0.9895536562203229
[[164   6]
 [  5 878]]
0.9932126696832579


In [31]:
# Bernoulli NB on the resampled data: refit the existing estimator, predict
# the resampled test split, then print accuracy, confusion matrix, precision.
y_pred3b = bnb.fit(X_trainb, y_trainb).predict(X_testb)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_testb, y_pred3b))

0.9876543209876543
[[170   0]
 [ 13 870]]
1.0


In [32]:
# Persist the BernoulliNB model fitted on the resampled data.
# It is written to 'model.pkl', replacing the MultinomialNB model stored there
# earlier. The previous target, 'vectorizer.pkl', holds the fitted TF-IDF
# vectorizer; overwriting it with a classifier would leave no saved vectorizer
# to transform text at inference time.
# The context manager guarantees the file handle is closed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(bnb, model_file)