#  Naive Bayes Classifiers

In [5]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

## Naive Bayes
### Using Naive Bayes to predict spam

In [6]:
dataset = pd.read_csv("spambase.data.csv")
dataset.head()
dataset.shape

(4600, 58)

In [7]:
dataset.head()

Unnamed: 0,0,0.64,0.64.1,0.1,0.32,0.2,0.3,0.4,0.5,0.6,...,0.40,0.41,0.42,0.778,0.43,0.44,3.756,61,278,1
0,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
1,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
2,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.0,0.223,0.0,0.0,0.0,0.0,3.0,15,54,1


In [8]:
X = dataset.iloc[:,0:48]
y = dataset.iloc[:, -1]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=17)

In [13]:
from sklearn.metrics import confusion_matrix
BernNB = BernoulliNB(binarize=True)
BernNB.fit(X_train, y_train)
print(BernNB)


y_pred = BernNB.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

BernoulliNB(alpha=1.0, binarize=True, class_prior=None, fit_prior=True)
0.815546772069
[[734 187]
 [ 93 504]]
             precision    recall  f1-score   support

          0       0.89      0.80      0.84       921
          1       0.73      0.84      0.78       597

avg / total       0.83      0.82      0.82      1518



In [15]:
MultiNB = MultinomialNB()

MultiNB.fit(X_train, y_train)
print(MultiNB)

y_pred = MultiNB.predict(X_test)
accuracy_score(y_test, y_pred)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


0.86034255599472986

In [16]:
GausNB = GaussianNB()
GausNB.fit(X_train, y_train)
print(GausNB)

y_pred = GausNB.predict(X_test)
accuracy_score(y_test, y_pred)

GaussianNB(priors=None)


0.78722002635046118

In [18]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)
print(BernNB)

y_pred = BernNB.predict(X_test)
accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


array([[857,  64],
       [130, 467]])

P(Q|F) = P(F|Q)*P(Q)/P(F)

In [None]:
df = pd.read_csv("spam.csv",encoding='latin-1')

In [None]:
df.head()

In [None]:
data_train, data_test, labels_train, labels_test = train_test_split(
    df.v2,
    df.v1, 
    test_size=0.1, 
    random_state=42)

In [None]:
print (data_train[:10])

In [None]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)

In [None]:
data_train_transformed = vectorizer.fit_transform(data_train)
data_test_transformed  = vectorizer.transform(data_test)

In [None]:
print(data_train_transformed[:10])

In [None]:
# slim the data for training and testing
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(data_train_transformed, labels_train)
data_train_transformed = selector.transform(data_train_transformed).toarray()
data_test_transformed  = selector.transform(data_test_transformed).toarray()

In [None]:
print (data_train_transformed[:10])

In [None]:
clf = GaussianNB()
clf.fit(data_train_transformed, labels_train)
predictions = clf.predict(data_test_transformed)

print (accuracy_score(labels_test, predictions))

In [None]:
NewEmail = pd.Series(["Hi there, For are premium phone services call 08718711108"],index=[8000])
NewEmail

In [None]:
NewEmail_transformed = vectorizer.transform(NewEmail)
NewEmail_transformed  = selector.transform(NewEmail_transformed).toarray()
clf.predict(NewEmail_transformed)

In [None]:
clf2 = GaussianNB()
clf2.fit(data_train, labels_train)
predictions = clf.predict(data_test)

print(accuracy_score(labels_test, predictions))

In [None]:
labels_train[:20]

In [None]:
#assigning predictor and target variables
x= np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])

In [None]:
model = GaussianNB()

# Train the model using the training sets 
model.fit(x, Y)

#Predict Output 
predicted= model.predict([[1,2],[2,7]])
print (predicted)