### Spam Classifier Using Bernoulli Naive Bayes

The dataset used to identify spam comes from the UCI Machine Learning Repository:
https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [None]:
#importing libraries

import matplotlib.pyplot as plt
import pandas as pd

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Load the SMS corpus into a two-column frame: the class label and the message text.
# The file has no header row, so the column names are supplied explicitly.
# NOTE(review): the parser's default quote handling may merge messages that
# contain a stray double quote -- confirm the row count against the dataset
# description.
docs = pd.read_table(
    'SMSSpamCollection.txt',
    header=None,
    names=['Class', 'sms'],
)
docs.head()

In [None]:
# value_counts() reports how many rows fall under each distinct value of the column
docs['Class'].value_counts()

In [None]:
# Keep the per-class message counts so the spam share can be computed from them.
ham_spam = docs['Class'].value_counts()
ham_spam

In [None]:
# Percentage of messages labelled spam.
# The counts are looked up by class label instead of by integer position:
# integer keys on a string-indexed Series rely on a positional fallback that is
# deprecated in recent pandas, and they silently depend on the order in which
# value_counts() happens to return the classes.
spam_count = ham_spam['spam']
total_count = ham_spam['ham'] + ham_spam['spam']
print("Spam % is ", (spam_count / total_count) * 100)

In [None]:
# Encode the target numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
docs['label'] = docs['Class'].map(label_codes)

In [None]:
# Preview the first rows of the frame, which now carries the numeric `label` column.
docs.head()

In [None]:
# Features are the raw message text; the target is the numeric label column.
X, y = docs['sms'], docs['label']

In [None]:
# Select features and target again, then print the shape of each on its own line.
X = docs['sms']
y = docs['label']
print(X.shape, y.shape, sep="\n")

In [None]:
# Hold out a test split using train_test_split's default proportions;
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Peek at the first few training messages (still raw text at this point).
X_train.head()

In [None]:
# Bag-of-words vectoriser configured with the built-in 'english' stop-word list.
vect = CountVectorizer(stop_words="english")

In [None]:
# Fit the vectoriser on the training messages only; X_test is not used here.
vect.fit(X_train)

In [None]:
# `vocabulary_` maps each learned token to its column index in the
# document-term matrix. Print its size and show a small sample instead of
# dumping every entry into the notebook output.
print(len(vect.vocabulary_))
list(vect.vocabulary_.items())[:10]

In [None]:
# `get_feature_names()` was removed in scikit-learn 1.2; its replacement
# `get_feature_names_out()` (available since 1.0) returns the learned tokens as
# a NumPy array, whose repr is abbreviated automatically when it is long.
vect.get_feature_names_out()

In [None]:
# Encode both splits with the vectoriser that was fitted on the training messages.
# NOTE: `X_test_tranformed` (sic) keeps its original spelling because later
# cells refer to that exact name.
X_train_transformed, X_test_tranformed = (
    vect.transform(X_train),
    vect.transform(X_test),
)

In [None]:
# Train a Bernoulli Naive Bayes classifier on the vectorised training messages.
bnb = BernoulliNB()
bnb.fit(X_train_transformed, y_train)

# Hard class predictions (0 = ham, 1 = spam) for the held-out messages.
y_pred_class = bnb.predict(X_test_tranformed)

# Per-class probabilities, one row per held-out message.
y_pred_proba = bnb.predict_proba(X_test_tranformed)

# Accuracy on the test split. `metrics` is imported in the imports cell at the
# top of the notebook rather than in the middle of the analysis.
metrics.accuracy_score(y_test, y_pred_class)


In [None]:
# Display the fitted BernoulliNB estimator.
bnb

In [None]:
# Confusion matrix of the true test labels against the predicted classes.
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# Build the confusion matrix for the binary problem and split it into its four
# cells. Flattening row by row yields [0,0], [0,1], [1,0], [1,1], i.e.
# TN, FP, FN, TP (row = actual class, column = predicted class).
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)
TN, FP, FN, TP = confusion.ravel()

In [None]:
# Sensitivity: fraction of actual positives (spam) that were predicted positive.
# Python 3 true division already yields a float.
actual_positives = TP + FN
sensitivity = TP / actual_positives
print("sensitivity", sensitivity)

In [None]:
# Specificity: fraction of actual negatives (ham) that were predicted negative.
# Python 3 true division already yields a float.
actual_negatives = TN + FP
specificity = TN / actual_negatives
print("specificity", specificity)

In [None]:
# Precision: fraction of positive predictions that were actually positive.
# The second print shows scikit-learn's own precision_score for comparison.
predicted_positives = TP + FP
precision = TP / predicted_positives
print("precision", precision)
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
# Print the hand-computed precision followed by scikit-learn's precision,
# recall and F1 for the same predictions.
print("precision", precision)
for title, scorer in (
    ("PRECISION SCORE :", metrics.precision_score),
    ("RECALL SCORE :", metrics.recall_score),
    ("F1 SCORE :", metrics.f1_score),
):
    print(title, scorer(y_test, y_pred_class))

In [None]:
# Predicted class probabilities for the test messages; column 1 is the score
# passed to roc_curve further down.
y_pred_proba

In [None]:

# ROC analysis is driven by the second probability column (the score for label 1).
positive_scores = y_pred_proba[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test,
    positive_scores,
)
# Area under the ROC curve, computed from the (FPR, TPR) points.
roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# Report the area under the ROC curve.
print(roc_auc)

In [None]:
# True positive rates returned by roc_curve, one per threshold.
print(true_positive_rate)

In [None]:
# False positive rates returned by roc_curve, one per threshold.
print(false_positive_rate)

In [None]:
# Score thresholds at which roc_curve evaluated the (FPR, TPR) pairs.
print(thresholds)

In [None]:
%matplotlib inline  
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC')
plt.plot(false_positive_rate, true_positive_rate)