## Efficient Low Noise Naive Bayes

In [17]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from nltk.corpus import stopwords

### Load data

In [18]:
# Load the raw spam dataset. The file is decoded as latin-1 instead of the
# pandas default (UTF-8).
# NOTE(review): presumably the CSV contains bytes that are not valid UTF-8 — confirm.
data = pd.read_csv('dataset/spam.csv', encoding='latin-1')

In [19]:
# Preview the first rows of the raw frame to inspect its columns.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [20]:
# Keep only the columns whose name does not start with "Unnamed"; the
# "Unnamed: N" columns carry no label or message text.
data_modified = data.loc[:, ~data.columns.str.startswith('Unnamed')]

In [21]:
# Give the columns descriptive names and upper-case the two class labels
# ("spam" -> "SPAM", "ham" -> "HAM") in a single chained expression.
data_modified = (
    data_modified
    .rename(columns={"v1": "Label", "v2": "Text"})
    .assign(Label=lambda frame: frame["Label"].replace({"spam": "SPAM", "ham": "HAM"}))
)
# Show the last 5 rows of the relabelled dataset
data_modified.tail(5)

Unnamed: 0,Label,Text
5567,SPAM,This is the 2nd time we have tried 2 contact u...
5568,HAM,Will Ì_ b going to esplanade fr home?
5569,HAM,"Pity, * was in mood for that. So...any other s..."
5570,HAM,The guy did some bitching but I acted like i'd...
5571,HAM,Rofl. Its true to its name


In [22]:
# Class balance of the full dataset (number of messages per label).
data_modified.Label.value_counts()

HAM     4825
SPAM     747
Name: Label, dtype: int64

### Data Preparation

Training Data

In [23]:
# Training subset: the rows at positions 4500 up to (not including) 4972,
# i.e. at most 472 messages, re-indexed from 0.
training_data = data_modified.iloc[4500:4972].reset_index(drop=True)
training_data_length = len(training_data)
training_data.head()

Unnamed: 0,Label,Text
0,HAM,So wat's da decision?
1,HAM,Wot is u up 2 then bitch?
2,HAM,Stupid.its not possible
3,HAM,She told to hr that he want posting in chennai...
4,SPAM,Mobile Club: Choose any of the top quality ite...


In [24]:
# Class balance of the training subset.
training_data.Label.value_counts()

HAM     408
SPAM     64
Name: Label, dtype: int64

Shape of training data

In [27]:
# Sanity check: shape of the training frame (rows, columns) and of its label column.
print(training_data.shape)
print(training_data.Label.shape)

(472, 2)
(472,)


Testing data

In [28]:
# Test subset: the rows at positions 4972 up to (not including) 5572,
# i.e. at most 600 messages, re-indexed from 0. This range starts where the
# training slice ends, so the two subsets do not overlap.
test_data = data_modified.iloc[4972:5572].reset_index(drop=True)
test_data_length = len(test_data)
test_data.head()

Unnamed: 0,Label,Text
0,HAM,Oops I was in the shower when u called. Hey a ...
1,HAM,Aiyo u so poor thing... Then u dun wan 2 eat? ...
2,HAM,Yar... I tot u knew dis would happen long ago ...
3,HAM,You are gorgeous! keep those pix cumming :) th...
4,HAM,A boy was late 2 home. His father: \POWER OF F...


In [29]:
# Class balance of the test subset.
test_data.Label.value_counts()

HAM     524
SPAM     76
Name: Label, dtype: int64

Shape of Testing data

In [30]:
# Sanity check: shape of the test frame (rows, columns) and of its label column.
print(test_data.shape)
print(test_data.Label.shape)

(600, 2)
(600,)


### Building the Naive Bayes Classifier

### Calculate the Priors

The priors are simply the probabilities of a message belonging to each class ("HAM" or "SPAM"), before looking at its words.

### Calculate the Likelihood

The likelihood is the probability of observing a message's words given a particular class.

### Clean Our Training Data

In [31]:
# Remove English stop words from every training message.
#
# The stop-word list is loaded once into a set: the previous version called
# `stopwords.words("english")` again for every single token and then searched
# the returned list linearly. The cleaned frame is also built in one step
# instead of being grown row by row with `.loc[i] = ...`; with both changes the
# cell no longer needs a progress indicator.
#
# Tokens are still produced by splitting on single spaces and compared exactly.
# NOTE(review): the comparison is case-sensitive, so a capitalised token such
# as "The" is kept if the NLTK list is lower-case only — confirm whether that
# is intended.
english_stop_words = set(stopwords.words("english"))

clean_data = pd.DataFrame({
    'Label': list(training_data.Label),
    'Text': [
        ' '.join(word for word in message.split(" ") if word not in english_stop_words)
        for message in training_data.Text
    ],
}, columns=['Label', 'Text'])

Progress:99.7 %

In [32]:
# Split the cleaned messages into a training part and a 20% held-out part.
# `sklearn.cross_validation` was deprecated and later removed from
# scikit-learn; `train_test_split` now lives in `sklearn.model_selection`.
# The fallback keeps the cell runnable on installations that only ship the
# legacy module.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# random_state is fixed so the split is reproducible between runs
data_train, data_test, labels_train, labels_test = train_test_split(
    clean_data.Text, clean_data.Label, test_size=0.2, random_state=40)

# text vectorization: turn each message into a vector of token counts
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

# The vocabulary is learned from the training part only and then re-used
# for the held-out part.
vectorizer = CountVectorizer()
data_train_transformed = vectorizer.fit_transform(data_train)
data_test_transformed  = vectorizer.transform(data_test)

# slim the data for training and testing: keep the top 10% of features as
# scored by f_classif on the training part, then densify to plain arrays
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(data_train_transformed, labels_train)
data_train_transformed = selector.transform(data_train_transformed).toarray()
data_test_transformed  = selector.transform(data_test_transformed).toarray()

In [33]:
class BernoulliNBClassifier(object):
    """Bernoulli Naive Bayes classifier with additive smoothing.

    Every feature is treated as a binary "present / absent" indicator. Raw
    counts are therefore thresholded before use (see ``binarize``); without
    that step a feature counted more than once per sample can push the
    estimated probability above 1 and turn ``log(1 - p)`` into NaN.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing term added to every per-class feature count.
    binarize : float or None, default 0.0
        Values strictly greater than this threshold become 1, all others 0.
        Pass ``None`` to use the input values unchanged.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray
        Sorted unique target values; position ``i`` is numeric label ``i``.
    class_log_prior_ : ndarray, shape (n_classes,)
        Log of the fraction of training samples in each class.
    feature_prob_ : ndarray, shape (n_classes, n_features)
        Smoothed probability that a feature is present, per class.
    """

    # Label names used by digit_to_label before fit() has recorded the classes
    # actually present in the training targets.
    _DEFAULT_CLASSES = ("HAM", "SPAM")

    def __init__(self, alpha=1.0, binarize=0.0):
        self.alpha = alpha
        self.binarize = binarize

    def _binarize(self, X):
        """Return X as a float array, thresholded to 0/1 unless binarize is None."""
        X = np.asarray(X, dtype=float)
        if self.binarize is None:
            return X
        return (X > self.binarize).astype(float)

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        X is a 2-D array-like (n_samples, n_features); y holds one target per
        row of X. Returns ``self``. Raises ValueError when X is empty or when
        X and y disagree on the number of samples.
        """
        X = self._binarize(X)
        y = np.asarray(y)
        count_sample = X.shape[0]
        if count_sample == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.shape[0] != count_sample:
            raise ValueError("X and y must contain the same number of samples")

        self.classes_ = np.unique(y)
        masks = [y == c for c in self.classes_]
        class_sizes = np.array([mask.sum() for mask in masks], dtype=float)
        self.class_log_prior_ = np.log(class_sizes / count_sample)

        # Smoothed counts: alpha is added to "present"; the denominator grows
        # by 2 * alpha because each feature has two outcomes (present / absent).
        count = np.array([X[mask].sum(axis=0) for mask in masks]) + self.alpha
        smoothing = 2 * self.alpha
        n_doc = class_sizes + smoothing
        self.feature_prob_ = count / n_doc[:, np.newaxis]
        return self

    def predict_log_proba(self, X):
        """Return the per-class log score of every row of X.

        The result has shape (n_samples, n_classes). The scores are joint
        log-likelihoods (log prior + log likelihood); they are not normalised
        to sum to one across classes.
        """
        X = self._binarize(X)
        log_present = np.log(self.feature_prob_)
        log_absent = np.log(1 - self.feature_prob_)
        return (X.dot(log_present.T)
                + np.abs(X - 1).dot(log_absent.T)
                + np.asarray(self.class_log_prior_))

    def predict(self, X, num_label=True):
        """Predict the class of every row of X.

        With ``num_label=True`` (default) the result holds integer positions
        into ``classes_``; with ``num_label=False`` it holds the class names.
        """
        score_mat = np.argmax(self.predict_log_proba(X), axis=1)
        if num_label:
            # return vector of numeric predictions
            return score_mat
        return self.digit_to_label(score_mat)

    def digit_to_label(self, digits):
        """Map numeric class positions to class names.

        Uses the classes seen by ``fit`` when available, otherwise the default
        ("HAM", "SPAM"). Raises ValueError for positions outside the known range.
        """
        key = np.asarray(getattr(self, "classes_", self._DEFAULT_CLASSES))
        index = np.asarray(digits).ravel().astype(int)
        if index.size and (index.min() < 0 or index.max() >= key.shape[0]):
            raise ValueError("digits must be between 0 and %d" % (key.shape[0] - 1))
        # return unwrapped to labels
        return key[index]

    def accuracy_score(self, predictionsY, targetsY):
        """Return the percentage of predictions that equal their target.

        Either argument may hold numeric class positions or class names; when
        only one of them is numeric it is converted with ``digit_to_label`` so
        both sides are compared in the same representation. Raises ValueError
        for empty input or for inputs of different length.
        """
        predicted = np.asarray(predictionsY).ravel()
        expected = np.asarray(targetsY).ravel()
        if predicted.shape[0] != expected.shape[0]:
            raise ValueError("predictions and targets must have the same length")
        if predicted.shape[0] == 0:
            raise ValueError("cannot score an empty set of predictions")

        predicted_is_numeric = np.issubdtype(predicted.dtype, np.number)
        expected_is_numeric = np.issubdtype(expected.dtype, np.number)
        if predicted_is_numeric and not expected_is_numeric:
            predicted = self.digit_to_label(predicted)
        elif expected_is_numeric and not predicted_is_numeric:
            expected = self.digit_to_label(expected)

        # calculate and return accuracy as a percentage
        return 100.0 * np.mean(predicted == expected)

In [34]:
# Fit the classifier on the selected training features, predict the held-out
# split, and print the accuracy as a percentage.
clf = BernoulliNBClassifier()
clf.fit(data_train_transformed, labels_train)
predictions = clf.predict(data_test_transformed, num_label=False)

print(clf.accuracy_score(predictions, labels_test))

96.8421052632
