In [32]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

In [33]:
data_ini=pd.read_csv('SMSSpamCollection.csv',header=-1, delimiter="\t", quoting=3,names=['labels','data'])

In [34]:
data_ini.head()

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
# labels
data_ini['labels'] = data_ini['labels'].map({'spam':1,'ham':0})


In [36]:
data_ini.head()


Unnamed: 0,labels,data
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
#text to vector
vector = CountVectorizer()
X = vector.fit_transform(data_ini.data)
data=pd.DataFrame(data=X.toarray(),columns=vector.get_feature_names())


In [38]:
data['labels']=data_ini['labels']

In [39]:
X_train, X_test,y_train,y_test = train_test_split(data.iloc[:,:-1],data.iloc[:,-1], test_size=0.33, random_state=42)

In [40]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [41]:
y_true = y_test.values
y_pred = naive_bayes.predict_proba(X_test).argmax(axis=1)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1585
          1       0.93      0.96      0.94       255

avg / total       0.98      0.98      0.98      1840



#### Sklearn bayes' f1 is 0,98. Now let's try to define our own bayes.

## My bayes

In [42]:
def bayes_fit(X_train,y_train,al):
#     P_class count
    X_train=np.array(X_train)
    X_class_0=X_train[y_train==0,:]
    X_class_1=X_train[y_train==1,:]
#     prob_class
    prob_0=X_class_0.sum(axis=0).sum()/(X_class_0.sum(axis=0).sum()+X_class_1.sum(axis=0).sum())
    prob_1=X_class_1.sum(axis=0).sum()/(X_class_0.sum(axis=0).sum()+X_class_1.sum(axis=0).sum())
#     prob_class_condition
    prob_cond_0=(al+X_class_0.sum(axis=0))/(al*X_class_0.sum(axis=0).sum()+X_class_1.sum(axis=0).sum()+X_class_0.sum(axis=0).sum())
    prob_cond_1=(al+X_class_1.sum(axis=0))/(al*X_class_0.sum(axis=0).sum()+X_class_1.sum(axis=0).sum()+X_class_1.sum(axis=0).sum())
    return prob_0,prob_1,prob_cond_0,prob_cond_1


In [43]:
a,b,c,d=bayes_fit(X_train,y_train,0.0003)


In [44]:
print (a.shape, b.shape, c.shape, d.shape)

() () (8713,) (8713,)


In [45]:
c

array([5.53824396e-09, 5.53824396e-09, 1.84663514e-05, ...,
       1.84663514e-05, 5.53824396e-09, 1.84663514e-05])

In [46]:
def median_prob(X_test, prob_0,prob_1,prob_cond_0,prob_cond_1):
    true=0
    prob_0=np.log(prob_cond_0[X_test>0].prod()*prob_0)
    prob_1=np.log(prob_cond_1[X_test>0].prod()*prob_1)
    if prob_0>prob_1:
        true=0
    else:
        true=1
    return true

In [47]:
X_test=np.array(X_test)


In [48]:
X_test[0,:]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [49]:
median_prob(X_test[0,:], a, b, c, d)

0

In [50]:
def predict_prob(X_train, X_test,y_train,al):
    pred=[]
    shape=X_test.shape[0]
    a, b, c, d=bayes_fit(X_train,y_train,al)
    for i in range(0, shape):
        pred.append(median_prob(X_test[i,:], a, b, c, d))
    return pred

In [60]:
y_true = y_test
y_pred = predict_prob(X_train, X_test,y_train,al=1)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      1585
          1       0.98      0.82      0.89       255

avg / total       0.97      0.97      0.97      1840



#### Our f1 is 0,97, that is very close to sklearn bayes (0,98)