In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## DATA PREPORATION

In [2]:
data_raw=pd.read_csv('SMSSpamCollection.csv',header=-1, delimiter="\t", quoting=3,names=['labels','data'])

In [3]:
data_raw.head()

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# encode labels
data_raw['labels'] = data_raw['labels'].map({'spam':1,'ham':0})

In [5]:
#Transform text to vec
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_raw.data)
data=pd.DataFrame(data=X.toarray(),columns=vectorizer.get_feature_names())


In [6]:
data['labels']=data_raw['labels']

In [7]:
X_train, X_test = train_test_split(data, test_size=0.33, random_state=42)

## CHECK SKYLEARN BAYES FIRST

In [8]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train.iloc[:,:-1], X_train.iloc[:,-1])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
y_true = X_test.iloc[:,-1].values
y_pred = naive_bayes.predict_proba(X_test.iloc[:,:-1]).argmax(axis=1)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1585
          1       0.93      0.96      0.94       255

avg / total       0.98      0.98      0.98      1840



## CREATE OWN BAYES

In [21]:
class my_bayes_v1(object):
    def __init__(self):
        print ('NOTE:INPUT DATA HAS TO BE PANDAS DATA FRAME TYPE!!!')
        print ('Labeled data has to be a last column and call: labels')
        print ('Alpha=0.001')
        self.P_w_class = 0
        self.P_class = 0
        self.alfa=0.001
    def fit(self,X_train):
        #split classes
        Clas=[X_train[X_train['labels']==0].iloc[:,:-1],
              X_train[X_train['labels']==1].iloc[:,:-1]]
        X_train=X_train.iloc[:,:-1]
        #P(clas)
        P_class=[Clas[0].shape[0]/X_train.shape[0],
                 Clas[1].shape[0]/X_train.shape[0]]
        #P_w_class
        P_w_class=[self.alfa+Clas[0].sum()/(self.alfa*X_train.sum().sum()+Clas[0].sum().sum()),
                   self.alfa+Clas[1].sum()/(self.alfa*X_train.sum().sum()+Clas[1].sum().sum())]
        self.P_w_class=P_w_class
        self.P_class=P_class
    def proba(self,X):
        prob=[self.P_w_class[0][X>0].product()*self.P_class[0],
              self.P_w_class[1][X>0].product()*self.P_class[1]]
        #requlize prob
        prob=[prob[0]/(prob[1]+prob[0]),prob[1]/(prob[1]+prob[0])]
        return prob
    def predict_proba(self,X):
        n=[]
        if np.array(X.shape).shape[0]==1:
            X=pd.DataFrame(data=X.values.reshape(1,X.shape[0]),columns=X.index)
        for i in range(0,X.shape[0]):
            n.append(self.proba(X.iloc[i,:]))
        return np.array(n)
    def predict(self,X):
        prob=self.predict_proba(X)
        a=np.array([(prob[:,0]==prob.max(axis=1))+0,
           (prob[:,1]==prob.max(axis=1))+0])
        return a.T


In [16]:
a=my_bayes_v1()

NOTE:INPUT DATA HAS TO BE PANDAS DATA FRAME TYPE!!!
Labeled data has to be a last column and call: labels
Alpha=0.001


In [17]:
a.alfa=0.00001

In [18]:
a.fit(X_train)

In [19]:
y_true = X_test.iloc[:,-1].values
y_pred = a.predict(X_test.iloc[:,:-1]).argmax(axis=1)
res=f1_score(y_true, y_pred, average='micro')
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1585
          1       0.96      0.93      0.95       255

avg / total       0.99      0.99      0.99      1840



## Best F1 score ~98, custom bayes resuls are near to skylearn bayes results 