In [1]:
import pandas as pd
import numpy as np

In [2]:
#read in the data of bank marketing after cleaning the data in Excel
bank = pd.read_csv('bank-clean.csv')

#first 5 rows of the dataframe
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,58,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,44,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,33,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,47,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,33,unknown,single,unknown,no,no,no,unknown,may,unknown,no


In [3]:
#delete the numeric columns
bank_cat = bank.drop("age",1)

#first 5 rows of the dataframe
bank_cat.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin

#call my class DiscreteNB and inherit from sklearn.base.BaseEstimator and sklearn.base.ClassifierMixin
class DiscreteNB(BaseEstimator, ClassifierMixin):
    
    def __init__(self):
        return None
    
    #split the dataset into train set and test set
    def SplitData(self,df,size,random):
        dfNP=np.array(df)
        X = dfNP[:,:dfNP.shape[1]-1].tolist()
        y = dfNP[:,dfNP.shape[1]-1].tolist()
        new_df=pd.DataFrame({'X':X,"y":y})
        X_train1, X_test1, y_train1, y_test1 = train_test_split(new_df.X, new_df.y, test_size = size, random_state=random)
        X_train=np.array(X_train1.tolist())
        X_test=np.array(X_test1.tolist())
        y_train=y_train1.tolist()
        y_test=y_test1.tolist()
        
        return X_train, X_test, y_train, y_test
    
    
    #define the fit function
    def fit(self, X_train, y_train):
        
        #summary of the label of dataset
        self.labels = np.unique(y_train).tolist()
        
        #summary of the features
        self.features=[]
        for i in range(len(X_train[0])):
            self.features.append(np.unique(X_train[:,i]).tolist())
    
        
        #probability of the label
        self.P_y = {}
        for label in self.labels:
            self.P_y[label] = y_train.count(label)/float(len(y_train))
        
        #probability of meeting both the label and the features
        self.P_xy = {}
        for label in self.P_y.keys():
            y_index = [i for i, result in enumerate(y_train) if result == label]
            for i in range(len(X_train[0])):
                for j in self.features[i]:
                    x_index = [k for k, feature in enumerate(X_train[:,i]) if feature == j]
                    xy_count = len(set(x_index) & set(y_index))
                    pkey = str(j) + '*' + str(label)
                    self.P_xy[pkey] = (xy_count) / float(len(y_train))
        
        #conditional probability
        self.P = {}
        for y in self.P_y.keys():
            for i in range(len(X_train[0])):
                for j in self.features[i]:
                    pkey = str(j) + '|' + str(y)
                    self.P[pkey] = self.P_xy[str(j)+'*'+str(y)] / float(self.P_y[y])#P[X1/Y] = P[X1Y]/P[Y]
                
        return self
    
    
    #define predict function
    def predict(self, X_test, y=None):
        predict_label=[]
        
        #preict the label of test data
        self.F = {}
        for i in range(len(X_test)):
            Test_features = X_test[i]
            for y in self.P_y.keys():
                self.F[y] = self.P_y[y]
                for j in Test_features:
                    self.F[y] = self.F[y]*self.P[str(j)+'|'+str(y)]
                        #P[y|X] = P[X|y]*P[y]/P[X]
                        #P[X] doesn't change
                        #F = P[X/y]*P[y] = P[x1/Y]*P[x2/Y]*P[x3|y]*P[x4|y]...*P[y]
            self.features_label = max(self.F, key=self.F.get)
            predict_label.append(self.features_label)
        return predict_label      
        
    def score(self,predict_label,y_test):
        correct = 0
        for i in range(len(y_test)):
                if predict_label[i] == y_test[i]:
                    correct = correct+1
        print(str("The score of the model is ")+str(correct / float(len(y_test))))
        
        

In [5]:
#use DiscreteNB() on the categorical bank dataset
nb=DiscreteNB()
X_train1, X_test1, y_train1, y_test1=nb.SplitData(bank_cat,0.2,10)
nb.fit(X_train1,y_train1)
predict_result1 = nb.predict(X_test1)
nb.score(predict_result1,y_test1) 

The score of the model is 0.8915183014486343


# Extra

In [6]:
#Add equal width discretization technique to class and build a new class
class NBClassifier(DiscreteNB):
    
    def __init__(self):
        return None
    
    def EqualWidDis(self,df,K):
        cat_index=[]
        for i in range(df.shape[1]):
            a = df.iloc[:,i]
            if a.dtypes == "int64":
                cat_index.append(i)
            
        for i in cat_index:
            DisOneFeature = pd.cut(df.iloc[:,i], K, labels=range(1, K+1))
            DisOneFeature = DisOneFeature.tolist()
            cal_name=df.columns.values[i]
            new_df = df.drop(cal_name,1)
            new_df.insert(i,str(cal_name)+"_discrete",DisOneFeature)
        
        return new_df
    
    #split the dataset into train set and test set
    def SplitData(self,df,size,random):
        dfNP=np.array(df)
        X = dfNP[:,:dfNP.shape[1]-1].tolist()
        y = dfNP[:,dfNP.shape[1]-1].tolist()
        new_df=pd.DataFrame({'X':X,"y":y})
        X_train1, X_test1, y_train1, y_test1 = train_test_split(new_df.X, new_df.y, test_size = size, random_state=random)
        X_train=np.array(X_train1.tolist())
        X_test=np.array(X_test1.tolist())
        y_train=y_train1.tolist()
        y_test=y_test1.tolist()
        
        return X_train, X_test, y_train, y_test
    
    
    #define the fit function
    def fit(self, X_train, y_train):
        
        #summary of the label of dataset
        self.labels = np.unique(y_train).tolist()
        
        #summary of the features
        self.features=[]
        for i in range(len(X_train[0])):
            self.features.append(np.unique(X_train[:,i]).tolist())
    
        
        #probability of the label
        self.P_y = {}
        for label in self.labels:
            self.P_y[label] = y_train.count(label)/float(len(y_train))
        
        #probability of meeting both the label and the features
        self.P_xy = {}
        for label in self.P_y.keys():
            y_index = [i for i, result in enumerate(y_train) if result == label]
            for i in range(len(X_train[0])):
                for j in self.features[i]:
                    x_index = [k for k, feature in enumerate(X_train[:,i]) if feature == j]
                    xy_count = len(set(x_index) & set(y_index))
                    pkey = str(j) + '*' + str(label)
                    self.P_xy[pkey] = (xy_count) / float(len(y_train))
        
        #conditional probability
        self.P = {}
        for y in self.P_y.keys():
            for i in range(len(X_train[0])):
                for j in self.features[i]:
                    pkey = str(j) + '|' + str(y)
                    self.P[pkey] = self.P_xy[str(j)+'*'+str(y)] / float(self.P_y[y])#P[X1/Y] = P[X1Y]/P[Y]
                
        return self
    
    
    #define predict function
    def predict(self, X_test, y=None):
        predict_label=[]
        
        #preict the label of test data
        self.F = {}
        for i in range(len(X_test)):
            Test_features = X_test[i]
            for y in self.P_y.keys():
                self.F[y] = self.P_y[y]
                for j in Test_features:
                    self.F[y] = self.F[y]*self.P[str(j)+'|'+str(y)]
                        #P[y|X] = P[X|y]*P[y]/P[X]
                        #P[X] doesn't change
                        #F = P[X/y]*P[y] = P[x1/Y]*P[x2/Y]*P[x3|y]*P[x4|y]...*P[y]
            self.features_label = max(self.F, key=self.F.get)
            predict_label.append(self.features_label)
        return predict_label      
        
    def score(self,predict_label,y_test):
        correct = 0
        for i in range(len(y_test)):
                if predict_label[i] == y_test[i]:
                    correct = correct+1
        print(str("The score of the model is ")+str(correct / float(len(y_test))))
        

In [7]:
#use subclass NBClassifier to get new dataset
nbc = NBClassifier()
new_df = nbc.EqualWidDis(bank,10)
new_df.head()

Unnamed: 0,age_discrete,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,6,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,4,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,4,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,2,unknown,single,unknown,no,no,no,unknown,may,unknown,no


In [8]:
#use NBClassifier() on the new dataset
X_train2, X_test2, y_train2, y_test2=nbc.SplitData(new_df,0.2,10)
nbc.fit(X_train2,y_train2)
predict_result2 = nbc.predict(X_test2)
nbc.score(predict_result2,y_test2) 

The score of the model is 0.8870949906004645
