In [1]:
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import norm
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# (c)
class GaussianNaiveBayesClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian Naive Bayes classifier.

    For every class, each feature is modelled as an independent univariate
    normal distribution. A sample is assigned to the class that maximises
    ``log P(class) + sum_j log N(x_j | mean_j, variance_j)``.

    Attributes
    ----------
    class_probs : dict
        Class label -> prior probability (fraction of training rows).
    mean : dict
        Class label -> per-feature mean of the training rows of that class.
    variance : dict
        Class label -> per-feature variance (``np.var`` defaults) of the
        training rows of that class.
    classes_ : numpy.ndarray
        Sorted unique class labels seen by the last call to ``fit``.
    """

    # Added to every variance before the square root so that a feature which
    # is constant within a class does not yield a zero standard deviation.
    _VARIANCE_FLOOR = 1e-9

    def __init__(self):
        self.class_probs = {}
        self.mean = {}
        self.variance = {}

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training data (NumPy array, DataFrame, nested list, ...).
        y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self : GaussianNaiveBayesClassifier
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        # Convert up front: positional indexing below must not depend on a
        # pandas index, and DataFrames cannot be indexed with row positions.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{len(X)} != {len(y)}"
            )

        # Build fresh dicts so that refitting never keeps classes (or
        # statistics) left over from a previous call to fit().
        class_probs, mean, variance = {}, {}, {}
        unique_classes = np.unique(y)
        for class_label in unique_classes:
            class_data = X[y == class_label]
            class_probs[class_label] = len(class_data) / len(X)
            mean[class_label] = np.mean(class_data, axis=0)
            variance[class_label] = np.var(class_data, axis=0)

        self.class_probs = class_probs
        self.mean = mean
        self.variance = variance
        self.classes_ = unique_classes
        return self

    def _joint_log_likelihood(self, X, class_label):
        """Return ``log P(class) + log P(x | class)`` for every row of ``X``."""
        std = np.sqrt(self.variance[class_label] + self._VARIANCE_FLOOR)
        # logpdf (rather than log(pdf)) stays finite when the density itself
        # underflows to 0, e.g. for one-hot columns with ~zero class variance.
        log_likelihood = norm.logpdf(X, self.mean[class_label], std).sum(axis=1)
        return np.log(self.class_probs[class_label]) + log_likelihood

    def predict(self, X):
        """Predict the most probable class for every sample in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric samples to classify.

        Returns
        -------
        list
            One predicted class label per sample. Ties are resolved in favour
            of the class that was stored first during ``fit``.

        Raises
        ------
        ValueError
            If the classifier has not been fitted.
        """
        if not self.class_probs:
            raise ValueError(
                "This GaussianNaiveBayesClassifier instance is not fitted yet; "
                "call fit() before predict()."
            )

        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # Treat 1-D input as n samples with a single value each.
            X = X.reshape(-1, 1)

        labels = list(self.class_probs)
        # Shape (n_classes, n_samples); argmax over axis 0 picks, per sample,
        # the first class with the highest joint log-likelihood.
        scores = np.stack([self._joint_log_likelihood(X, label) for label in labels])
        best = np.argmax(scores, axis=0)
        return [labels[i] for i in best]

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy of ``predict(X)`` against ``y``."""
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)


In [3]:
# (d)
# Load the raw table and hold out 20% of the rows for testing (fixed seed).
full_data = pd.read_csv('banking.csv')
train_set, test_set = train_test_split(full_data, test_size=0.2, random_state=42)

# Split the training rows into the target column "y" and the predictors.
train_labels = train_set["y"].copy()
train_data = train_set.drop(columns="y")

#### The categorical columns must be converted to one-hot encoded numeric features so that the classifiers can process them.

In [4]:
# (e)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Pipeline step that picks the numerical or the categorical columns out of a
# DataFrame and hands them to the following steps as a plain NumPy array.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed list of DataFrame columns and return their values."""

    def __init__(self, attribute_names):
        # Names of the columns that transform() extracts.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names].values``."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values
    
    
# let's now combine the numerical and categorical pipelines
num_attribs = ['age','duration','campaign','pdays','previous','emp_var_rate','cons_price_idx',
               'cons_conf_idx','euribor3m','nr_employed']
cat_attribs = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']

# Numerical branch: select columns, fill gaps with the median, then standardise.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# The `sparse` keyword of OneHotEncoder was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new keyword
# first and fall back to the old one on older installations.
# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising, which keeps transform() usable on a test split
# that contains a rare category missing from the training split.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:  # scikit-learn < 1.2
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Categorical branch: select columns, fill gaps with the mode, then one-hot encode.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', one_hot_encoder),
    ])

# and concatenate them with FeatureUnion class
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# this is the final transformation result!
train_prepared = full_pipeline.fit_transform(train_data)

In [5]:
# (f)
# Naive Bayes
# Chain preprocessing and the Naive Bayes classifier into a single estimator,
# so that one fit() call fits the preparation steps and the model together.
full_pipeline_with_predictor = Pipeline(steps=[
    ("preparation", full_pipeline),
    ("NB", GaussianNaiveBayesClassifier()),
])
full_pipeline_with_predictor.fit(train_data, train_labels)

# Split the held-out test rows into the target column "y" and the predictors.
test_labels = test_set["y"].copy()
test_data = test_set.drop("y", axis=1)
print('Accuracy of Naive Bayes: ', full_pipeline_with_predictor.score(test_data, test_labels))

Accuracy of Naive Bayes:  0.8148822529740228


In [6]:
# LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Same preparation steps, with Linear Discriminant Analysis as the final estimator.
full_pipeline_with_predictor2 = Pipeline(steps=[
    ("preparation", full_pipeline),
    ('classifier', LinearDiscriminantAnalysis()),
])
full_pipeline_with_predictor2.fit(train_data, train_labels)

# Accuracy on the held-out test split.
print('Accuracy of LDA: ', full_pipeline_with_predictor2.score(test_data, test_labels))

Accuracy of LDA:  0.9051954357853847


In [7]:
# (g)
# Pairwise correlation matrix of the numerical features (all rows of the dataset).
numerical_data = full_data.loc[:, num_attribs]
numerical_data.corr()

Unnamed: 0,age,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed
age,1.0,-0.000866,0.004594,-0.034369,0.024365,-0.000371,0.000857,0.129372,0.010767,-0.017725
duration,-0.000866,1.0,-0.071699,-0.047577,0.02064,-0.027968,0.005312,-0.008173,-0.032897,-0.044703
campaign,0.004594,-0.071699,1.0,0.052584,-0.079141,0.150754,0.127836,-0.013733,0.135133,0.144095
pdays,-0.034369,-0.047577,0.052584,1.0,-0.587514,0.271004,0.078889,-0.091342,0.296899,0.372605
previous,0.024365,0.02064,-0.079141,-0.587514,1.0,-0.420489,-0.20313,-0.050936,-0.454494,-0.501333
emp_var_rate,-0.000371,-0.027968,0.150754,0.271004,-0.420489,1.0,0.775334,0.196041,0.972245,0.90697
cons_price_idx,0.000857,0.005312,0.127836,0.078889,-0.20313,0.775334,1.0,0.058986,0.68823,0.522034
cons_conf_idx,0.129372,-0.008173,-0.013733,-0.091342,-0.050936,0.196041,0.058986,1.0,0.277686,0.100513
euribor3m,0.010767,-0.032897,0.135133,0.296899,-0.454494,0.972245,0.68823,0.277686,1.0,0.945154
nr_employed,-0.017725,-0.044703,0.144095,0.372605,-0.501333,0.90697,0.522034,0.100513,0.945154,1.0


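#### The correlation matrix shows that several features are highly correlated with each other (e.g. emp_var_rate, euribor3m and nr_employed, with pairwise correlations above 0.9), which violates the conditional-independence assumption of the Naive Bayes classifier. LDA models the covariance between features, so it is less affected by this, and here it performs better than Naive Bayes.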