## Load NumPy, pandas and time

In [1]:
import numpy as np
import pandas as pd
import time
from collections import Counter #added python standard library: collections

In [2]:
def create_bins(df,nobins=10,bintype='equal-width'):
    cdf=df.copy()
    numerical_cdf=df.copy()
    for i in ['ID','CLASS']:
        if i in cdf: numerical_cdf=numerical_cdf.drop([i], axis=1)
    numerical_cdf=numerical_cdf.select_dtypes(include=['number'])
    binfunc=[pd.cut,pd.qcut]
    binning={}

    if bintype=="equal-width": a=binfunc[0]
    elif bintype=="equal-size": a=binfunc[1]
    else: a=binfunc[0]

    for i in numerical_cdf:
        res,bins=a(cdf[i],nobins,retbins=True,labels=False,duplicates='drop')
        bins[0]=-np.inf
        bins[-1]=np.inf
        binning[i]=bins
        cdf[i]=res.astype('category')
    return cdf,binning

def apply_bins(df,binning):
    cdf=df.copy()
    numerical_cdf=df.copy()
    for i in ['ID','CLASS']:
        numerical_cdf=numerical_cdf.drop([i], axis=1)
    numerical_cdf=numerical_cdf.select_dtypes(include=['number'])

    for i in numerical_cdf:
        bins=binning[i]
        res,bins=pd.cut(cdf[i],bins,retbins=True,labels=False,duplicates='drop')
        cdf[i]=res.astype('category')
    return cdf


def create_imputation(anneal_train_df):
    anneal_train_df_copy = anneal_train_df.copy()
    imputation_template = {}
    
    df_cols = anneal_train_df_copy.loc[:, ~anneal_train_df_copy.columns.isin(['ID', 'CLASS'])]
    numerical_columns = df_cols.select_dtypes(include=['number']).columns
    non_numerical_columns = df_cols.select_dtypes(include=['category', 'object']).columns

    anneal_train_df_numerical = anneal_train_df_copy.loc[:, numerical_columns]
    for column in anneal_train_df_numerical.columns:
        non_missing_col_data = anneal_train_df_numerical.loc[:,column]
        if(non_missing_col_data.dropna().shape[0] > 0):
            mean_value = non_missing_col_data.mean()
        else:
            mean_value = 0
            
        anneal_train_df_copy[column] = anneal_train_df_copy[column].fillna(mean_value)
        imputation_template[column] = mean_value

    anneal_train_df_categorical = anneal_train_df_copy.loc[:, non_numerical_columns]
    for column in anneal_train_df_categorical.columns:
        non_missing_col_data = anneal_train_df_categorical[column]
        if(non_missing_col_data.dropna().shape[0] > 0):
            mode_value = non_missing_col_data.mode()[0]
        else:
            mode_value = ""
        
        anneal_train_df_copy[column] = anneal_train_df_copy[column].fillna(mode_value)
        imputation_template[column] = mode_value
            
        
    return anneal_train_df_copy, imputation_template

def apply_imputation(anneal_test_df, imputation):
    anneal_test_df_copy = anneal_test_df.copy()
    df_cols = anneal_test_df_copy.loc[:, ~anneal_test_df_copy.columns.isin(['ID', 'CLASS'])]
    
    for column in df_cols.columns:
        anneal_test_df_copy[column] = anneal_test_df_copy[column].fillna(imputation[column])

    return anneal_test_df_copy
    


def create_normalization(file_df,normalizationtype):
    normalization = {}
    new_file_df = file_df.copy()
    
    if (normalizationtype == "minmax"):
        for col_name in file_df:
            if (col_name != "ID" and col_name != "CLASS"):
                min_col = file_df[col_name].min()
                max_col = file_df[col_name].max()
                normalization[col_name] = ("minmax", min_col,max_col)
                new_file_df[col_name] = file_df[col_name].apply(lambda x:(x-min_col)/(max_col-min_col))
        
    elif (normalizationtype == "zscore"):
        for col_name in file_df:
            if (col_name != "ID" and col_name != "CLASS"):
                mean_col = file_df[col_name].mean()
                std_col = file_df[col_name].std()
                normalization[col_name] = ("zscore", mean_col,std_col)
                new_file_df[col_name] = file_df[col_name].apply(lambda x:(x-mean_col)/std_col)
    
    return (new_file_df, normalization)

def apply_normalization(file_df,normalization):
    
    new_file_df = file_df.copy()
    
    for col_name in normalization:
        
        if (normalization[col_name][0] == "minmax"):
            min_col = normalization[col_name][1]
            max_col = normalization[col_name][2]
            new_file_df[col_name] = file_df[col_name].apply(lambda x:(x-min_col)/(max_col-min_col))
        
        elif (normalization[col_name][0] == "zscore"):
            mean_col = normalization[col_name][1]
            std_col = normalization[col_name][2]
            new_file_df[col_name] = file_df[col_name].apply(lambda x:(x-mean_col)/std_col)
    
    return (new_file_df)


def create_one_hot(df_file):
    one_hot_df = df_file.copy()
    one_hot_dict = {}
    for col_name in df_file:
        if (col_name != "CLASS" and col_name != "ID"):
            col = df_file[col_name]
            col_hot = pd.get_dummies(col,prefix = col_name, dtype = "float")
            one_hot_df = pd.concat([one_hot_df, col_hot], axis=1)
            one_hot_df = one_hot_df.drop(col_name, axis=1)
            one_hot_dict[col_name] = set(col)
    
    return (one_hot_df,one_hot_dict)


def apply_one_hot(df_file, one_hot): 
    
    #Gives zero column for categories who doesnt exist on the test df
    
    hot_df_file = df_file.copy()

    for col_name, col_unique_values  in one_hot.items():
        col = df_file[col_name]
        cat = col.astype(pd.api.types.CategoricalDtype(categories= col_unique_values))
        col_hot = pd.get_dummies(cat, prefix = col_name, dtype = "float")
        hot_df_file = pd.concat([hot_df_file, col_hot], axis=1)
        hot_df_file = hot_df_file.drop(col_name, axis=1)
                                    
    return (hot_df_file)

def accuracy(pred_df, label):
    
    score = 0
    for row in range(len(pred_df)):
        pred_row = pred_df.iloc[row,:]
        if pred_row.idxmax() == label[row]:
            score = score + 1
            
    return (score/len(pred_df))


def brier_score(df_file, correctlabels):
    correctlabels_df = pd.DataFrame(correctlabels)
    correctlabels_df = correctlabels_df.astype(pd.api.types.CategoricalDtype(categories= df_file.columns))
    correctlabels_hot = pd.get_dummies(correctlabels_df)

    score = []
    for row in range(np.size(df_file,0)):
        for col in range(np.size(df_file,1)):
            score_each_element = df_file.iloc[row,col]- correctlabels_hot.iloc[row,col]
            score_each_element = np.power(score_each_element, 2)
            score.append(score_each_element)

    brier_score = sum(score)/np.size(df_file,0)
    
    return(brier_score)


def TP_AND_FP(predictions_decoded, correct_labels, positive_label):
    """Takes decoded predictions and labels"""
 
    true_positives = 0
    false_positives = 0
    for prediction, label in zip(predictions_decoded.tolist(), correct_labels):
        if prediction == 1:
            if(label == positive_label):
                true_positives += 1
            else:
                false_positives += 1
           
    num_positives = sum([1 for label in correct_labels if label == positive_label])
    num_negatives = sum([1 for label in correct_labels if label != positive_label])
   
    return true_positives/num_positives, false_positives/num_negatives

def auc_calc_np(tp_fp_dict):
    unique_values = sorted( list(set([value for key, value in tp_fp_dict.items()])) )
    tpr = np.array([value[0] for value in unique_values])
    fpr = np.array([value[1] for value in unique_values])
 
    area = np.trapz(tpr, fpr)
    return area

def binary_auc(predictions, correct_labels, positive_label='A', num_thresholds=10):
    """preprocesses the data into binary predictions, positives vs rest(negatives)"""
    tp_fp_dict = {}
    without_boundaries = np.linspace(0, 1, num_thresholds)
    for threshold in without_boundaries:
        predictions_copy = predictions.copy()
        binary_positive_pred = predictions_copy.apply(lambda x: 1 if x[positive_label] >= threshold else 0, axis=1)
 
        tp_fp_dict[threshold] = TP_AND_FP(binary_positive_pred, correct_labels, positive_label)
 
    return auc_calc_np(tp_fp_dict)
 
def auc(predictions, correct_labels, num_thresholds=100):
    correct_labels = correct_labels.tolist() 
    label_frequencies = {x:correct_labels.count(x)/len(correct_labels) for x in correct_labels}
 
    total_auc = 0
    for label, label_frequency in label_frequencies.items():
        auc = binary_auc(predictions, correct_labels, positive_label=label, num_thresholds=num_thresholds)
        total_auc += auc*label_frequency
 
    return total_auc

## 2. Define the class NaiveBayes

In [8]:
class NaiveBayes:

    def __init__(self):
        self.binning=None
        self.class_priors=None
        self.feature_class_value_counts=None
        self.feature_class_counts=None

    def fit(self,data_frame,nobins=10,bintype="equal_width"):
        data_frame_discrete,self.binning=create_bins(data_frame,nobins,bintype)
        self.class_priors={}
        self.feature_class_value_counts={}
        self.feature_class_counts={}

        #calculates the class priors
        #and saves the results in self.class_priors
        for key,value in data_frame_discrete.groupby("CLASS")["CLASS"]:
            self.class_priors[key]=value.count()/data_frame_discrete["CLASS"].size


        for feature in data_frame_discrete.loc[:, ~data_frame_discrete.columns.isin(["ID",'CLASS'])]:
            feature_map={}

            #calculates the number of training instances with a specific combination of class and feature value
            #and saves the results in self.feature_class_value_counts
            for key,value in data_frame_discrete.groupby([feature,"CLASS"])[feature]:
                feature_map[key]=value.count()
            self.feature_class_value_counts[feature]=feature_map

            #calculates the number of training instances with a specific class value and any value for the feature
            #and saves the results in self.feature_class_counts
            for key,value in data_frame_discrete.groupby(["CLASS"])[feature]:
                self.feature_class_counts[(feature,key)]=value.count()

        return

    def predict(self, data_frame):
        #applys discretization
        data_frame_discrete=apply_bins(data_frame,self.binning)
        posterior_matrix=[]
        for index in range(len(data_frame_discrete)):
            posterior_list=[]
            likelihood={}
            conditional_statement=True
            evidence=0

            for class_value in sorted(data_frame_discrete.CLASS.unique()):
                likelihood_class=0

                for feature in data_frame_discrete.loc[:, ~data_frame_discrete.columns.isin(["ID",'CLASS'])]:
                    value=data_frame_discrete.loc[index,feature]

                    #calculates the relative frequency of the observed feature value given the class
                    if (value,class_value) in self.feature_class_value_counts[feature].keys():
                        likelihood_feature=(self.feature_class_value_counts[feature][(value,class_value)])/(self.feature_class_counts[feature,class_value])

                    # if the frequency of the observed feature value given the class is missing in
                    # the training set the likelihood is set to zero (missing = zero)
                    else:
                        likelihood_class=0
                        break

                    #calculates the likelihood iteratively
                    likelihood_class+=np.log(likelihood_feature)

                likelihood[class_value]=likelihood_class
                
                #calculates the evidence iteratively
                if likelihood_class!=0:
                    conditional_statement=False
                    evidence+=np.exp(likelihood_class+np.log(self.class_priors[class_value]))

            #calculates the probabilities for the classes given the observed data point
            for class_value in sorted(data_frame_discrete.CLASS.unique()):

                #If the sum of non-normalized probabilities is zero the probability(posterior) is set to the class prior
                if conditional_statement:
                    posterior=self.class_priors[class_value]

                #if the likelihood is zero then the posterior is zero
                elif likelihood[class_value]==0:
                    posterior=0

                #calculates the posterior for the other cases
                else:
                    posterior=np.exp(likelihood[class_value]+np.log(self.class_priors[class_value])-np.log(evidence))

                posterior_list+=[posterior]

            posterior_matrix+=[posterior_list]

        data_frame_predict=pd.DataFrame(posterior_matrix, columns = sorted(data_frame_discrete.CLASS.unique()))

        return  data_frame_predict

In [9]:
# Test your code (leave this part unchanged, except for if auc is undefined)

glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

nb_model = NaiveBayes()

test_labels = glass_test_df["CLASS"]

nobins_values = [3,5,10]
bintype_values = ["equal-width","equal-size"]
parameters = [(nobins,bintype) for nobins in nobins_values for bintype in bintype_values]

results = np.empty((len(parameters),3))

for i in range(len(parameters)):
    t0 = time.perf_counter()
    nb_model.fit(glass_train_df,nobins=parameters[i][0],bintype=parameters[i][1])
    print("Training time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    t0 = time.perf_counter()
    predictions = nb_model.predict(glass_test_df)
    print("Testing time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    results[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels),
                  auc(predictions,test_labels)] # Assuming that you have defined auc - remove otherwise

results = pd.DataFrame(results,index=pd.MultiIndex.from_product([nobins_values,bintype_values]),
                       columns=["Accuracy","Brier score","AUC"])

results


Training time (3, 'equal-width'): 0.13 s.
Testing time (3, 'equal-width'): 1.25 s.
Training time (3, 'equal-size'): 0.12 s.
Testing time (3, 'equal-size'): 1.38 s.
Training time (5, 'equal-width'): 0.15 s.
Testing time (5, 'equal-width'): 1.36 s.
Training time (5, 'equal-size'): 0.14 s.
Testing time (5, 'equal-size'): 1.07 s.
Training time (10, 'equal-width'): 0.16 s.
Testing time (10, 'equal-width'): 1.08 s.
Training time (10, 'equal-size'): 0.19 s.
Testing time (10, 'equal-size'): 1.15 s.


Unnamed: 0,Unnamed: 1,Accuracy,Brier score,AUC
3,equal-width,0.616822,0.622116,0.724856
3,equal-size,0.607477,0.554782,0.780478
5,equal-width,0.64486,0.551101,0.770186
5,equal-size,0.598131,0.581556,0.792174
10,equal-width,0.654206,0.527569,0.809678
10,equal-size,0.588785,0.741668,0.723134


In [10]:
train_labels = glass_train_df["CLASS"]
nb_model.fit(glass_train_df)
predictions = nb_model.predict(glass_train_df)
print("Accuracy on training set: {0:.2f}".format(accuracy(predictions,train_labels)))
print("AUC on training set: {0:.2f}".format(auc(predictions,train_labels)))
print("Brier score on training set: {0:.2f}".format(brier_score(predictions,train_labels)))

Accuracy on training set: 0.85
AUC on training set: 0.96
Brier score on training set: 0.23


### Comment on assumptions, things that do not work properly, etc.

According to the Naive Bayes theorem. The likelihood of feature values given a class can be divided into products of likelihoods because we assume the features are independent. 

To avoid underflow. Instead of multiplying each likelihood, used logarithm and exponential to reverse in the posterior. 