## Load NumPy, pandas, time and copy

In [1]:
import numpy as np
import pandas as pd
import time
import copy # for copying dictionaries, standard python library

In [2]:
def create_bins(df,nobins=10,bintype='equal-width'):
    """Discretise the numeric feature columns of ``df`` into integer bin codes.

    Columns named 'ID' and 'CLASS' are never binned, and non-numeric columns
    are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    nobins : int
        Number of bins requested per column.
    bintype : str
        "equal-size" bins with ``pd.qcut``; "equal-width" and any other value
        bin with ``pd.cut``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The copy with every numeric column replaced by its bin codes stored as
        a 'category' column, and a dict mapping column name to the array of
        bin edges, whose first and last entries are widened to -inf / +inf.
    """
    binned = df.copy()

    # Anything that is not explicitly "equal-size" falls back to equal-width.
    cutter = pd.qcut if bintype == "equal-size" else pd.cut

    excluded = [name for name in ('ID', 'CLASS') if name in df]
    numeric_columns = df.drop(excluded, axis=1).select_dtypes(include=['number']).columns

    binning = {}
    for column in numeric_columns:
        codes, edges = cutter(binned[column], nobins, retbins=True,
                              labels=False, duplicates='drop')
        # Open the outer edges so values outside the training range still
        # land in the first / last bin when the edges are re-applied.
        edges[0] = -np.inf
        edges[-1] = np.inf
        binning[column] = edges
        binned[column] = codes.astype('category')

    return binned, binning

def apply_bins(df,binning):
    """Discretise the numeric feature columns of ``df`` with existing bin edges.

    Columns named 'ID' and 'CLASS' are never binned; either may be absent
    (for example an unlabeled test set without 'CLASS').

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    binning : dict
        Maps column name to the sequence of bin edges to use for that column,
        in the form returned by ``create_bins``.

    Returns
    -------
    pandas.DataFrame
        The copy with every numeric feature column replaced by its bin codes
        stored as a 'category' column.

    Raises
    ------
    KeyError
        If a numeric feature column has no entry in ``binning``.
    """
    binned = df.copy()

    # errors='ignore' keeps this usable on frames that lack ID and/or CLASS.
    candidates = df.drop(columns=['ID', 'CLASS'], errors='ignore')
    numeric_columns = candidates.select_dtypes(include=['number']).columns

    for column in numeric_columns:
        codes = pd.cut(binned[column], binning[column], labels=False, duplicates='drop')
        binned[column] = codes.astype('category')

    return binned


def create_imputation(anneal_train_df):
    """Fill missing feature values and record the fill value used per column.

    Columns named 'ID' and 'CLASS' are left untouched. Numeric columns are
    filled with their mean (0 when the column has no observed value);
    'category' and 'object' columns are filled with their most frequent value
    ("" when the column has no observed value). Columns of any other dtype are
    neither filled nor recorded.

    Parameters
    ----------
    anneal_train_df : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    (pandas.DataFrame, dict)
        The filled copy and a dict mapping column name to its fill value.
    """
    imputed = anneal_train_df.copy()
    imputation_template = {}

    candidates = imputed.loc[:, ~imputed.columns.isin(['ID', 'CLASS'])]
    numerical_columns = candidates.select_dtypes(include=['number']).columns
    non_numerical_columns = candidates.select_dtypes(include=['category', 'object']).columns

    for column in numerical_columns:
        values = imputed[column]
        mean_value = values.mean() if values.notna().any() else 0
        imputed[column] = values.fillna(mean_value)
        imputation_template[column] = mean_value

    for column in non_numerical_columns:
        values = imputed[column]
        mode_value = values.mode()[0] if values.notna().any() else ""

        # A categorical column only accepts fill values that are among its
        # categories; the "" fallback of an all-missing column is not, so
        # register it before filling.
        if (isinstance(values.dtype, pd.api.types.CategoricalDtype)
                and mode_value not in values.cat.categories):
            values = values.cat.add_categories([mode_value])

        imputed[column] = values.fillna(mode_value)
        imputation_template[column] = mode_value

    return imputed, imputation_template

def apply_imputation(anneal_test_df, imputation):
    """Fill missing feature values using previously recorded fill values.

    Columns named 'ID' and 'CLASS' are left untouched, and so are columns
    that contain no missing value.

    Parameters
    ----------
    anneal_test_df : pandas.DataFrame
        Input frame; it is copied, not modified.
    imputation : dict
        Maps column name to the value used to fill that column, in the form
        returned by ``create_imputation``.

    Returns
    -------
    pandas.DataFrame
        The filled copy.

    Raises
    ------
    KeyError
        If a column with missing values has no entry in ``imputation``.
    """
    imputed = anneal_test_df.copy()
    feature_columns = imputed.columns[~imputed.columns.isin(['ID', 'CLASS'])]

    for column in feature_columns:
        values = imputed[column]
        if not values.isna().any():
            # Nothing to fill, so no fill value is required for this column.
            continue
        if column not in imputation:
            raise KeyError("no imputation value recorded for column {!r}".format(column))

        fill_value = imputation[column]
        # A categorical column only accepts fill values that are among its
        # categories; the recorded value may be one this frame never contains.
        if (isinstance(values.dtype, pd.api.types.CategoricalDtype)
                and fill_value not in values.cat.categories):
            values = values.cat.add_categories([fill_value])

        imputed[column] = values.fillna(fill_value)

    return imputed


def accuracy(pred_df, label):
    """Return the fraction of rows whose highest-scoring class equals the true label.

    Rows of ``pred_df`` and entries of ``label`` are matched by position, so
    the index of ``label`` (if it has one) is irrelevant.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        One row per instance, one column per class; the predicted class of a
        row is the label of the column holding its largest value.
    label : sequence
        True class labels in the same order as the rows of ``pred_df``.

    Returns
    -------
    float
        Share of correctly classified rows, or NaN when ``pred_df`` is empty.
    """
    n_rows = len(pred_df)
    if n_rows == 0:
        return float('nan')

    predicted = pred_df.idxmax(axis=1).tolist()
    actual = list(label)

    hits = sum(1 for position in range(n_rows) if predicted[position] == actual[position])
    return hits/n_rows


def brier_score(df_file, correctlabels):
    """Return the multi-class Brier score of a table of predicted probabilities.

    The true labels are one-hot encoded against the columns of ``df_file``;
    the score is the sum of squared differences between prediction and
    encoding over all cells, divided by the number of rows. Rows and labels
    are matched by position.

    Parameters
    ----------
    df_file : pandas.DataFrame
        One row per instance, one column per class label.
    correctlabels : sequence
        True class labels in the same order as the rows of ``df_file``. A
        label that is not a column of ``df_file`` is encoded as all zeros.

    Returns
    -------
    float
        The Brier score, or NaN when ``df_file`` has no rows.
    """
    probabilities = df_file.to_numpy(dtype=float)
    n_rows = probabilities.shape[0]
    if n_rows == 0:
        return float('nan')

    # Encoding against the prediction columns keeps the one-hot columns in the
    # same order as the probabilities they are compared with.
    label_dtype = pd.api.types.CategoricalDtype(categories=df_file.columns)
    true_labels = pd.DataFrame(correctlabels).astype(label_dtype)
    one_hot = pd.get_dummies(true_labels).to_numpy(dtype=float)

    return float(np.square(probabilities - one_hot).sum() / n_rows)


## 1. Define the class DecisionTree

In [4]:
class DecisionTree():
    """Decision tree over discretised features, stored as nested dicts.

    Every node is a dict with the entries
      * 'class_prob' - relative class frequencies (pandas Series) of the
        training rows that reached the node,
      * 'isleaf'     - whether descent stops at this node,
      * 'feature'    - internal nodes only: the column the node splits on,
      * one entry per observed value of that feature, mapping to the child node.

    NOTE(review): children share the dict with the bookkeeping entries above,
    so a feature value equal to 'class_prob', 'isleaf' or 'feature' would
    collide with them.
    """

    def __init__(self, class_label = 'CLASS'):
        """Create an unfitted tree; ``class_label`` names the target column."""
        self.class_label = class_label
        self.binning = None      # column -> bin edges, set by fit
        self.imputation = None   # column -> fill value, set by fit
        self.labels = None       # class labels seen by fit
        self.model = None        # root node dict, set by fit

    def class_entropy(self, dataframe):
        """Return the base-2 entropy of the class column of ``dataframe``."""
        column = dataframe[self.class_label]
        column_len = column.shape[0]

        def weighted_log(col_val):
            Pi = column[column == col_val].shape[0]/column_len
            return Pi*np.log2(Pi)

        return - sum([weighted_log(col_val) for col_val in column.unique()])

    def feature_entropy(self, dataframe, feature, feature_val):
        """Return the class entropy of the rows where ``feature == feature_val``."""
        return self.class_entropy(dataframe[dataframe[feature] == feature_val])

    def info_gain(self, dataframe, feature):
        """Return the information gain of splitting ``dataframe`` on ``feature``.

        This is the class entropy of the whole frame minus the size-weighted
        class entropy of the partitions induced by the feature's values.
        """
        S_0 = self.class_entropy(dataframe)

        column = dataframe[feature]
        column_len = column.shape[0]

        def weighted_entropy(col_val):
            N_i = column[column == col_val].shape[0]
            S_col_val = self.feature_entropy(dataframe, feature, col_val)
            return S_col_val * N_i/column_len

        return S_0 - sum([weighted_entropy(col_val) for col_val in column.unique()])

    def split_on_feature(self, dataframe, node_dict, feature, min_samples_split):
        """Attach one child node per value of ``feature`` to ``node_dict``.

        Each child records its class distribution. A child whose rows all
        share one class becomes a leaf; any other child is grown further on
        its rows with ``feature`` removed from the columns.
        """
        column = dataframe[feature]
        dataframe_without_col = dataframe.loc[:, ~dataframe.columns.isin([feature])]

        for value in column.unique():
            child = dict()
            node_dict[value] = child

            dataframe_split = dataframe_without_col[column == value]
            class_prob = dataframe_split[self.class_label].value_counts() / dataframe_split.shape[0]
            child['class_prob'] = class_prob

            if max(class_prob) == 1:
                child['isleaf'] = True
            else:
                child['isleaf'] = False
                self.grow_tree(dataframe_split, child, min_samples_split)

    def grow_tree(self, dataframe, node_dict, min_samples_split):
        """Grow the subtree rooted at ``node_dict`` from the rows of ``dataframe``.

        The node becomes a leaf when it holds fewer than ``min_samples_split``
        rows or when no feature column is left to split on; otherwise it is
        split on the feature with the largest information gain (the first
        such feature in column order on ties).

        Returns
        -------
        dict
            ``node_dict`` itself, filled in place.
        """
        class_prob = dataframe[self.class_label].value_counts() / dataframe.shape[0]
        node_dict['class_prob'] = class_prob

        if dataframe.shape[0] < min_samples_split:
            node_dict['isleaf'] = True
            return node_dict

        max_info_gain = -np.inf
        max_info_gain_feature = None
        for feature in dataframe.columns:
            if feature == self.class_label:
                continue

            info_gain = self.info_gain(dataframe, feature)
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                max_info_gain_feature = feature

        if max_info_gain_feature is None:
            # Nothing to split on: make the node an explicit leaf so that
            # prediction never meets a node without an 'isleaf' entry.
            node_dict['isleaf'] = True
            return node_dict

        node_dict['feature'] = max_info_gain_feature
        node_dict['isleaf'] = False
        # split_on_feature fills node_dict in place and returns nothing, so
        # its result must not replace the node being returned.
        self.split_on_feature(dataframe, node_dict, max_info_gain_feature, min_samples_split)
        return node_dict

    def fit(self, dataframe, nobins=10, bintype="equal-width", min_samples_split = 5):
        """Discretise and impute ``dataframe``, then grow the tree on it.

        Stores the bin edges, the fill values, the class labels and the root
        node on the instance. The column 'ID' is not used as a feature.
        """
        dataframe_discrete, self.binning = create_bins(dataframe, nobins, bintype)
        dataframe_imputation, self.imputation = create_imputation(dataframe_discrete)

        self.labels = dataframe[self.class_label].astype('category').unique()

        feature_cols = dataframe_imputation.loc[:, ~dataframe_imputation.columns.isin(['ID'])]

        self.model = dict()
        self.grow_tree(feature_cols, self.model, min_samples_split)

    def propagate_tree(self, root_node, row):
        """Return the class distribution of the node that ``row`` ends up in.

        Descent starts at ``root_node`` and stops at a leaf, at a node without
        a split feature, or when the row's value for the split feature has no
        child. If the final node carries no class distribution of its own,
        the unweighted mean of its children's distributions is returned.
        """
        node = root_node
        while not node['isleaf']:
            if 'feature' not in node:
                break

            feature_val = row[node['feature']]
            if feature_val not in node:
                break
            node = node[feature_val]

        if 'class_prob' in node:
            return node['class_prob']

        child_probs = [child['class_prob'] for key, child in node.items()
                       if key not in ('isleaf', 'feature')
                       and isinstance(child, dict) and 'class_prob' in child]
        if not child_probs:
            return pd.Series(dtype=float)

        return pd.concat(child_probs, axis=1).sum(axis=1)/len(child_probs)

    def predict(self, dataframe):
        """Return a frame of class probabilities, one row per row of ``dataframe``.

        The bin edges and fill values stored by ``fit`` are applied first.
        Columns are the class labels seen by ``fit``; classes absent from the
        node a row ends up in get probability 0.
        """
        dataframe_disc = apply_bins(dataframe, self.binning)
        dataframe_imp = apply_imputation(dataframe_disc, self.imputation)

        feature_cols = dataframe_imp.loc[:, ~dataframe_imp.columns.isin(['ID'])]

        df_predictions = pd.DataFrame(0, index=dataframe_imp.index, columns=self.labels, dtype=float)
        for index, row in feature_cols.iterrows():
            df_predictions.loc[index] = self.propagate_tree(self.model, row)

        return df_predictions.fillna(0)

## 2. Define the class DecisionForest

In [8]:
class DecisionForest(DecisionTree):
    """Ensemble of decision trees, each grown on a random subset of the features.

    ``self.model`` holds the fitted ``DecisionTree`` objects. Every tree is
    trained on all rows; the only randomisation is the per-tree feature
    subset (no row resampling is performed).
    """

    def __init__(self, class_label = 'CLASS'):
        """Create an unfitted forest; ``class_label`` names the target column."""
        super().__init__(class_label=class_label)

    def tree_fit(self, tree, dataframe_boot, nobins=10, bintype="equal-width", min_samples_split = 5):
        """Grow ``tree`` on ``dataframe_boot``, ignoring a column named 'ID'.

        The frame is used as given (no binning or imputation happens here);
        ``nobins`` and ``bintype`` are accepted but not used.
        """
        feature_cols = dataframe_boot.loc[:, ~dataframe_boot.columns.isin(['ID'])]

        tree.model = dict()
        tree.grow_tree(feature_cols, tree.model, min_samples_split)

    def fit(self, dataframe, nobins=10, bintype="equal-width", min_samples_split=5, random_features=2, notrees=10,
            random_state=None):
        """Discretise and impute ``dataframe``, then grow ``notrees`` trees.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Training data including the class column.
        nobins, bintype
            Passed on to ``create_bins``.
        min_samples_split : int
            Nodes with fewer rows than this are not split.
        random_features : int
            Number of feature columns drawn (without replacement) for each
            tree; capped at the number of available feature columns.
        notrees : int
            Number of trees to grow.
        random_state : int or None
            Seed for the feature sampling. With None the draws come from
            NumPy's global random state.
        """
        dataframe_discrete, self.binning = create_bins(dataframe, nobins, bintype)
        dataframe_imputation, self.imputation = create_imputation(dataframe_discrete)

        self.labels = dataframe[self.class_label].astype('category').unique()

        all_columns = dataframe_imputation.columns
        candidate_features = all_columns[~all_columns.isin(['ID', self.class_label])]
        # Asking for more features than exist would make the draw fail.
        features_per_tree = min(random_features, len(candidate_features))

        rng = np.random if random_state is None else np.random.RandomState(random_state)

        self.model = []
        for _ in range(notrees):
            chosen_features = list(rng.choice(candidate_features, features_per_tree, replace=False))

            # 'ID' is not selected: tree_fit would discard it again anyway.
            tree_df = dataframe_imputation.loc[:, chosen_features + [self.class_label]]

            tree_model = DecisionTree(class_label=self.class_label)
            self.tree_fit(tree_model, tree_df, nobins, bintype, min_samples_split)
            self.model.append(tree_model)

    def tree_predict(self, tree, dataframe_disc):
        """Return the class probabilities ``tree`` assigns to each row of ``dataframe_disc``.

        The frame is used as given. Columns of the result are the class
        labels seen by ``fit``; classes a row's node does not contain get 0.
        """
        df_predictions = pd.DataFrame(0, index=dataframe_disc.index, columns=self.labels, dtype=float)
        for index, row in dataframe_disc.iterrows():
            df_predictions.loc[index] = tree.propagate_tree(tree.model, row)

        return df_predictions.fillna(0)

    def predict(self, dataframe):
        """Return the mean of the per-tree class probabilities for each row.

        The bin edges and fill values stored by ``fit`` are applied first.
        """
        dataframe_disc = apply_bins(dataframe, self.binning)
        dataframe_imp = apply_imputation(dataframe_disc, self.imputation)

        feature_cols = dataframe_imp.loc[:, ~dataframe_imp.columns.isin(['ID'])]

        df_predictions = pd.DataFrame(0, index=dataframe.index, columns=self.labels, dtype=float)
        for tree in self.model:
            df_predictions += self.tree_predict(tree, feature_cols)

        df_predictions = df_predictions/len(self.model)

        return df_predictions.fillna(0)
    

In [11]:
# Evaluate the forest on the glass data for every combination of
# min_samples_split and random_features, recording accuracy and Brier score.
glass_train_df = pd.read_csv("glass_train.txt")
glass_test_df = pd.read_csv("glass_test.txt")
test_labels = glass_test_df["CLASS"]

# The forest draws each tree's feature subset from NumPy's global random
# state; fixing the seed makes the table below reproducible across runs.
np.random.seed(0)

forest_model = DecisionForest()

min_samples_split_values = [1,2,5]
random_features_values = [1,2,5]

parameters = [(min_samples_split,random_features) for min_samples_split in min_samples_split_values
              for random_features in random_features_values]

# One row per parameter combination: [accuracy, Brier score].
scores = np.empty((len(parameters),2))

for i, (split_size, n_features) in enumerate(parameters):
    t0 = time.perf_counter()
    forest_model.fit(glass_train_df,min_samples_split=split_size,random_features=n_features)
    print("Training time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    t0 = time.perf_counter()
    predictions = forest_model.predict(glass_test_df)
    print("Testing time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    scores[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels)]

# Index the table by the evaluated (min_samples_split, random_features) pairs
# themselves so rows and labels cannot drift apart.
results = pd.DataFrame(scores,index=pd.MultiIndex.from_tuples(parameters),
                       columns=["Accuracy","Brier score"])

results

Training time (1, 1): 0.46 s.
Testing time (1, 1): 0.99 s.
Training time (1, 2): 1.69 s.
Testing time (1, 2): 0.99 s.
Training time (1, 5): 9.70 s.
Testing time (1, 5): 1.19 s.
Training time (2, 1): 0.47 s.
Testing time (2, 1): 1.08 s.
Training time (2, 2): 2.02 s.
Testing time (2, 2): 1.04 s.
Training time (2, 5): 9.92 s.
Testing time (2, 5): 1.20 s.
Training time (5, 1): 0.41 s.
Testing time (5, 1): 1.05 s.
Training time (5, 2): 2.07 s.
Testing time (5, 2): 0.97 s.
Training time (5, 5): 6.18 s.
Testing time (5, 5): 0.97 s.


Unnamed: 0,Unnamed: 1,Accuracy,Brier score
1,1,0.64486,0.584572
1,2,0.663551,0.510137
1,5,0.700935,0.412483
2,1,0.588785,0.61699
2,2,0.700935,0.518253
2,5,0.672897,0.439244
5,1,0.53271,0.599499
5,2,0.64486,0.491858
5,5,0.64486,0.455518


In [12]:
# Refit with min_samples_split=1 and score the forest on its own training data.
train_labels = glass_train_df["CLASS"]
forest_model.fit(glass_train_df, min_samples_split=1)
predictions = forest_model.predict(glass_train_df)

for template, metric in (("Accuracy on training set: {0:.2f}", accuracy),
                         ("Brier score on training set: {0:.2f}", brier_score)):
    print(template.format(metric(predictions, train_labels)))

Accuracy on training set: 0.85
Brier score on training set: 0.36


### Comment on assumptions, things that do not work properly, etc.