In [37]:
import operator
import os
import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Shared pretty-printer with a 2-space indent, used in this notebook to display nested dictionaries.
pp = pprint.PrettyPrinter(indent=2)

### Data Preparation


In [38]:
# Location of the specification/significance table. The default is the
# original machine-specific path; set the THINGS_SPEC_CSV environment variable
# to point at the file on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'THINGS_SPEC_CSV',
    '/mnt/c/Users/Satwick Sen Sarma/Documents/IITK/5TH_SEM/THESIS/data/things_specifications_significane.csv',
)
dataset = pd.read_csv(DATA_PATH)
dataset

Unnamed: 0,centering,re-centering,zscoring_residuals,hrf_method,runs_used,mask_method,significant
0,psc,psc,True,mean,6-runs,intersection,No
1,psc,psc,True,mean,5-runs,intersection,No
2,psc,psc,True,mean,5-runs,union,Yes
3,psc,psc,True,mean,4-runs,intersection,No
4,psc,psc,True,mean,4-runs,union,No
...,...,...,...,...,...,...,...
331,center,off,False,mode,5-runs,union,No
332,center,off,False,mode,4-runs,intersection,No
333,center,off,False,mode,4-runs,union,No
334,center,off,False,mode,3-runs,intersection,No


In [39]:
# Rows that were not PSC-centered, had re-centering switched off, used z-scored
# residuals, the mean HRF method, 5 runs, and the intersection mask.
dataset.loc[
    (dataset["centering"] != "psc")
    & (dataset["re-centering"] == "off")
    & (dataset["zscoring_residuals"] == True)
    & (dataset["hrf_method"] == "mean")
    & (dataset["runs_used"] == "5-runs")
    & (dataset["mask_method"] == "intersection")
]

Unnamed: 0,centering,re-centering,zscoring_residuals,hrf_method,runs_used,mask_method,significant
197,z,off,True,mean,5-runs,intersection,Yes
309,center,off,True,mean,5-runs,intersection,Yes


In [40]:
# Binary-encode the outcome column: 'Yes' -> 1, 'No' -> 0.
dataset['is_significant'] = (
    dataset['significant']
    .map({'Yes': 1, 'No': 0})
    .astype(int)
)

In [41]:
# Sum of the is_significant column (the number of rows encoded as 1 when it holds 0/1 values).
dataset['is_significant'].sum()

127

In [42]:
# For every feature column, record the mean of 'is_significant' among the rows
# taking each distinct value of that column (i.e. the share of significant rows).
proportions = {}
# The last two columns are skipped: they are not treated as features here.
for column in dataset.columns[:-2]:
    proportions[column] = {
        value: dataset.loc[dataset[column] == value, 'is_significant'].mean()
        for value in dataset[column].unique()
    }
proportions


{'centering': {'psc': 0.375,
  'z': 0.4107142857142857,
  'center': 0.3482142857142857},
 're-centering': {'psc': 0.2261904761904762,
  'z': 0.40476190476190477,
  're-center': 0.44047619047619047,
  'off': 0.44047619047619047},
 'zscoring_residuals': {True: 0.375, False: 0.38095238095238093},
 'hrf_method': {'mean': 0.38095238095238093, 'mode': 0.375},
 'runs_used': {'6-runs': 0.5,
  '5-runs': 0.4895833333333333,
  '4-runs': 0.21875,
  '3-runs': 0.3645833333333333},
 'mask_method': {'intersection': 0.4010416666666667,
  'union': 0.3472222222222222}}

In [43]:
# Print the dtype of every column of the dataset.
print(dataset.dtypes)

centering             object
re-centering          object
zscoring_residuals      bool
hrf_method            object
runs_used             object
mask_method           object
significant           object
is_significant         int64
dtype: object


In [44]:
# Convert every column of the dataset to the pandas 'category' dtype.
categorical_feats = dataset.columns.tolist()
for col_name in categorical_feats:
    dataset[col_name] = dataset[col_name].astype('category')

# The last column is the class label to predict; the final two columns are
# excluded from the list of candidate split features.
features = categorical_feats[:-2]
determiner = categorical_feats[-1]

print(dataset.dtypes)
print("Determiner Class: ", determiner)

centering             category
re-centering          category
zscoring_residuals    category
hrf_method            category
runs_used             category
mask_method           category
significant           category
is_significant        category
dtype: object
Determiner Class:  is_significant


### Writing Utility Functions

This section contains all the functions needed to determine the decisions in a decision tree. A decision tree is built on the fundamental concept of entropy: the objective is to reduce the overall entropy of the dataset, so at every node we ask which split reduces it the most.

For this we need three important concepts:
* Entropy = $-\sum_{i=1}^{n} p_i \log_2(p_i)$ where $n$ is the number of classes and $p_i$ is the probability of being in class $i$
* Average Information = $-\sum_{c} p_c \sum_{i=1}^{n} p(i|c) \log_2 p(i|c)$ where $p_c$ is the probability of a sample falling in category $c$ of the split feature and $p(i|c)$ is the probability of being in class $i$ given that the sample is in category $c$.
* Information Gain = Entropy - Average Information

We aim to choose features which have the highest information gain for each split in the tree.

In [45]:
def calculate_entropy(feature):
    """Return the Shannon entropy (base 2) of the values in ``feature``.

    Parameters
    ----------
    feature : array-like
        Sequence of class labels; each distinct value is treated as one class.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value; ``0`` when ``feature`` has no elements.
    """
    _, class_counts = np.unique(feature, return_counts=True)
    n_samples = np.sum(class_counts)

    entropy = 0
    for class_count in class_counts:
        p = class_count / n_samples
        entropy -= p * np.log2(p)
    return entropy

In [46]:
def information_gain(data, split_feature, root_feature):
    """Return the information gain of splitting ``data`` on ``split_feature``.

    The gain is the entropy of ``root_feature`` over all rows minus the
    weighted average of its entropy inside each group of rows that share a
    value of ``split_feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both columns.
    split_feature : str
        Column whose distinct values define the groups.
    root_feature : str
        Column holding the class label whose entropy is measured.

    Returns
    -------
    number
        Entropy before the split minus the average information after it.
    """
    total_entropy = calculate_entropy(data[root_feature])

    attributes, counts = np.unique(data[split_feature], return_counts=True)
    n_rows = np.sum(counts)

    average_information = 0
    for attribute, count in zip(attributes, counts):
        # Select the target values with a boolean mask on the split column only.
        # Masking the whole frame with `where(...)` and then calling `dropna()`
        # would also discard matching rows that have a missing value in any
        # unrelated column, while `count` still includes them in the weight.
        # Missing target values are still dropped, as before.
        group_target = data.loc[data[split_feature] == attribute, root_feature].dropna()
        average_information += (count / n_rows) * calculate_entropy(group_target)

    return total_entropy - average_information

In [47]:
def split_dataset(data, feature, param):
    """Return the rows of ``data`` whose ``feature`` column equals ``param``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter.
    feature : str
        Name of the column to compare.
    param : object
        Value the column must equal for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels and column dtypes.
    """
    # Boolean-mask selection keeps every matching row. The previous
    # `where(...).dropna()` form also removed matching rows that had a missing
    # value in any other column and upcast integer columns to float.
    return data.loc[data[feature] == param]

In [48]:
# Entropy of the target column, followed by the information gain obtained by
# splitting the full dataset on each candidate feature.
print("Entropy: ", calculate_entropy(dataset[determiner]))
for feature_name in features:
    print("Class: ", feature_name, "\t IF: ", information_gain(dataset, feature_name, determiner))

Entropy:  0.9566001197282556
Class:  centering 	 IF:  0.0020097337807856164
Class:  re-centering 	 IF:  0.025479059472607313
Class:  zscoring_residuals 	 IF:  2.7176777207205127e-05
Class:  hrf_method 	 IF:  2.7176777207205127e-05
Class:  runs_used 	 IF:  0.04117514661145827
Class:  mask_method 	 IF:  0.0021836932175107737


### Recursive ID3 Algorithm

In [49]:
def ID3(data, features, target_name=determiner):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to partition; must contain ``target_name`` and every column
        listed in ``features``.
    features : list
        Column names still available for splitting.
    target_name : str
        Column holding the class label. The default is the value
        ``determiner`` had when this function was defined.

    Returns
    -------
    dict, label or None
        ``{feature: {value: subtree_or_label}}`` for an internal node, a class
        label for a leaf, or ``None`` when ``data`` contains no labels.
    """
    labels, label_counts = np.unique(data[target_name], return_counts=True)

    # No rows reached this node: there is nothing to predict from.
    if len(labels) == 0:
        return None
    # Pure node: every remaining row carries the same label.
    if len(labels) == 1:
        return labels[0]
    # All features are used up but the labels are still mixed: fall back to
    # the most frequent label (ties resolve to the first label in sorted
    # order) instead of calling argmax on an empty list of gains.
    if len(features) == 0:
        return labels[np.argmax(label_counts)]

    # Identify which feature to use for splitting
    feature_info_gain = [information_gain(data, feature, target_name) for feature in features]
    best_feature = features[np.argmax(feature_info_gain)]
    tree = {best_feature: {}}

    # The chosen feature is not offered again further down this branch
    remaining_features = [name for name in features if name != best_feature]
    for param in np.unique(data[best_feature]):
        # Rows belonging to this category of the best feature
        subdata = split_dataset(data, best_feature, param)
        # Recurse and attach the resulting sub-tree (or leaf) to the node
        tree[best_feature][param] = ID3(subdata, remaining_features, target_name)
    return tree

In [50]:
# Display the dataset as it stands before the tree is built.
dataset

Unnamed: 0,centering,re-centering,zscoring_residuals,hrf_method,runs_used,mask_method,significant,is_significant
0,psc,psc,True,mean,6-runs,intersection,No,0
1,psc,psc,True,mean,5-runs,intersection,No,0
2,psc,psc,True,mean,5-runs,union,Yes,1
3,psc,psc,True,mean,4-runs,intersection,No,0
4,psc,psc,True,mean,4-runs,union,No,0
...,...,...,...,...,...,...,...,...
331,center,off,False,mode,5-runs,union,No,0
332,center,off,False,mode,4-runs,intersection,No,0
333,center,off,False,mode,4-runs,union,No,0
334,center,off,False,mode,3-runs,intersection,No,0


In [51]:
# Build the decision tree from the dataset and the candidate feature list,
# then pretty-print the resulting nested dictionary.
dtree = ID3(dataset, features)
pp.pprint(dtree)

{ 'runs_used': { '3-runs': { 'mask_method': { 'intersection': 0,
                                              'union': { 're-centering': { 'off': 1,
                                                                           'psc': { 'centering': { 'center': 0,
                                                                                                   'psc': 1,
                                                                                                   'z': 0}},
                                                                           're-center': { 'centering': { 'center': { 'zscoring_residuals': { False: 0,
                                                                                                                                             True: 1}},
                                                                                                         'psc': 1,
                                                                                                      