# Mastering Decision Trees

### 2.3. Hands-on Scikit-learn


- Data Exploration


In [4]:
import pandas as pd
import numpy as np
import random

# rename your dataset to car-price.csv
df =pd.read_excel("car-price.xlsx")


# we will take two variables,
# we will use doornumber as our target
# and the others as our indpendent variables
df = df[['drivewheel','fueltype','aspiration','doornumber','carbody']]

df.sample(n=10)

Unnamed: 0,drivewheel,fueltype,aspiration,doornumber,carbody
92,fwd,gas,std,four,sedan
52,fwd,gas,std,two,hatchback
87,fwd,gas,turbo,four,sedan
27,fwd,gas,turbo,two,sedan
101,fwd,gas,std,four,sedan
113,rwd,gas,std,four,wagon
29,fwd,gas,turbo,two,hatchback
120,fwd,gas,std,four,hatchback
152,fwd,gas,std,four,hatchback
165,rwd,gas,std,two,sedan


- Calculating Gini Index

In [6]:
# function will calculate gini_index for each column
# function from scratch
# in a dataframe
# and print out the best column to split on
import pandas as pd
import numpy as np

def gini_index(dataset, targetcol):
    
    # store all of our columns and gini scores
    gini_scores = []
    
    # iterate through each column in your dataframe
    for col in dataset.columns:
        
        # skip our target column
        # no information gain on target columns!
        # we can't split here
        if col == targetcol:
            continue
        
        # resets for each column in your dataset
        gini = 0
        
        # get the value counts for that column
        unique_values = dataset[col].value_counts()
        
        # iterate through each unique value for that column
        for key, val in unique_values.items():
        
            # get the target variable separated, based on
            # the independent variable
            filteredDf = dataset[targetcol][dataset[col] == key].value_counts()
            
            # need n for the length
            n = len(dataset)
            
            # sum of the value counts for that column
            ValueSum = filteredDf.sum()
            
            # need the probabilities of each class
            p = 0
            
            # we now have to send it to our gini impurity formula
            for i, j in filteredDf.items():
                p += (filteredDf[i] / ValueSum) ** 2
            
            # gini total for column 
            # is all uniques from each column
            gini += (val / n) * (1-p)

        print(f'Variable {col} has Gini Index of {round(gini,4)}\n')
        
        # append our column name and gini score
        gini_scores.append((col, gini))
    
    # sort our gini scores lowest to highest
    split_pair = sorted(gini_scores, key=lambda x: -x[1], reverse=True)[0]
    
    # print out the best score
    print(f'''Split on {split_pair[0]} With Gini Index of {round(split_pair[1],3)}''')
        
        
final = gini_index(df, 'doornumber')


Variable drivewheel has Gini Index of 0.4865

Variable fueltype has Gini Index of 0.4745

Variable aspiration has Gini Index of 0.4921

Variable carbody has Gini Index of 0.2137

Split on carbody With Gini Index of 0.214


- Calculating Entropy

In [7]:
import numpy as np
import pandas as pd
import math

def entropy(dataset, targetcol):
    # store all of our columns and gini scores
    entropy_scores = []
    
    # iterate through each column in your dataframe
    for col in dataset.columns:
        
        if col == targetcol:
            continue
        
        # get the value_counts normalized, saving us having to iterate through
        # each variable
        value_counts = dataset[col].value_counts(normalize=True, sort=False)
        
        # calculate our entropy for the column
        entropy = -(value_counts * np.log(value_counts) / np.log(math.e)).sum()
        
        print(f'Variable {col} has Entropy of {round(entropy,4)}\n')
        
        # append our column name and gini score
        entropy_scores.append((col,entropy))
    
    # sort our gini scores lowest to highest
    split_pair = sorted(entropy_scores, key=lambda x: -x[1], reverse=True)[0]
    
    # print out the best score
    print(f'''Split on {split_pair[0]} With Information Gain of {round(1-split_pair[1],3)}''')
        
        

final = entropy(df, 'carbody')
final

Variable drivewheel has Entropy of 0.8186

Variable fueltype has Entropy of 0.3197

Variable aspiration has Entropy of 0.4721

Variable doornumber has Entropy of 0.6857

Split on fueltype With Information Gain of 0.68


In [8]:
from datetime import datetime
start_time = datetime.now()
# do your work here
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:00
