In [3]:
import numpy as np
import pandas as pd

eps = np.finfo(float).eps
eps

2.220446049250313e-16

In [2]:
Age = 'junior,middle,senior,senior,middle,junior,junior,middle,middle,junior,junior,senior,middle,junior,junior,senior,senior'.split(',')
Married = 'yes,no,no,no,yes,no,yes,yes,no,no,no,yes,yes,no,no,yes,yes'.split(',')
Salary = 'high,low,low,low,high,high,low,high,low,low,high,low,high,high,high,high,high'.split(',')
Home_owner = 'yes,yes,no,yes,yes,yes,yes,no,no,no,yes,no,yes,yes,no,yes,no'.split(',')
Loan_worthy = 'yes,no,no,no,yes,yes,yes,yes,no,no,no,yes,yes,yes,no,yes,yes'.split(',')

df = pd.DataFrame()

df["Age"] = Age
df["Married"] = Married
df['Salary'] = Salary
df["Home_owner"] = Home_owner
df['Loan_worthy'] = Loan_worthy

## Entropy of the dataset

Entropy of a dataset is calculated by:
$$E = -p\times log_{2}(p)$$

Where **p** is the **probability of occurence** of a particular label

In [4]:
def calcualte_entropy_of_dataset(df):
    target = df.keys()[-1]
    
    n_classes,class_counts = np.unique(df[target],return_counts=True)
    
    p = class_counts/np.sum(class_counts)
    
    entropy = -p*np.log2(p+eps)
    
    return np.sum(entropy)


calcualte_entropy_of_dataset(df)

0.977417817528171

## Gini index of the dataset

Gini index of the dataset is calculated by the following:
$$G = \sum p*(1 - p)$$

Where **p** is the **probability of occurence** of a particular label

In [5]:
def gini_of_dataset(df):
    target = df.keys()[-1]
    
    n_classes,class_counts = np.unique(df[target],return_counts=True)
    
    p = class_counts/np.sum(class_counts)
    
    gini = p*(1-p)
    
    return np.sum(gini)


gini_of_dataset(df)

0.4844290657439446

## Entropy of an attribute

In [9]:
def entropy_of_attribute(df,attribute):
    target = df.keys()[-1]
    
    values_in_target = np.unique(df[target])
    
    values_in_attribute = np.unique(df[attribute])
    
    attribute_entropy = 0
    for value_of_attribute in values_in_attribute:
        overall_entropy = 0
        for value_of_target in values_in_target:
            num = len(df[attribute][df[attribute] == value_of_attribute][df[target]==value_of_target])
            den = len(df[attribute][df[attribute]==value_of_attribute])
            
            p = num/(den+eps)
            
            overall_entropy += -p*np.log2(p+eps)
            
        p2 = den/len(df)
        
        attribute_entropy += overall_entropy*p2
        
    return abs(attribute_entropy)

for att in df.keys()[:-1]:
    print(f'{att},{entropy_of_attribute(df,att)}')

Age,0.976829582163908
Married,0.4045788563869161
Salary,0.7800661722847075
Home_owner,0.9240885849733339


## Best attribute of divide

In [10]:
def best_attribute_to_divide(df):
    IG = []
    attributes = df.keys()[:-1]
    target = df.keys()[-1]
    
    for attribute in attributes:
        ig = calcualte_entropy_of_dataset(df) - entropy_of_attribute(df,attribute)
        print(f'{attribute},{ig}')
        IG.append(calcualte_entropy_of_dataset(df) - entropy_of_attribute(df,attribute))
        
        
    return attributes[np.argmax(IG)]

best_attribute_to_divide(df)

Age,0.0005882353642630012
Married,0.572838961141255
Salary,0.1973516452434635
Home_owner,0.053329232554837125


'Married'

# Decesion Trees for regression

In [11]:
present_price = np.array([5.59, 9.54, 9.85, 4.15, 6.87, 9.83, 8.12, 8.61, 8.89, 8.92]).reshape(10, -1)
km_driven = np.array([27000, 43000, 6900, 5200, 42450, 2071, 18796, 33429, 20273, 42376]).reshape(10, -1)
age = np.array([8, 9, 5, 11, 8, 4, 7, 7, 6, 7]).reshape(10, -1)
selling_price = np.array([3.35, 4.75, 7.25, 2.85, 4.6, 9.25, 6.75, 6.5, 8.75, 7.45]).reshape(10, -1)

df = np.concatenate([present_price,km_driven,age,selling_price],axis=1)

## Split the dataset according to the threshold

We will split the dataset by the rows such that:

For every row we will consider a particular index. If the value of that feature in a particular row is greater than the threshold then we will put it on the right else on the left

In [15]:
def split_dataset(df,feature_index,threshold):
    right = []
    left = []
    
    for rows in df:
        if rows[feature_index]>=threshold:
            right.append(rows)
            
        else:
            left.append(rows)
            
    return right,left

split_dataset(df,2,7)

([array([5.59e+00, 2.70e+04, 8.00e+00, 3.35e+00]),
  array([9.54e+00, 4.30e+04, 9.00e+00, 4.75e+00]),
  array([4.15e+00, 5.20e+03, 1.10e+01, 2.85e+00]),
  array([6.870e+00, 4.245e+04, 8.000e+00, 4.600e+00]),
  array([8.1200e+00, 1.8796e+04, 7.0000e+00, 6.7500e+00]),
  array([8.6100e+00, 3.3429e+04, 7.0000e+00, 6.5000e+00]),
  array([8.9200e+00, 4.2376e+04, 7.0000e+00, 7.4500e+00])],
 [array([9.85e+00, 6.90e+03, 5.00e+00, 7.25e+00]),
  array([   9.83, 2071.  ,    4.  ,    9.25]),
  array([8.8900e+00, 2.0273e+04, 6.0000e+00, 8.7500e+00])])

## Variance reduction

We consider the reduction in variance that has happened with the split of the data sets by subtracting the weighted average of the two

In [16]:
def variance_reduction(df,l_child,r_child):
    weight_1 = len(l_child)/len(df)
    weight_2 = len(r_child)/len(df)
    
    var_red = np.var(df) - (weight_1*np.var(l_child) + weight_2*np.var(r_child))
    
    return var_red

## Get the best split

In [26]:
def get_best_split(dataset):
    n_samples,n_features = dataset.shape
    best_split={}
    max_var_red = -float('inf')
    
    for feature_index in range(n_features):
        features_values = dataset[:,feature_index]
        possible_thresholds = np.unique(features_values)
        
        
        for threshold in possible_thresholds:
            dataset_left,dataset_right = split_dataset(dataset,feature_index,threshold)
            
            if(len(dataset_left)>0) and len(dataset_right)>0:
                y,left__y,right__y = dataset[:,-1],dataset_left[:,-1],dataset_right[:,-1]
                
                curr_var_red = variance_reduction(y,left__y,right__y)
                
                if curr_var_red> max_var_red:
                    best_split['feature_index'] = feature_index
                    best_split['threshold'] = threshold
                    best_split['dataset_left'] = dataset_left
                    best_split['dataset_right'] = dataset_right
                    best_split['var_reduced'] = curr_var_red
                    
                    max_var_red=curr_var_red
                     
    return best_split
get_best_split(df)

TypeError: list indices must be integers or slices, not tuple

In [23]:
df[:,-1]

array([3.35, 4.75, 7.25, 2.85, 4.6 , 9.25, 6.75, 6.5 , 8.75, 7.45])