In [1]:
import numpy as np
import pandas as pd

In [2]:
eps = np.finfo(float).eps
eps

2.220446049250313e-16

# Generating Dataset

In [3]:
Age = 'junior,middle,senior,senior,middle,junior,junior,middle,middle,junior,junior,senior,middle,junior,junior,senior,senior'.split(',')
Married = 'yes,no,no,no,yes,no,yes,yes,no,no,no,yes,yes,no,no,yes,yes'.split(',')
Salary = 'high,low,low,low,high,high,low,high,low,low,high,low,high,high,high,high,high'.split(',')
Home_owner = 'yes,yes,no,yes,yes,yes,yes,no,no,no,yes,no,yes,yes,no,yes,no'.split(',')
Loan_worthy = 'yes,no,no,no,yes,yes,yes,yes,no,no,no,yes,yes,yes,no,yes,yes'.split(',')

In [4]:
df = pd.DataFrame()

In [14]:
df["Age"] = Age
df["Married"] = Married
df['Salary'] = Salary
df["Home_owner"] = Home_owner
df['Loan_worthy'] = Loan_worthy

In [15]:
df.shape

(17, 5)

In [16]:
df

Unnamed: 0,Age,Married,Salary,Home_owner,Loan_worthy
0,junior,yes,high,yes,yes
1,middle,no,low,yes,no
2,senior,no,low,no,no
3,senior,no,low,yes,no
4,middle,yes,high,yes,yes
5,junior,no,high,yes,yes
6,junior,yes,low,yes,yes
7,middle,yes,high,no,yes
8,middle,no,low,no,no
9,junior,no,low,no,no


### Finding entropy of the entire dataset

In [17]:
def entire_entropy(df):
    target = df.keys()[-1]
    values,counts = np.unique(df[target],return_counts=True)
    p = counts/np.sum(counts)
    entropy = -p*np.log2(p+eps) 
    return np.sum(entropy)
entire_entropy(df)

0.977417817528171

### Finding the gini index of the dataset

In [18]:
def entire_gini(df):
    target = df.keys()[-1]
    values,counts = np.unique(df[target],return_counts=True)
    p = counts/np.sum(counts)
    
    return np.sum(p*(1-p))
    
entire_gini(df)

0.4844290657439446

### Entropy of attribute

In [19]:
def entropy_attribute(df,attribute):
    target = df.keys()[-1]
    values_in_target = np.unique(df[target])
    
    values_in_attribute = np.unique(df[attribute])
    attribute_entropy = 0
    
    for value_a in values_in_attribute:
        overall_entropy = 0
        for value__t in values_in_target:
            num = len(df[attribute][df[attribute]==value_a][df[target]==value__t])
            den = len(df[attribute][df[attribute]==value_a])
            p = num/(den+eps)
            overall_entropy += -p*np.log2(p+eps)
            
        p2 = den/len(df)
        attribute_entropy += p2*overall_entropy
    return abs(attribute_entropy)

In [20]:
entropy_attribute(df,'Age')

0.976829582163908

In [21]:
def best_attribute_to_divide(df):
    target = df.keys()[-1]
    attributes = df.keys()[:-1]
    E = []
    for attribute in attributes:
        E.append(entropy_attribute(df,attribute))
    best_attribute = np.argmin(E)
    return attributes[best_attribute]
best_attribute_to_divide(df)

'Married'

# Decesion Trees For Rergression

In [22]:
present_price = np.array([5.59, 9.54, 9.85, 4.15, 6.87, 9.83, 8.12, 8.61, 8.89, 8.92]).reshape(10, -1)
km_driven = np.array([27000, 43000, 6900, 5200, 42450, 2071, 18796, 33429, 20273, 42376]).reshape(10, -1)
age = np.array([8, 9, 5, 11, 8, 4, 7, 7, 6, 7]).reshape(10, -1)
selling_price = np.array([3.35, 4.75, 7.25, 2.85, 4.6, 9.25, 6.75, 6.5, 8.75, 7.45]).reshape(10, -1)

In [23]:
df = np.concatenate([present_price,km_driven,age,selling_price],axis=1)

In [24]:
df

array([[5.5900e+00, 2.7000e+04, 8.0000e+00, 3.3500e+00],
       [9.5400e+00, 4.3000e+04, 9.0000e+00, 4.7500e+00],
       [9.8500e+00, 6.9000e+03, 5.0000e+00, 7.2500e+00],
       [4.1500e+00, 5.2000e+03, 1.1000e+01, 2.8500e+00],
       [6.8700e+00, 4.2450e+04, 8.0000e+00, 4.6000e+00],
       [9.8300e+00, 2.0710e+03, 4.0000e+00, 9.2500e+00],
       [8.1200e+00, 1.8796e+04, 7.0000e+00, 6.7500e+00],
       [8.6100e+00, 3.3429e+04, 7.0000e+00, 6.5000e+00],
       [8.8900e+00, 2.0273e+04, 6.0000e+00, 8.7500e+00],
       [8.9200e+00, 4.2376e+04, 7.0000e+00, 7.4500e+00]])

### Split function

In [25]:
def split(dataset,features_index,threshold):
    right = []
    left = []
    for rows in dataset:
        if rows[features_index]>=threshold:
            right.append(rows)
        else:
            left.append(rows)
        
    return np.array(right),np.array(left)
data_right,data_left = split(df,0,6)

In [26]:
def variance_reduction(df,l_child,r_child):
    weight_l = len(l_child)/len(df)
    weight_2 = len(r_child)/len(df)
    
    reduction = np.var(df) - (weight_l*np.var(l_child) + weight_2*np.var(r_child))
    
    return reduction
variance_reduction(df[:,-1],data_left[:,-1],data_right[:,-1])

2.3256249999999996

In [27]:
def node_value(Y):
    return np.sum(Y)/len(Y)

node_value(data_left[:,-1])

3.1

In [40]:
def get_best_split(dataset):
    n_samples,n_features = dataset.shape
    best_split={}
    max_var_red = -float('inf')
    
    for feature_index in range(n_features):
        features_values = dataset[:,feature_index]
        possible_thresholds = np.unique(features_values)
        
        
        for threshold in possible_thresholds:
            dataset_left,dataset_right = split(dataset,feature_index,threshold)
            
            if(len(dataset_left)>0) and len(dataset_right)>0:
                y,left__y,right__y = dataset[:,-1],dataset_left[:,-1],dataset_right[:,-1]
                
                curr_var_red = variance_reduction(y,left__y,right__y)
                
                if curr_var_red> max_var_red:
                    best_split['feature_index'] = feature_index
                    best_split['threshold'] = threshold
                    best_split['dataset_left'] = dataset_left
                    best_split['dataset_right'] = dataset_right
                    best_split['var_reduced'] = curr_var_red
                    
                    max_var_red=curr_var_red
                     
    return best_split
    
    
    
    

In [41]:
get_best_split(df)

{'feature_index': 2,
 'threshold': 8.0,
 'dataset_left': array([[5.590e+00, 2.700e+04, 8.000e+00, 3.350e+00],
        [9.540e+00, 4.300e+04, 9.000e+00, 4.750e+00],
        [4.150e+00, 5.200e+03, 1.100e+01, 2.850e+00],
        [6.870e+00, 4.245e+04, 8.000e+00, 4.600e+00]]),
 'dataset_right': array([[9.8500e+00, 6.9000e+03, 5.0000e+00, 7.2500e+00],
        [9.8300e+00, 2.0710e+03, 4.0000e+00, 9.2500e+00],
        [8.1200e+00, 1.8796e+04, 7.0000e+00, 6.7500e+00],
        [8.6100e+00, 3.3429e+04, 7.0000e+00, 6.5000e+00],
        [8.8900e+00, 2.0273e+04, 6.0000e+00, 8.7500e+00],
        [8.9200e+00, 4.2376e+04, 7.0000e+00, 7.4500e+00]]),
 'var_reduced': 3.412604166666666}