
Author = Vedika Srivastava <br>
Date Created = 12/04/2021<br>
Title = Decision_Tree


In [14]:
#import necessary libraries
import numpy as np
import pandas as pd
from pprint import pprint

In [5]:
#define data set (one list of 14 values per column; the last column is the class label)
data = dict(
    Age=["<=30","<=30","31-40",">40",">40",">40","31-40",
         "<=30","<=30",">40","<=30","31-40","31-40",">40"],
    Income=["high","high","high","medium","low","low","low",
            "medium","low","medium","medium","medium","high","medium"],
    Student=['no','no','no','no','yes','yes','yes',
             'no','yes','yes','yes','no','yes','no'],
    Credit_rating=['fair','excellent','fair','fair','fair','excellent','low',
                   'fair','fair','fair','excellent','excellent','fair','excellent'],
    Buys_computer=['no','yes','yes','yes','yes','no','yes',
                   'no','yes','yes','yes','yes','yes','no'],
)

#column order follows the insertion order of the dict above
df = pd.DataFrame(data,columns=list(data))

In [6]:
#display dataset (bare last expression, so the notebook renders the frame as a table)
df

Unnamed: 0,Age,Income,Student,Credit_rating,Buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,yes
2,31-40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31-40,low,yes,low,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [104]:
#defining function
def entropy_target(tar,dataset=None):
    """
    Function is used to calculate the overall (Shannon, base 2) entropy of the
    target column
    ________________________________
    Parameters -
    tar : name of the target column (required, no default)
    dataset : default value = None
              Dataframe holding the column `tar`. When omitted, the
              notebook-level `df` is looked up at call time, so a later
              re-assignment of `df` is honoured.
    ________________________________
    Return -
    E_target : entropy of the target column (0 for an empty column)
    """
    if dataset is None:
        # Resolved lazily instead of via a def-time default, which would keep
        # pointing at whatever frame `df` was bound to when this cell ran.
        dataset = df
    column = dataset[tar]
    counts = column.value_counts()   # computed once instead of once per class
    total = len(column)
    E_target = 0
    # Classes are visited in order of first appearance so the floating-point
    # sum is accumulated in the same order as before.
    for value in column.unique():
        frac = counts[value]/total
        E_target += -frac * np.log2(frac)
    return(E_target)



def entropy_attribute(attribute,dataset=None):
    """
    Function is used to calculate entropy of an attribute present in the data set,
    i.e. the size-weighted average entropy of the target within each attribute value
    ________________________________
    Parameters -
    attribute : contains the attribute of dataset whose entropy is to be calculated
    dataset : default value = None
              Dataframe to use; its LAST column is taken as the target.
              When omitted, the notebook-level `df` is looked up at call time.
    ________________________________
    Return -
    Ea : stores entropy of attribute
    """
    if dataset is None:
        # Resolved lazily so the function does not capture a stale frame.
        dataset = df
    tar = dataset.columns[-1]
    classes = dataset[tar].unique()
    # eps keeps log2 finite when a class is absent from a partition (frac == 0)
    eps = np.finfo(float).eps
    Ea = 0 #entropy of attribute
    for value in dataset[attribute].unique():
        Ef = 0  #entropy of each feature
        d = dataset[dataset[attribute] == value]
        for t in classes:
            # filter on the target of THIS dataset, not on a notebook global
            frac = len(d[d[tar]==t])/len(d)
            Ef += -frac * np.log2(frac + eps)
        Ea += (len(d)/len(dataset))*Ef
    return(Ea)



def information_gain(attribute,E_s = None,E_p = None):
    """
    Function to return information gain of an attribute
    ________________________________
    Parameters -
    attribute : contains the attribute of dataset whose IG is to be calculated
    E_s : default value = None
          overall entropy; when omitted the notebook-level E_target is used
    E_p : default value = None
          mapping attribute -> attribute entropy; when omitted the
          notebook-level E_predictors is used
    ________________________________
    Return -
    E_s - E_p[attribute]
    """
    # The notebook-level values are looked up at call time: they are created in
    # a later cell, so using them as def-time defaults fails on a fresh kernel.
    if E_s is None:
        E_s = E_target
    if E_p is None:
        E_p = E_predictors
    return(E_s - E_p[attribute])



def winner(data):
    """
    Function to return the winner attribute using IG
    (the predictor column with the largest information gain)
    ________________________________
    Parameters -
    data : stores dataset; every column except the last is a candidate
           attribute, the last column is the target
    ________________________________
    Return -
    name of the winning column (np.argmax keeps the first one on ties)
    """
    candidates = data.keys()[:-1]
    base_entropy = entropy_target(data.columns[-1],data)
    gains = [base_entropy - entropy_attribute(name,data) for name in candidates]
    return candidates[np.argmax(gains)]

In [58]:
#variables: the last column is the class label, every column before it is a predictor
target = df.columns[-1]
predictors = df.columns[:-1].tolist()

In [78]:
#Entropies: overall entropy of the target, then one entropy per predictor
E_target = entropy_target(target)
E_predictors = dict(zip(predictors, map(entropy_attribute, predictors)))
print("Overall entropy E(s) =",E_target)
print("Attribute entropies are \n",E_predictors)

Overall entropy E(s) = 0.863120568566631
Attribute entropies are 
 {'Age': 0.6935361388961914, 'Income': 0.8571428571428564, 'Student': 0.7884504573082889, 'Credit_rating': 0.8103555691390284}


In [86]:
#information gain of every predictor, keyed by column name
Ig = dict(zip(predictors, map(information_gain, predictors)))
print("Information gains are \n",Ig)

Information gains are 
 {'Age': 0.16958442967043963, 'Income': 0.005977711423774568, 'Student': 0.07467011125834211, 'Credit_rating': 0.0527649994276026}


In [115]:
#building the decision tree
def build(df,tree = None):
    """
    Function to build the decision tree recursively as nested dictionaries
    of the form {attribute: {attribute value: subtree or class label}}
    ________________________________
    Parameters -
    df : dataframe to split; its LAST column is taken as the target
    tree : default value = None
           optional dictionary to add the node to; a new one is created
           when omitted
    ________________________________
    Return -
    tree : dictionary holding the node chosen by winner() and its branches
    """
    tar = df.columns[-1]   # target of THIS frame, not a notebook global
    node = winner(df)
    if tree is None :
        tree = {}
    # setdefault also covers a caller-supplied tree that has no entry for node
    branches = tree.setdefault(node, {})
    for val in np.unique(df[node]):
        d = df[df[node]==val].reset_index(drop=True)
        cl,count = np.unique(d[tar],return_counts = True)
        # Leaf when the subset is pure, or when the split made no progress
        # (subset as large as its parent, e.g. identical rows with different
        # labels); recursing there would never terminate. The most frequent
        # class is used, which is cl[0] for a pure subset.
        if len(count)==1 or len(d)==len(df):
            branches[val] = cl[np.argmax(count)]
        else:
            branches[val] = build(d)
    return tree

In [116]:
#build the tree for the full dataset and pretty-print the nested dictionary
t = build(df)
pprint(t)

{'Age': {'31-40': 'yes',
         '<=30': {'Student': {'no': {'Credit_rating': {'excellent': 'yes',
                                                       'fair': 'no'}},
                              'yes': 'yes'}},
         '>40': {'Credit_rating': {'excellent': 'no', 'fair': 'yes'}}}}


#  <center><br><br>Decision Tree - Function</center>

In [121]:
def decision_tree(df):
    """
    Function to build a decision tree (attribute chosen by information gain)
    for a dataframe and pretty-print it as nested dictionaries of the form
    {attribute: {attribute value: subtree or class label}}
    ________________________________
    Parameters -
    df : dataframe whose LAST column is the target and whose other columns
         are the candidate attributes
    ________________________________
    Return -
    None : the tree is printed with pprint, nothing is returned
    """
    def entropy_target(dataset):
        """
        Function is used to calculate overall entropy of the target column
        ________________________________
        Parameters -
        dataset : stores passed dataframe (last column = target)
        ________________________________
        Return -
        E_target : entropy of the target column
        """
        column = dataset[dataset.columns[-1]]
        counts = column.value_counts()   # computed once instead of once per class
        E_target = 0
        for value in column.unique():
            frac = counts[value]/len(column)
            E_target += -frac * np.log2(frac)
        return(E_target)



    def entropy_attribute(attribute,dataset):
        """
        Function is used to calculate entropy of an attribute present in the data set
        ________________________________
        Parameters -
        attribute : contains the attribute of dataset whose entropy is to be calculated
        dataset : stores passed dataframe (last column = target)
        ________________________________
        Return -
        Ea : stores entropy of attribute
        """
        tar = dataset.columns[-1]
        Ea = 0 #entropy of attribute
        for value in dataset[attribute].unique():
            Ef = 0  #entropy of each feature
            d = dataset[dataset[attribute] == value]
            for t in dataset[tar].unique():
                # filter on the target of the passed frame, not a notebook global
                frac = len(d[d[tar]==t])/len(d)
                # eps keeps log2 finite when a class is absent (frac == 0)
                Ef += -frac * np.log2(frac + np.finfo(float).eps)
            Ea += (len(d)/len(dataset))*Ef
        return(Ea)



    def information_gain(attribute,dataset):
        """
        Function to return information gain of an attribute
        ________________________________
        Parameters -
        attribute : contains the attribute of dataset whose IG is to be calculated
        dataset : stores passed dataframe
        ________________________________
        Return -
        information gain of the attribute
        """
        # entropy_attribute returns a single number, so it is subtracted
        # directly rather than indexed by attribute
        return(entropy_target(dataset) - entropy_attribute(attribute,dataset))



    def winner(data):
        """
        Function to return the winner attribute using IG
        ________________________________
        Parameters -
        data : stores dataset
        ________________________________
        Return -
        name of the attribute with the largest information gain
        (np.argmax keeps the first one on ties)
        """
        predictors = data.keys()[:-1]
        IG = [information_gain(k,data) for k in predictors]
        return (predictors[np.argmax(IG)])


    def build(subset):
        """
        Function to build the tree for `subset` recursively
        """
        tar = subset.columns[-1]
        node = winner(subset)
        branches = {}
        for val in np.unique(subset[node]):
            d = subset[subset[node]==val].reset_index(drop=True)
            cl,count = np.unique(d[tar],return_counts = True)
            # Leaf when the partition is pure, or when the split made no
            # progress (partition as large as its parent, e.g. identical rows
            # with different labels); recursing there would never terminate.
            # The most frequent class is used, which is cl[0] when pure.
            if len(count)==1 or len(d)==len(subset):
                branches[val] = cl[np.argmax(count)]
            else:
                branches[val] = build(d)
        return {node: branches}

    t = build(df)
    pprint(t)

#  <center><br><br>FINAL OUTPUT</center>

In [119]:
#data set (column name -> 14 values; the last column is the class label)
data = dict([
    ('Age', ["<=30","<=30","31-40",">40",">40",">40","31-40",
             "<=30","<=30",">40","<=30","31-40","31-40",">40"]),
    ('Income', ["high","high","high","medium","low","low","low",
                "medium","low","medium","medium","medium","high","medium"]),
    ('Student', ['no','no','no','no','yes','yes','yes',
                 'no','yes','yes','yes','no','yes','no']),
    ('Credit_rating', ['fair','excellent','fair','fair','fair','excellent','low',
                       'fair','fair','fair','excellent','excellent','fair','excellent']),
    ('Buys_computer', ['no','yes','yes','yes','yes','no','yes',
                       'no','yes','yes','yes','yes','yes','no']),
])

#column order follows the insertion order of the dict above
df = pd.DataFrame(data,columns=[*data])

df

Unnamed: 0,Age,Income,Student,Credit_rating,Buys_computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,yes
2,31-40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31-40,low,yes,low,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [122]:
#Decision tree of the dataset (decision_tree pretty-prints the nested dictionary itself)
decision_tree(df)

{'Age': {'31-40': 'yes',
         '<=30': {'Student': {'no': {'Credit_rating': {'excellent': 'yes',
                                                       'fair': 'no'}},
                              'yes': 'yes'}},
         '>40': {'Credit_rating': {'excellent': 'no', 'fair': 'yes'}}}}
