## Build Decision Tree to Classify Salary Based on Other Variables

### Only consider splits at the root

In [1]:
import pandas as pd
import numpy as np

In [2]:
# One row per person; every attribute is stored as an integer category code.
salary_columns = ['Occupation', 'Gender', 'Age', 'Salary']
salary_records = np.array([
    [1, 1, 3, 3],
    [1, 2, 1, 1],
    [1, 2, 2, 2],
    [2, 2, 1, 3],
    [2, 2, 2, 3],
    [2, 1, 2, 4],
    [2, 1, 3, 4],
    [3, 1, 3, 3],
    [3, 2, 2, 2],
    [4, 1, 4, 2],
    [4, 2, 1, 1],
])
df = pd.DataFrame(salary_records, columns=salary_columns)
df

Unnamed: 0,Occupation,Gender,Age,Salary
0,1,1,3,3
1,1,2,1,1
2,1,2,2,2
3,2,2,1,3
4,2,2,2,3
5,2,1,2,4
6,2,1,3,4
7,3,1,3,3
8,3,2,2,2
9,4,1,4,2


In [3]:
## Calculate entropy of salary
# Salary is the classification target; later cells read both `Salary` and `h`.
Salary = df['Salary']

# Share of rows at each salary level, listed in level order.
proportion = Salary.value_counts() / len(Salary)
prop1 = proportion.sort_index()
print(prop1)

# Replace each share p by its Shannon term -p*log2(p); the terms add up to the entropy.
prop1 = -prop1 * np.log2(prop1)
h = sum(prop1)
print('Enrtopy before splitting is ', h)

1    0.181818
2    0.272727
3    0.363636
4    0.181818
Name: Salary, dtype: float64
Enrtopy before splitting is  1.9362600275315274


###### Entropies on candidate splits

In [4]:
## Consider splitting by Gender
# Candidate split attribute: the Gender column of the example frame.
Gender = df['Gender']
def entropy(Attribute, target=None, base_entropy=None):
    """Print the weighted entropy and information gain of splitting on ``Attribute``.

    The rows are partitioned by the distinct values of ``Attribute`` (one branch
    per value). For every branch the base-2 Shannon entropy of ``target`` is
    computed, and the branch entropies are combined using the share of rows in
    each branch as weights.

    Parameters
    ----------
    Attribute : pandas.Series
        Candidate split column. Its ``name`` appears in the printed report.
        Labels may be any sortable values; they do not have to be the
        integers 1..k.
    target : pandas.Series, optional
        Outcome column, matched to ``Attribute`` by index label. Defaults to
        the notebook-level ``Salary`` series.
    base_entropy : float, optional
        Entropy of the outcome before splitting, used for the information
        gain. Defaults to the notebook-level ``h``.

    Returns
    -------
    None
        The entropy terms, branch weights, overall entropy and information
        gain are printed, not returned.
    """
    # Fall back to the notebook globals this function has always read implicitly.
    if target is None:
        target = Salary
    if base_entropy is None:
        base_entropy = h

    # Use the labels that are actually present instead of assuming the codes
    # 1..k (attribute) and 1..4 (outcome). Sorting reproduces the branch/level
    # order of the 1-based integer coding used in this notebook.
    branches = sorted(Attribute.dropna().unique())
    levels = sorted(target.dropna().unique())

    # Outcome values for exactly the rows of Attribute, matched by index label.
    outcome = target.loc[Attribute.index]
    n_rows = len(Attribute)

    prob = []  # share of rows that fall into each branch
    e = []     # flat, branch-major list of -p*log2(p) terms
    for branch in branches:
        in_branch = (Attribute == branch).to_numpy()
        branch_size = int(in_branch.sum())
        prob.append(branch_size / n_rows)  ## proportion of each branch
        branch_outcome = outcome[in_branch]
        for level in levels:  ## each level of outcome within this branch
            count = int((branch_outcome == level).sum())
            if count == 0:
                # A level that never occurs in the branch contributes nothing
                # (limit of -p*log2(p) as p -> 0).
                entropies = 0
            else:
                propAtt = count / branch_size
                # For a pure branch (propAtt == 1) this evaluates to -0.0.
                entropies = -np.log2(propAtt) * propAtt
            e.append(entropies)

    ## transforming list output to matrix
    # entropy matrix is n*m: n child nodes of this attribute, m outcome levels
    ent = np.array(e, dtype=float).reshape((len(branches), len(levels)))
    wgt = np.array(prob, dtype=float).reshape((len(branches), 1))
    branch_ent = np.sum(ent, axis=1)

    ## Combine entropies for each branch with corresponding proportion
    ## Use matrix multiplication; the result is a one-element array
    Hs = np.matmul(branch_ent, wgt)
    print('Entropy matrix', e)
    print('Weight of each child note', wgt)
    print('Overall entropy of', Attribute.name, 'is', Hs)
    print('Information Gained represented by split on', Attribute.name, 'is', base_entropy - Hs)
# Report the entropy and information gain for a root split on Gender.
entropy(Gender)

Entropy matrix [0, 0.46438561897747244, 0.5287712379549449, 0.5287712379549449, 0.5283208335737187, 0.5283208335737187, 0.5283208335737187, 0]
Weight of each child note [[0.45454545]
 [0.54545455]]
Overall entropy of Gender is [1.5563105]
Information Gained represented by split on Gender is [0.37994953]


In [5]:
## Apply the above function on splitting by Age
Age=df['Age']
entropy(Attribute=Age)

Entropy matrix [0.38997500048077083, 0, 0.5283208335737187, 0, 0, 0.5, 0.5, 0.5, 0, 0, 0.38997500048077083, 0.5283208335737187, 0, -0.0, 0, 0]
Weight of each child note [[0.27272727]
 [0.36363636]
 [0.27272727]
 [0.09090909]]
Overall entropy of Age is [1.04634318]
Information Gained represented by split on Age is [0.88991685]


In [6]:
## Consider splitting by Occupation
Occupation=df['Occupation']
entropy(Attribute=Occupation)

Entropy matrix [0.5283208335737187, 0.5283208335737187, 0.5283208335737187, 0, 0, 0, 0.5, 0.5, 0, 0.5, 0.5, 0, 0.5, 0.5, 0, 0]
Weight of each child note [[0.27272727]
 [0.36363636]
 [0.18181818]
 [0.18181818]]
Overall entropy of Occupation is [1.15953523]
Information Gained represented by split on Occupation is [0.7767248]


#### Conclusion: Splitting on Age gives the highest information gain (0.88991685, versus 0.7767248 for Occupation and 0.37994953 for Gender), so Age is chosen as the first split.