<a href="https://colab.research.google.com/github/RajeeAravindh1/ir/blob/main/Classic_Example_Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [158]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier    
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report


# Column-oriented toy dataset: 14 observations, one list per attribute,
# with "PlayTennis" holding the class label for each day.
dict1 = {
    "Day": [f"D{day_number}" for day_number in range(1, 15)],
    "Outlook": [
        "Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast",
        "Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain",
    ],
    "Temp": [
        "Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool",
        "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild",
    ],
    "Humidity": [
        "High", "High", "High", "High", "Normal", "Normal", "Normal",
        "High", "Normal", "Normal", "Normal", "High", "Normal", "High",
    ],
    "Wind": [
        "Weak", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong",
        "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong",
    ],
    "PlayTennis": [
        "No", "No", "Yes", "Yes", "Yes", "No", "Yes",
        "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No",
    ],
}




In [159]:
# Full table, including the "Day" row identifier.
df = pd.DataFrame.from_dict(dict1)

# Working table for the tree: every column except the "Day" identifier.
# The column is dropped by name rather than by position so the intent is
# explicit, and `drop` returns a new frame, leaving `df` untouched.
# (A bare `df` used to sit here; only the last expression of a cell is
# rendered, so it displayed nothing and has been removed.)
new_df = df.drop(columns="Day")
new_df

Unnamed: 0,Outlook,Temp,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [160]:
def entropy_before_split(new_df):
    """Return the Shannon entropy (in bits) of the target column.

    The target column is taken to be the last column of ``new_df``.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    number
        ``-sum(p_i * log2(p_i))`` over the classes that actually occur.
        The result is 0 when the column is empty or holds a single class.

    Notes
    -----
    Rows whose label is missing are ignored: probabilities are computed
    over the labelled rows only.
    """
    target = new_df[new_df.columns[-1]]

    # Count every class once up front; the counts are loop-invariant, so
    # there is no need to recompute them for each class.
    class_counts = target.value_counts()
    # value_counts() leaves out missing labels, so normalise by the number
    # of labelled rows to keep the probabilities summing to 1.
    labelled_rows = class_counts.sum()

    entropy = 0
    for count in class_counts:
        if count == 0:
            # Unused category of a categorical column: it contributes nothing
            # (and 0 * log2(0) would otherwise evaluate to NaN).
            continue
        probability_i = count / labelled_rows
        entropy += -probability_i * np.log2(probability_i)
    return entropy

In [161]:
def entropy_after_split(new_df, feature):
    """Return the weighted entropy of the target after splitting on ``feature``.

    For every distinct value of ``feature`` the entropy of the target (the
    last column of ``new_df``) is computed on the matching rows; the
    per-value entropies are then averaged, weighted by the share of rows
    each value covers.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Table whose last column holds the class labels.
    feature : str
        Name of the column to split on.

    Returns
    -------
    number
        The weighted entropy in bits; 0 for an empty table.
    """
    target_col_name = new_df.columns[-1]
    total_rows = new_df.shape[0]

    weighted_entropy = 0
    for feat_uniq_val in new_df[feature].unique():
        # Select this branch's labels once and reuse them for the class
        # counts and the branch size, instead of re-filtering the frame
        # for every quantity.
        branch_labels = new_df.loc[new_df[feature] == feat_uniq_val, target_col_name]
        branch_rows = branch_labels.shape[0]

        entropy_of_feat_uniq_val = 0  # entropy of the target within this branch
        for count in branch_labels.value_counts():
            if count == 0:
                # Category of a categorical target that does not occur in
                # this branch; 0 * log2(0) would otherwise yield NaN.
                continue
            prob_val = count / branch_rows
            entropy_of_feat_uniq_val += -prob_val * np.log2(prob_val)

        # Weight the branch entropy by the fraction of rows in the branch.
        weighted_entropy += (branch_rows / total_rows) * entropy_of_feat_uniq_val
    return weighted_entropy

In [162]:
def root_node_attribute(new_df):
    """Pick the feature with the highest information gain and report it.

    Every column except the last is treated as a candidate feature; the
    last column is the target. The per-feature split entropies, the
    information gains and the winning feature are printed.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Table with one or more feature columns followed by the target column.

    Returns
    -------
    str
        Name of the feature whose split gives the largest information gain
        (the first such feature when several tie).

    Raises
    ------
    ValueError
        If ``new_df`` has no feature column in front of the target column.
    """
    feature_names = new_df.columns[:-1]
    if len(feature_names) == 0:
        raise ValueError("new_df must contain at least one feature column before the target column")

    # The entropy before the split is the same for every candidate feature,
    # so compute it once instead of once per feature.
    parent_entropy = entropy_before_split(new_df)
    # Each split entropy is computed once and reused for the gain.
    entropies_list = [entropy_after_split(new_df, feature) for feature in feature_names]
    info_gain_list = [parent_entropy - split_entropy for split_entropy in entropies_list]

    best_feature = feature_names[np.argmax(info_gain_list)]
    print(f"Entropy list: {entropies_list},\ninfo_gain_list: {info_gain_list}")
    print(f"The feature with highest information gain is: {best_feature}\n")
    return best_feature
  


In [163]:
def get_subtable(new_df, node, value):
    """Return the rows of ``new_df`` whose ``node`` column equals ``value``.

    The result is re-indexed from 0; the original index is discarded.
    """
    row_matches = new_df[node] == value
    matching_rows = new_df.loc[row_matches]
    return matching_rows.reset_index(drop=True)

In [164]:
def printing_node(new_df):
  """Print the best split of ``new_df`` and the sub-table for each of its values.

  The splitting feature is chosen with ``root_node_attribute``. For every
  value of that feature found in ``new_df``, the matching sub-table is
  printed together with the target labels it contains and their counts.
  The target is the last column of ``new_df``.

  Parameters
  ----------
  new_df : pandas.DataFrame
      Table with feature columns followed by the target column.
  """
  node = root_node_attribute(new_df)
  # Take the attribute values from the table being described. Reading them
  # from the global full table listed values that a sub-table does not
  # contain and printed empty sub-tables for them.
  att_value = np.unique(new_df[node])
  # Same target convention as the entropy helpers: the last column.
  target = new_df.columns[-1]

  print(f"Root node ==> {node}\n")
  print(f"Attribute unique values ==> {att_value}\n")

  for value in att_value:
      subtable = get_subtable(new_df, node, value)
      target_labels, counts = np.unique(subtable[target], return_counts = True)
      print(f"\nSubtable for Attribute Value {value}\n\n{subtable}") # Corresponding sub-table for each unique attribute value.
      print(f"\nTarget labels: {target_labels}") # Corresponding target labels for each unique attribute value.
      print(f"Corresponding counts: {counts}\n{'-' * 75}") # Corresponding counts for each label

In [167]:
def build_tree(new_df, tree = None):
    """Recursively build a decision tree as nested dictionaries.

    The table is split on the feature chosen by ``root_node_attribute``.
    Each value of that feature becomes a branch: a branch whose rows all
    share one target label becomes a leaf holding that label, any other
    branch is expanded by a recursive call on its sub-table. The target is
    the last column of ``new_df``.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Table with feature columns followed by the target column.
    tree : dict, optional
        Dictionary to add this node to. A new one is created when omitted.

    Returns
    -------
    dict
        ``{feature: {value: label_or_subtree, ...}}``.

    Notes
    -----
    A branch that still mixes labels but in which every feature column is
    constant cannot be split any further; it becomes a leaf holding the
    most frequent label (the first in sorted order on a tie). Without that
    rule the recursion would never terminate on such data.
    """
    # Same target convention as the entropy helpers: the last column.
    target = new_df.columns[-1]
    features = list(new_df.columns[:-1])

    node = root_node_attribute(new_df)
    if new_df[node].nunique() <= 1:
        # The winning feature is constant here (possible when every gain is
        # tied at zero), so splitting on it would hand the identical table
        # to the recursive call. Choose again among the columns that vary.
        splittable = [col for col in features if new_df[col].nunique() > 1]
        if splittable:
            node = root_node_attribute(new_df[splittable + [target]])
    att_value = np.unique(new_df[node])

    if tree is None:
        tree = {}
    # Also covers a caller-supplied dict that has no entry for this node yet.
    tree.setdefault(node, {})

    for value in att_value:
        subtable = get_subtable(new_df, node, value)
        target_labels, counts = np.unique(subtable[target], return_counts = True)
        if len(counts) == 1:
            # Pure branch: every row carries the same label.
            tree[node][value] = target_labels[0]
        elif not any(subtable[col].nunique() > 1 for col in features):
            # Mixed labels but nothing left to split on: majority vote.
            tree[node][value] = target_labels[np.argmax(counts)]
        else:
            # Trace output for the parent table and the branch being expanded.
            printing_node(new_df)
            printing_node(subtable)
            tree[node][value] = build_tree(subtable)

    return tree

In [168]:
# Build the decision tree for the working table and print the nested dict.
# NOTE(review): this rebinds the name `tree`, which hides the `sklearn.tree`
# module imported in the first cell (`from sklearn import tree`) — confirm
# that no later cell still needs the module under that name.
tree = build_tree(new_df)
print(tree)

Entropy list: [0.6935361388961918, 0.9110633930116763, 0.7884504573082896, 0.8921589282623617],
info_gain_list: [0.2467498197744391, 0.029222565658954647, 0.15183550136234136, 0.04812703040826927]
The feature with highest information gain is: Outlook

Entropy list: [0.6935361388961918, 0.9110633930116763, 0.7884504573082896, 0.8921589282623617],
info_gain_list: [0.2467498197744391, 0.029222565658954647, 0.15183550136234136, 0.04812703040826927]
The feature with highest information gain is: Outlook

Root node ==> Outlook

Attribute unique values ==> ['Overcast' 'Rain' 'Sunny']


Subtable for Attribute Value Overcast

    Outlook  Temp Humidity    Wind PlayTennis
0  Overcast   Hot     High    Weak        Yes
1  Overcast  Cool   Normal  Strong        Yes
2  Overcast  Mild     High  Strong        Yes
3  Overcast   Hot   Normal    Weak        Yes

Target labels: ['Yes']
Corresponding counts: [4]
---------------------------------------------------------------------------

Subtable for Attrib