### Decision Tree (DFS) Predict

In [1]:
import pandas as pd
import numpy as np

In [2]:
query_df = pd.read_csv('Query.csv')
query_df

Unnamed: 0,Age,Income,Student,Credit_Rating
0,youth,high,no,fair
1,youth,high,no,excellent
2,middle aged,high,no,fair
3,senior,medium,no,fair


In [3]:
DF = pd.read_csv('Naive_Labelled.csv')

In [4]:
dataframe = pd.read_csv('Naive_Labelled.csv')
dataframe

Unnamed: 0,Age,Income,Student,Credit_Rating,Actual
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle aged,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes


In [5]:
# Datwhaframe with age == youth and exlcuding the column age..
dataframe[dataframe['Age'] == 'youth'].drop(columns = 'Age')

Unnamed: 0,Income,Student,Credit_Rating,Actual
0,high,no,fair,no
1,high,no,excellent,no
7,medium,no,fair,no
8,low,yes,fair,yes
10,medium,yes,excellent,yes


In [6]:
# Data_points where Age is equal to youth and Target = yes..
dataframe[(dataframe['Age'] == 'youth') & (dataframe['Actual'] == 'yes')]

Unnamed: 0,Age,Income,Student,Credit_Rating,Actual
8,youth,low,yes,fair,yes
10,youth,medium,yes,excellent,yes


In [7]:
# Datapoints with target class yes....
dataframe[dataframe['Actual']=='yes']

Unnamed: 0,Age,Income,Student,Credit_Rating,Actual
2,middle aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
6,middle aged,low,yes,excellent,yes
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
11,middle aged,medium,no,excellent,yes
12,middle aged,high,yes,fair,yes


In [8]:
# Count of datapoints with Target class yes...
count_ = dataframe[dataframe['Actual']=='yes'][['Age']].count()
count_['Age']

9

In [9]:
dataframe.describe()

Unnamed: 0,Age,Income,Student,Credit_Rating,Actual
count,14,14,14,14,14
unique,3,3,2,2,2
top,youth,medium,no,fair,yes
freq,5,6,7,8,9


In [10]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            14 non-null     object
 1   Income         14 non-null     object
 2   Student        14 non-null     object
 3   Credit_Rating  14 non-null     object
 4   Actual         14 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes


In [11]:
# Node Structure 
class Node:
        def __init__(self,attribute):
            self.attribute = attribute
            self.node_dict = {}

In [12]:
class DecisionTree:
    def __init__ (self):
        self.root: Node = None
        self.info_D = 0
        
    def fit(self,df: pd.DataFrame):
        self.df = df
        # find the unique Target classes
        self.Target_classes = df['Actual'].unique()
        # Calculate Info_D
        for class_ in self.Target_classes:
            p_i = len(df[df['Actual']==class_])/len(df)
            self.info_D+= -(p_i * np.log2 (p_i))
        print('Info_D: ',self.info_D)

        # Since root is none initially....
        self.root = Node(self.getAttribute(df))
        self.constructTree(self.root,df)
        # Follow the steps until the tree completes....

    def constructTree(self,node: Node,df: pd.DataFrame):
        
        if node.attribute in self.Target_classes:
            return node

        print('Node: ',node.attribute)
        for category in df[node.attribute].unique():
            # Before passing the modified dataframe drop the column for which
            # the categories are iterated..
            # Returns the attribute for the category            
            attrib  = self.getAttribute(df[df[node.attribute]==category].drop(columns = node.attribute))
            # Add these attributes as leaf nodes in the tree..
            node.node_dict[category] = self.constructTree(Node(attrib),df[df[node.attribute]==category].drop(columns = node.attribute))
            print('category: ',category)
            print('attrib: ',node.node_dict[category].attribute)
            # node.edgeName.append(category)

        return node # Should return this node in order to connect the node with the parent...
            
    def getAttribute(self,modified_df: pd.DataFrame):

        # No need to find gain if the datapoints has target classes with only yes/ only no as target class...
        cat = modified_df['Actual'].unique()
        if len(cat)==1:
            return cat[0]
            
        info_dict = {}
        gain_dict = {}
        for feature in modified_df.columns:
            #X[feature].unique() Find the unique categories
            if feature!= 'Actual':
                info_gain = 0
                for category in modified_df[feature].unique():
                    # info_gain for each category of a feature
                    info_gain+=len(modified_df[modified_df[feature]==category])/len(modified_df) * self.info_dj(feature,category,modified_df)
    
                info_dict[feature] = info_gain
        
        
        for key,value in info_dict.items():
            gain_dict[key] = self.info_D - value

        print('Gain dict: ',gain_dict)

        return max(gain_dict, key = lambda k: gain_dict[k])
        
    def info_dj(self,feature: str,category: str,df: pd.DataFrame):
        sum_ = 0
        
        for class_ in self.Target_classes:
            p_i = len(df[(df[feature]==category) & (df['Actual']==class_)])/len(df[df[feature]==category])
            if p_i!=0:
                sum_+= p_i * np.log2(p_i)
            
        return -sum_
        
    def predict(self,query: pd.DataFrame):
        # Traverse with the tree......
        print('Prediction....')
        predicted_val = []
        
        # Should traverse until the attribute matches with the target classes...
        for i in range(len(query)):
            currNode: Node = self.root
            while(True):
                if (currNode.attribute in self.Target_classes):
                    # There'll be atleast 1 node....
                    predicted_val.append(currNode.attribute)
                    break
                print('attrib: ',currNode.attribute)
                currNode = currNode.node_dict[query.loc[i][currNode.attribute]]
                
        print(predicted_val)  
        query['prediction'] = predicted_val
        return query
        

In [13]:
model = DecisionTree()
model.fit(DF.copy())
print(model.predict(query_df.copy()))

Info_D:  0.9402859586706311
Gain dict:  {'Age': 0.24674981977443933, 'Income': 0.02922256565895487, 'Student ': 0.15183550136234159, 'Credit_Rating': 0.04812703040826949}
Node:  Age
Gain dict:  {'Income': 0.5402859586706311, 'Student ': 0.9402859586706311, 'Credit_Rating': -0.01069154176206255}
Node:  Student 
category:  no
attrib:  no
category:  yes
attrib:  yes
category:  youth
attrib:  Student 
category:  middle aged
attrib:  yes
Gain dict:  {'Income': -0.01069154176206255, 'Student ': -0.01069154176206255, 'Credit_Rating': 0.9402859586706311}
Node:  Credit_Rating
category:  fair
attrib:  yes
category:  excellent
attrib:  no
category:  senior
attrib:  Credit_Rating
Prediction....
attrib:  Age
attrib:  Student 
attrib:  Age
attrib:  Student 
attrib:  Age
attrib:  Age
attrib:  Credit_Rating
['no', 'no', 'yes', 'yes']
           Age  Income Student  Credit_Rating prediction
0        youth    high       no          fair         no
1        youth    high       no     excellent         no