# HW1-2
## ID3 - Part 2
### Fatemeh Rafiee, Reyhaneh Ahani

##### Comments are fully available in ID3 - Part 1 (Since they're almost similar)

In [1]:
import math
import numpy as np
import pandas as pd
from collections import deque
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
class Node:
    def __init__(self):
        self.value = None
        self.next = None
        self.childs = None

class DecisionTree:
    def __init__(self, X, labels, features, max_depth=8, measure='entropy'):
        self.X = X
        self.features = features
        self.labels = labels
        self.node = None
        self.measure = measure
        self.max_depth = max_depth
        self.depth = 0
        
    def inf_gain_cal(self, x, feature_idx):
        if self.measure == 'entropy':
            total_entropy = self.entropy_cal(x)
        elif self.measure == 'gini':
            total_entropy = self.gini_cal(x)
            
        x_features, feature_cnt = [], []
        
        for i in x:
            x_features.append(self.X[i][feature_idx])
            
        for j in list(set(x_features)):
            feature_cnt.append(x_features.count(j))
         
        feature_v_id = []
        for feature_item in list(set(x_features)):
            tmp = []
            for i, x_i in enumerate(x_features):
                if x_i == feature_item:
                    tmp.append(x[i])
            feature_v_id.append(tmp) 
        entropy = []
        for n_i, Class in zip(feature_cnt, feature_v_id):
            entropy.append(n_i/len(x)*self.entropy_cal(Class))
            
        return total_entropy - sum(entropy)
        
    def entropy_cal(self, x):
        labels, label_cnt, sigma_entropy = [], [], 0
        
        for i in x:
            labels.append(self.labels[i])
            
        for j in list(set(self.labels)):
            label_cnt.append(labels.count(j))

        for cnt in label_cnt:
            if cnt != 0:
                sigma_entropy += -cnt/len(x)*math.log(cnt/len(x), 2)
        return sigma_entropy
    
    def gini_cal(self, x): 
        labels, label_cnt, sigma_gini = [], [], 0
        
        for i in x:
            labels.append(self.labels[i])
            
        for j in list(set(self.labels)):
            label_cnt.append(labels.count(j))

        for cnt in label_cnt:
            if cnt != 0:
                sigma_gini += (cnt/len(x)) ** 2
        return 1 - sigma_gini
    
    def opt_feature(self, x, features):    
        features_entropy = [self.inf_gain_cal(x, feature_id) for feature_id in features]
        max_id = features[features_entropy.index(max(features_entropy))]
        return self.features[max_id], max_id

    def rec(self, x_idx, feature_idx, node):     
        self.depth += 1
        
        if not node:
            node = Node() 
        labels_tmp = []
        for i in x_idx:
            labels_tmp.append(self.labels[i])
        
        if len(set(labels_tmp)) == 1:
            node.value = self.labels[x_idx[0]]
            return node

        if len(feature_idx) == 0 or self.max_depth < self.depth:
            node.value = max(labels_tmp, key=labels_tmp.count) # voting
            return node
        
        best_feature_name, best_feature_idx = self.opt_feature(x_idx, feature_idx)
        node.value = best_feature_name
        node.childs = []
        unique_values = []
        for x in x_idx:
            unique_values.append(self.X[x][best_feature_idx])
        
        unique_values = list(set(unique_values))
        
        for value in unique_values:
            child = Node()
            child.value = value 
            node.childs.append(child) 
           
            child_idx = []
            for i in x_idx:
                if self.X[i][best_feature_idx] == value:
                    child_idx.append(i)
            
            if not child_idx:
                child.next = max(unique_values, key=unique_values.count)
            else:
                if feature_idx and best_feature_idx in feature_idx: 
                    feature_idx.pop(feature_idx.index(best_feature_idx))
                child.next = self.rec(child_idx, feature_idx, child.next)   
        return node    

    def fit(self):
        x_idx = list(range(len(self.X)))
        features_idx = list(range(len(self.features)))
        self.node = self.rec(x_idx, features_idx, self.node)
            
    def predict(self, X_test):
        result = [None for _ in range(len(X_test))]
        for i, x_i in enumerate(X_test):
            nodes = deque()
            nodes.append(self.node)
            current_feature = None
            last_child = None
            while len(nodes) > 0:
                node = nodes.popleft()
                for feature in self.features:
                    if node.value == feature:
                        current_feature = feature
                if node.childs:
                    for child in node.childs:
                        if x_i[self.features.index(current_feature)] == child.value:
                            last_child = child.next.value
                            nodes.append(child.next)
                else:
                    result[i] = last_child
                    break
        return result
                    
            
    def accuracy(self, X_test, y_test):
        misclassified = 0
        y_pred = self.predict(X_test)
        for h in range(len(y_pred)):
            if y_pred[h] != y_test[h]:
                misclassified += 1
                
        return (len(y_pred) - misclassified) / len(y_pred)

In [3]:
df = pd.DataFrame(pd.read_csv("nursery.csv"))
# df

In [4]:
# df.describe()

In [5]:
df.value_counts('final evaluation')

final evaluation
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
dtype: int64

In [6]:
df = df.rename(columns={'final evaluation': 'final_evaluation'})

In [7]:
df[df['final_evaluation']=='recommend']

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,final_evaluation
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend


In [8]:
df.drop(df[(df.final_evaluation == 'recommend')].index)

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,final_evaluation
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
5,usual,proper,complete,1,convenient,convenient,slightly_prob,not_recom,not_recom
6,usual,proper,complete,1,convenient,convenient,problematic,recommended,priority
...,...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior


In [9]:
df = shuffle(df)

X = np.array(df.drop('final_evaluation', axis=1).copy())
y = np.array(df['final_evaluation'].copy())

features = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health']

In [10]:
cases = [
    {'measure': 'entropy', 'max_depth': 6, 'test': 0.5},
    {'measure': 'entropy', 'max_depth': 8, 'test': 0.5},
    {'measure': 'entropy', 'max_depth': 6, 'test': 0.25},
    {'measure': 'entropy', 'max_depth': 8, 'test': 0.25},
    {'measure': 'gini', 'max_depth': 6, 'test': 0.5},
    {'measure': 'gini', 'max_depth': 8, 'test': 0.5},
    {'measure': 'gini', 'max_depth': 6, 'test': 0.25},
    {'measure': 'gini', 'max_depth': 8, 'test': 0.25},
]

In [11]:
for case in cases:
    print(f"measure: {case['measure']}, test: {case['test']}, max_depth: {case['max_depth']}")
    # using the train test split function
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=18, test_size=case['test'])

    dt = DecisionTree(X_train, y_train, features, max_depth=case['max_depth'], measure=case['measure'])
    dt.fit()
    print(dt.accuracy(X_test, y_test))

measure: entropy, test: 0.5, max_depth: 6
0.7921296296296296
measure: entropy, test: 0.5, max_depth: 8
0.7933641975308642
measure: entropy, test: 0.25, max_depth: 6
0.7972222222222223
measure: entropy, test: 0.25, max_depth: 8
0.7975308641975308
measure: gini, test: 0.5, max_depth: 6
0.7921296296296296
measure: gini, test: 0.5, max_depth: 8
0.7933641975308642
measure: gini, test: 0.25, max_depth: 6
0.7972222222222223
measure: gini, test: 0.25, max_depth: 8
0.7975308641975308
