In [6]:
import copy

import numpy as np
import pandas as pd
import sklearn as sk

In [48]:
# Read the training split and the test split from the project folder.
# Note that the test file name contains a space ("dataset _test.csv").
train_data_m, test_data_m = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ID3_From_Scratch-main/dataset.csv",
        "ID3_From_Scratch-main/dataset _test.csv",
    )
)

In [47]:
# Show the last 10 rows of the test split.
test_data_m.tail(10)

Unnamed: 0,Id,AI,Stat,OOP,SAD,Recomended
192,101233,4.0,3.5,3.5,4.0,DL
193,101234,3.5,3.75,3.25,3.25,DL
194,101235,3.25,3.75,3.5,3.0,Stat
195,101236,3.5,4.0,2.75,3.25,DL
196,101237,3.5,3.0,4.0,3.75,AI
197,101238,3.75,3.0,4.0,3.25,AI
198,101239,3.5,4.0,3.75,3.0,DL
199,101240,3.0,3.5,3.0,3.0,Stat
200,101241,4.0,3.25,3.5,4.0,AI
201,101242,3.25,4.0,2.75,4.0,Stat


In [49]:
# Remove the 'Id' column from both splits so it is not treated as a feature
# (find_most_informative_feature considers every column except the label).
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: running it a second time is a
# no-op rather than a KeyError.
train_data_m = train_data_m.drop(columns='Id', errors='ignore')
test_data_m = test_data_m.drop(columns='Id', errors='ignore')

In [13]:
# Debugging aid: uncomment to build the tree from only the first 10 training rows.
# train_data_m = train_data_m.head(10)

In [14]:
# Display the training frame (pandas renders a truncated preview for long frames).
train_data_m

Unnamed: 0,AI,Stat,OOP,SAD,Recomended
0,3.25,3.00,3.50,3.00,OOP
1,3.00,3.75,3.50,3.50,Stat
2,3.50,3.00,3.25,3.50,AI
3,2.75,3.00,3.75,3.25,OOP
4,2.75,2.75,3.75,3.25,OOP
...,...,...,...,...,...
995,3.75,3.00,4.00,3.25,AI
996,3.50,4.00,3.75,3.00,DL
997,3.00,3.50,3.00,3.00,Stat
998,4.00,3.25,3.50,4.00,AI


In [15]:
# List the columns of the training frame: the feature columns plus the
# 'Recomended' label column that is later passed to id3().
train_data_m.columns

Index(['AI', 'Stat', 'OOP', 'SAD', 'Recomended'], dtype='object')

In [16]:
def calc_total_entropy(train_data, label, class_list):
    """Return the base-2 Shannon entropy of the label column over all rows.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows to measure; must contain the column named by ``label``.
    label : hashable
        Name of the class (target) column.
    class_list : iterable
        Class values to account for. A class with no rows contributes nothing.

    Returns
    -------
    float or int
        Sum over the classes of ``-p * log2(p)``, where ``p`` is the fraction
        of rows carrying that class. ``0`` when none of the classes occurs.
    """
    n_rows = train_data.shape[0]
    labels = train_data[label]
    entropy = 0

    for cls in class_list:
        n_cls = int((labels == cls).sum())
        if n_cls <= 0:
            # An absent class has p == 0; skip it so log2(0) is never evaluated.
            continue
        fraction = n_cls / n_rows
        entropy += -fraction * np.log2(fraction)

    return entropy

In [17]:
def calc_entropy(feature_value_data, label, class_list):
    """Return the base-2 Shannon entropy of the label column within a subset.

    Parameters
    ----------
    feature_value_data : pandas.DataFrame
        The rows sharing one value of some feature; must contain ``label``.
    label : hashable
        Name of the class (target) column.
    class_list : iterable
        Class values to account for. A class with no rows contributes 0.

    Returns
    -------
    float or int
        Sum over the classes of ``-p * log2(p)``; ``0`` when no class occurs.
    """
    subset_size = feature_value_data.shape[0]
    label_column = feature_value_data[label]

    def class_term(cls):
        # Contribution of a single class to the entropy of the subset.
        hits = int((label_column == cls).sum())
        if hits == 0:
            return 0
        probability = hits / subset_size
        return -probability * np.log2(probability)

    return sum(class_term(cls) for cls in class_list)

In [18]:
def calc_info_gain(feature_name, train_data, label, class_list):
    """Return the information gain obtained by splitting on ``feature_name``.

    The gain is the entropy of the label over all of ``train_data`` minus the
    weighted average of the label entropy inside each group of rows that share
    one value of the feature (weights are the group's share of the rows).

    Parameters
    ----------
    feature_name : hashable
        Column to evaluate as a split.
    train_data : pandas.DataFrame
        Rows under consideration; must contain ``feature_name`` and ``label``.
    label : hashable
        Name of the class (target) column.
    class_list : iterable
        Class values passed through to the entropy helpers.

    Returns
    -------
    float
        Entropy before the split minus the weighted entropy after it.
    """
    n_rows = train_data.shape[0]
    column = train_data[feature_name]
    weighted_entropy = 0.0

    for value in column.unique():
        partition = train_data[column == value]
        weight = partition.shape[0] / n_rows
        weighted_entropy += weight * calc_entropy(partition, label, class_list)

    baseline = calc_total_entropy(train_data, label, class_list)
    return baseline - weighted_entropy

In [19]:
def find_most_informative_feature(train_data, label, class_list):
    """Return the name of the feature column with the highest information gain.

    Every column of ``train_data`` except ``label`` is a candidate. Candidates
    are examined in column order, so when several features tie for the best
    gain the left-most one wins and the result is the same in every
    interpreter session. (Iterating a ``set`` of column names does not give
    that guarantee, because string hashing is randomized per process.)

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows under consideration.
    label : hashable
        Name of the class (target) column, excluded from the candidates.
    class_list : iterable
        Class values passed through to ``calc_info_gain``.

    Returns
    -------
    The name of the best feature column, or ``None`` when ``train_data`` has
    no feature columns.
    """
    best_feature = None
    # Any computed gain compares greater than this sentinel, so the first
    # candidate is always accepted.
    best_gain = -1

    for feature in train_data.columns.drop(label):
        gain = calc_info_gain(feature, train_data, label, class_list)
        # Strict comparison keeps the earliest feature on ties.
        if gain > best_gain:
            best_gain = gain
            best_feature = feature

    return best_feature

In [20]:
def generate_sub_tree(feature_name, train_data, label, class_list):
    """Build one level of the tree for a split on ``feature_name``.

    For every distinct value of the feature, the rows carrying that value are
    inspected. If they all belong to a single class, the value becomes a leaf
    mapped to that class and those rows are removed from the returned frame.
    Otherwise the value is mapped to the placeholder ``"?"`` so the caller can
    expand it later.

    Parameters
    ----------
    feature_name : hashable
        Column to split on.
    train_data : pandas.DataFrame
        Rows to split. The caller's frame is not modified; a filtered frame is
        returned instead.
    label : hashable
        Name of the class (target) column.
    class_list : iterable
        Class values to test each group against.

    Returns
    -------
    (dict, pandas.DataFrame)
        The ``{feature_value: class_or_"?"}`` mapping and the rows that still
        need to be split (those whose feature value was mapped to ``"?"``).
    """
    feature_value_count_dict = train_data[feature_name].value_counts(sort=False)
    tree = {}

    # Series.items() is used instead of Series.iteritems(), which was removed
    # in pandas 2.0 and raises AttributeError there.
    for feature_value, count in feature_value_count_dict.items():
        feature_value_data = train_data[train_data[feature_name] == feature_value]

        assigned_to_node = False
        for c in class_list:
            class_count = feature_value_data[feature_value_data[label] == c].shape[0]

            if class_count == count:
                # Every row with this feature value has class c: pure leaf.
                tree[feature_value] = c
                train_data = train_data[train_data[feature_name] != feature_value]
                assigned_to_node = True
        if not assigned_to_node:
            # Mixed classes: mark the branch for further expansion.
            tree[feature_value] = "?"

    return tree, train_data

In [21]:
def make_tree(root, prev_feature_value, train_data, label, class_list):
    """Recursively grow the decision tree in place inside ``root``.

    The most informative feature of ``train_data`` is chosen and one tree level
    is generated for it. That level is attached to ``root`` (directly under the
    feature name for the top-level call, or under ``prev_feature_value`` for a
    recursive call). Every branch still marked ``"?"`` is then expanded by a
    recursive call on the rows carrying that branch's feature value.

    The column that was just split on is removed from the rows handed to the
    recursive call. Inside such a subset that column is constant and cannot
    separate anything; keeping it made the function recurse forever on
    contradictory data (rows with identical feature values but different
    labels). When no feature columns are left, the branch becomes a leaf
    holding the most frequent class among the remaining rows.

    Parameters
    ----------
    root : dict
        Dictionary to attach the new subtree to; modified in place.
    prev_feature_value
        Branch key in ``root`` being expanded, or ``None`` for the top-level
        call.
    train_data : pandas.DataFrame
        Rows that reach this node. Nothing is done when it is empty.
    label : hashable
        Name of the class (target) column.
    class_list : iterable
        All class values.

    Returns
    -------
    None
    """
    if train_data.shape[0] == 0:
        return

    max_info_feature = find_most_informative_feature(train_data, label, class_list)
    if max_info_feature is None:
        # No feature is left to split on but the rows still disagree on the
        # label: settle for the most frequent class (first one in class_list
        # on ties) instead of recursing without end.
        if prev_feature_value is not None:
            root[prev_feature_value] = max(
                class_list,
                key=lambda c: (train_data[label] == c).sum(),
                default=None,
            )
        return

    tree, train_data = generate_sub_tree(max_info_feature, train_data, label, class_list)

    if prev_feature_value is not None:
        # Replace the "?" placeholder of the parent branch with the new subtree.
        root[prev_feature_value] = {max_info_feature: tree}
    else:
        root[max_info_feature] = tree
    next_root = tree

    # Iterate over a snapshot because the recursive calls rewrite the entries.
    for node, branch in list(next_root.items()):
        if branch == "?":
            feature_value_data = train_data[train_data[max_info_feature] == node]
            make_tree(
                next_root,
                node,
                feature_value_data.drop(columns=max_info_feature),
                label,
                class_list,
            )

In [22]:
def id3(train_data_m, label):
    """Build an ID3 decision tree from ``train_data_m`` and return it.

    Parameters
    ----------
    train_data_m : pandas.DataFrame
        Training rows; every column except ``label`` is used as a feature.
    label : hashable
        Name of the class (target) column.

    Returns
    -------
    dict
        Nested dictionary of the form
        ``{feature: {feature_value: class_or_subtree, ...}}``; empty when
        ``train_data_m`` has no rows.
    """
    # Work on a copy so tree construction can never touch the caller's frame.
    # (The copy used to be created and then ignored in favour of the original.)
    train_data = train_data_m.copy()
    tree = {}
    class_list = train_data[label].unique()
    make_tree(tree, None, train_data, label, class_list)

    return tree

In [29]:
def predict(tree, instance):
    """Walk the decision tree for one instance and return the predicted class.

    Parameters
    ----------
    tree : dict or leaf value
        A subtree of the form ``{feature: {feature_value: subtree_or_class}}``,
        or a leaf (any non-dict value), which is returned as the prediction.
    instance : mapping-like
        Object indexable by feature name (e.g. a dict or a DataFrame row).

    Returns
    -------
    The class stored at the leaf that is reached, or ``None`` when the tree is
    empty or has no branch for the instance's value of the tested feature.
    """
    if not isinstance(tree, dict):
        return tree
    if not tree:
        # An empty tree (e.g. built from no rows) cannot predict anything;
        # report that the same way as a missing branch rather than letting
        # next(iter(tree)) raise StopIteration.
        return None

    # Each dict level holds a single key: the feature tested at this node.
    root_node = next(iter(tree))
    branches = tree[root_node]
    feature_value = instance[root_node]
    if feature_value in branches:
        return predict(branches[feature_value], instance)
    return None

In [30]:
def evaluate(tree, test_data_m, label):
    """Return the fraction of rows in ``test_data_m`` that ``tree`` classifies correctly.

    Parameters
    ----------
    tree : dict
        Decision tree in the format accepted by ``predict``.
    test_data_m : pandas.DataFrame
        Rows to classify; must contain the feature columns and ``label``.
    label : hashable
        Name of the column holding the true class.

    Returns
    -------
    float
        Correct predictions divided by the number of rows. A prediction of
        ``None`` counts as wrong. ``0.0`` is returned for an empty frame
        instead of dividing by zero.
    """
    total = test_data_m.shape[0]
    if total == 0:
        return 0.0

    correct = 0
    # Use the row yielded by iterrows() directly. Its index value is a label,
    # not a position, so feeding it to .iloc picks the wrong row (or raises)
    # whenever the frame does not have a default 0..n-1 index.
    for _, row in test_data_m.iterrows():
        if predict(tree, row) == row[label]:
            correct += 1

    return correct / total

In [25]:
# Re-check the column names of the training frame before building the tree.
train_data_m.columns

Index(['AI', 'Stat', 'OOP', 'SAD', 'Recomended'], dtype='object')

In [53]:
# Build the ID3 tree with 'Recomended' as the label column, then display the
# resulting nested dictionary.
tree = id3(train_data_m, 'Recomended')
tree

{'AI': {3.25: {'Stat': {3.0: {'OOP': {3.5: 'OOP',
      4.0: {'SAD': {3.0: 'OOP', 3.25: 'OOP', 3.75: 'SE'}},
      3.0: {'SAD': {3.5: 'SAD', 3.25: 'BCS', 3.0: 'BCS'}},
      3.25: {'SAD': {2.75: 'BCS', 3.0: 'BCS', 3.75: 'SAD', 3.25: 'BCS'}},
      3.75: {'SAD': {3.25: 'OOP', 3.75: 'SE'}},
      2.75: {'SAD': {2.75: 'BCS', 3.0: 'BCS', 4.0: 'SAD', 3.25: 'BCS'}}}},
    3.25: {'SAD': {2.75: {'OOP': {2.75: 'BCS',
        3.75: 'OOP',
        3.5: 'OOP',
        4.0: 'OOP',
        3.25: 'BCS'}},
      3.25: {'OOP': {3.25: 'BCS', 2.75: 'BCS', 3.0: 'BCS', 4.0: 'OOP'}},
      4.0: {'OOP': {3.0: 'SAD', 4.0: 'SE'}},
      3.75: 'SAD',
      3.5: 'SE',
      3.0: {'OOP': {3.25: 'BCS', 3.0: 'BCS', 4.0: 'OOP'}}}},
    2.75: {'OOP': {3.0: 'BCS',
      3.25: {'SAD': {3.5: 'SAD', 3.75: 'SAD', 3.0: 'BCS'}},
      3.75: {'SAD': {3.5: 'SE', 3.0: 'OOP', 2.75: 'OOP', 3.25: 'OOP'}},
      2.75: 'SAD',
      3.5: 'OOP',
      4.0: {'SAD': {2.75: 'OOP', 3.5: 'SE', 4.0: 'SE'}}}},
    4.0: 'Stat',
    3.75: 'St

In [54]:
# Deep copy: dict.copy() is shallow, so tree_copy['AI'] would be the very same
# inner dict as tree['AI'] and editing the copy would also edit the tree that
# is used for evaluation.
tree_copy = copy.deepcopy(tree)

In [55]:
# Branch values directly under the root 'AI' split, in the order the tree
# stores them.
keys = list(tree_copy['AI'])
print(keys)

[3.25, 3.0, 3.5, 2.75, 4.0, 3.75]


In [56]:
def _merge_adjacent_equal_branches(branches, ordered_keys):
    """Merge runs of consecutive keys whose subtrees compare equal, in place.

    Each run of two or more keys is replaced by one entry whose key is the
    comma-joined string of the original keys (e.g. ``"3.25,3.0"``); the entry
    is appended at the end of ``branches``. Keys that are no longer present in
    ``branches`` are skipped, so calling this again with a stale key list does
    not raise.

    The runs are worked out before anything is deleted. Deleting while
    stepping through the key list made the following comparison look up a key
    that had just been removed (KeyError) and prevented runs of three or more
    from being merged.
    """
    runs = []
    for key in ordered_keys:
        if key not in branches:
            continue
        if runs and branches[runs[-1][-1]] == branches[key]:
            runs[-1].append(key)
        else:
            runs.append([key])

    for run in runs:
        if len(run) < 2:
            continue
        subtree = branches[run[0]]
        for key in run:
            del branches[key]
        branches[','.join(str(key) for key in run)] = subtree


# NOTE: predict() looks branches up by the raw feature value, so it will not
# match the comma-joined string keys created here.
_merge_adjacent_equal_branches(tree_copy['AI'], keys)
tree_copy

{'AI': {3.25: {'Stat': {3.0: {'OOP': {3.5: 'OOP',
      4.0: {'SAD': {3.0: 'OOP', 3.25: 'OOP', 3.75: 'SE'}},
      3.0: {'SAD': {3.5: 'SAD', 3.25: 'BCS', 3.0: 'BCS'}},
      3.25: {'SAD': {2.75: 'BCS', 3.0: 'BCS', 3.75: 'SAD', 3.25: 'BCS'}},
      3.75: {'SAD': {3.25: 'OOP', 3.75: 'SE'}},
      2.75: {'SAD': {2.75: 'BCS', 3.0: 'BCS', 4.0: 'SAD', 3.25: 'BCS'}}}},
    3.25: {'SAD': {2.75: {'OOP': {2.75: 'BCS',
        3.75: 'OOP',
        3.5: 'OOP',
        4.0: 'OOP',
        3.25: 'BCS'}},
      3.25: {'OOP': {3.25: 'BCS', 2.75: 'BCS', 3.0: 'BCS', 4.0: 'OOP'}},
      4.0: {'OOP': {3.0: 'SAD', 4.0: 'SE'}},
      3.75: 'SAD',
      3.5: 'SE',
      3.0: {'OOP': {3.25: 'BCS', 3.0: 'BCS', 4.0: 'OOP'}}}},
    2.75: {'OOP': {3.0: 'BCS',
      3.25: {'SAD': {3.5: 'SAD', 3.75: 'SAD', 3.0: 'BCS'}},
      3.75: {'SAD': {3.5: 'SE', 3.0: 'OOP', 2.75: 'OOP', 3.25: 'OOP'}},
      2.75: 'SAD',
      3.5: 'OOP',
      4.0: {'SAD': {2.75: 'OOP', 3.5: 'SE', 4.0: 'SE'}}}},
    4.0: 'Stat',
    3.75: 'St

In [51]:
# Fraction of test rows whose predicted class equals the 'Recomended' column.
# NOTE(review): this cell's execution count (51) is lower than that of the
# tree_copy cells above (54-56), i.e. it was last run before them; in file
# order it runs after them.
accuracy = evaluate(tree, test_data_m, 'Recomended')
print("accuracy:", accuracy)

accuracy: 0.8960396039603961
