## ID3

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 

# Load the dataset and discard its first column (presumably a row-id column -- confirm against data.csv).
df = pd.read_csv("data.csv").iloc[:, 1:]
def convertTemp(num):
    """Bucket a temperature reading into "Hot", "Mild" or "Cool".

    Args:
        num: Anything ``int()`` accepts (the notebook passes digit strings).

    Returns:
        "Hot" for readings >= 80, "Cool" for readings < 70, "Mild" otherwise.
    """
    degrees = int(num)
    if degrees < 70:
        return "Cool"
    if degrees < 80:
        return "Mild"
    return "Hot"

def convertHumidity(num):
    """Bucket a humidity reading: "High" when strictly above 80, otherwise "Normal".

    Args:
        num: Anything ``int()`` accepts (the notebook passes digit strings).
    """
    return "High" if int(num) > 80 else "Normal"

# Swap the raw readings for categorical buckets. Values are stringified first,
# then handed to the converters, which parse them back with int().
df['Temp'] = df['Temp'].astype(str).apply(convertTemp)
df['Humidity'] = df['Humidity'].astype(str).apply(convertHumidity)

print(df)

     Outlook  Temp Humidity    Wind Decision
0      Sunny   Hot     High    Weak       No
1      Sunny   Hot     High  Strong       No
2   Overcast   Hot   Normal    Weak      Yes
3       Rain  Mild     High    Weak      Yes
4       Rain  Cool   Normal    Weak      Yes
5       Rain  Cool   Normal  Strong       No
6   Overcast  Cool   Normal  Strong      Yes
7      Sunny  Mild     High    Weak       No
8      Sunny  Cool   Normal    Weak      Yes
9       Rain  Mild   Normal    Weak      Yes
10     Sunny  Mild   Normal  Strong      Yes
11  Overcast  Mild     High  Strong      Yes
12  Overcast   Hot   Normal    Weak      Yes
13      Rain  Mild   Normal  Strong       No


In [5]:
def entropy(data):
    """Shannon entropy (base 2) of the "Decision" column of ``data``.

    The previous version read the two class counts positionally from a
    label-indexed Series (deprecated in pandas) and ignored any class beyond
    the first two. This version sums ``-p * log2(p)`` over every observed class.

    Args:
        data: DataFrame containing a "Decision" column.

    Returns:
        The entropy in bits; 0 for a pure subset and 0.0 for an empty one.
    """
    class_counts = data["Decision"].value_counts()
    total = class_counts.sum()
    if total == 0:
        # No rows: nothing to be uncertain about (and avoids dividing by zero).
        return 0.0
    result = 0
    # Iterate the underlying NumPy values so the result keeps the NumPy float
    # type the original arithmetic produced.
    for count in class_counts.to_numpy():
        if count == 0:
            # Unobserved categories (categorical dtype) contribute nothing.
            continue
        ratio = count / total
        result += -ratio * math.log2(ratio)
    return result

def info_gain(data,feature):
    """Information gain obtained by splitting ``data`` on ``feature``.

    Computed as ``entropy(data)`` minus the size-weighted average entropy of
    the subsets that share a single value of ``feature``.

    Args:
        data: DataFrame with the ``feature`` column and the label column
            that ``entropy`` reads.
        feature: Name of the column to split on.

    Returns:
        The information gain as a float.
    """
    levels = data[feature].unique()
    subsets = [data[data[feature] == level] for level in levels]
    sizes = [len(subset) for subset in subsets]
    impurities = [entropy(subset) for subset in subsets]
    total = sum(sizes)
    remainder = 0
    for size, impurity in zip(sizes, impurities):
        remainder += size / total * impurity
    return entropy(data) - remainder



# Information gain of every candidate attribute on the full dataset.
# (A bare `entropy(df)` used to precede these lines; its value was discarded.)
print(info_gain(df,"Outlook"))
print(info_gain(df,"Temp"))
print(info_gain(df,"Humidity"))
print(info_gain(df,"Wind"))

0.24674981977443933
0.02922256565895487
0.10224356360985076
0.04812703040826949


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

class Node:
    """Internal node of a decision tree.

    Attributes (all set in ``__init__``):
        data: Whatever rows the builder passed for this node.
        parent_feature: Feature the parent split on, or None.
        parent_feature_value: Value of ``parent_feature`` on the branch leading here, or None.
        children: Dict, initially empty, filled by the builder with value -> subtree entries.
    """

    def __init__(self, data, parent_feature=None, parent_feature_value=None):
        self.data, self.parent_feature, self.parent_feature_value = (
            data,
            parent_feature,
            parent_feature_value,
        )
        # A fresh dict per instance so nodes never share children.
        self.children = dict()

def id3_decision_tree(data, features, target, parent_feature=None, parent_feature_value=None):
    """Recursively grow an ID3 tree, splitting on the highest information gain.

    Args:
        data: DataFrame with the rows that reached this point of the tree.
        features: Sequence of column labels still available for splitting
            (a pandas Index or a plain list).
        target: Name of the class-label column. It is now honoured for the
            leaf checks; previously it was ignored in favour of a hard-coded
            'Decision'. Split scoring is still delegated to ``info_gain``.
        parent_feature: Feature the parent split on (None at the root).
        parent_feature_value: Value of ``parent_feature`` leading to this subtree.

    Returns:
        A class label when the rows are pure, or the first mode of the labels
        when no features remain; otherwise a ``Node`` whose ``children`` maps
        each observed value of the chosen feature to a subtree.
    """
    labels = data[target]
    if len(labels.unique()) == 1:
        return labels.values[0]
    if len(features) == 0:
        return labels.mode().values[0]

    gains = {feature: info_gain(data, feature) for feature in features}
    best_feature = max(gains, key=gains.get)
    # Built without Index.drop so that plain lists are accepted as well.
    remaining = [name for name in features if name != best_feature]

    # Only internal nodes need a Node object; leaves were returned above.
    node = Node(data, parent_feature, parent_feature_value)
    for value in data[best_feature].unique():
        child_data = data[data[best_feature] == value].drop([best_feature], axis=1)
        node.children[value] = id3_decision_tree(child_data, remaining, target, best_feature, value)
    return node

# The last column holds the class label; every column before it is a candidate split attribute.
target = "Decision"
features = df.columns[:-1]

decision_tree = id3_decision_tree(df, features, target)

def print_decision_tree(node, indent=""):
    """Print a tree produced by the builder as indented text.

    Args:
        node: A ``Node`` or a bare class label (leaf).
        indent: Prefix for every line printed at this depth; each level of
            recursion appends two spaces.
    """
    # Leaves are plain labels rather than Node objects.
    if not isinstance(node, Node):
        print(indent + "Decision: " + str(node))
        return

    if node.parent_feature is not None:
        print(indent + "Feature: " + node.parent_feature)
        print(indent + "Parent Feature Value: " + str(node.parent_feature_value))

    if not node.children:
        print(indent + "Decision: " + str(node.data['Decision'].values[0]))
        return

    deeper = indent + "  "
    for value, child_node in node.children.items():
        print(indent + "Value: " + str(value))
        print_decision_tree(child_node, deeper)
        
# Render the tree built above as indented text.
print_decision_tree(decision_tree)

Value: Sunny
  Feature: Outlook
  Parent Feature Value: Sunny
  Value: High
    Decision: No
  Value: Normal
    Decision: Yes
Value: Overcast
  Decision: Yes
Value: Rain
  Feature: Outlook
  Parent Feature Value: Rain
  Value: Weak
    Decision: Yes
  Value: Strong
    Decision: No


## CART

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 

# Reload the dataset and discard its first column (presumably a row-id column -- confirm against data.csv).
df = pd.read_csv("data.csv").iloc[:, 1:]
def convertTemp(num):
    """Bucket a temperature reading into "Hot", "Mild" or "Cool".

    Args:
        num: Anything ``int()`` accepts (the notebook passes digit strings).

    Returns:
        "Hot" for readings >= 80, "Cool" for readings < 70, "Mild" otherwise.
    """
    degrees = int(num)
    if degrees < 70:
        return "Cool"
    if degrees < 80:
        return "Mild"
    return "Hot"

def convertHumidity(num):
    """Bucket a humidity reading: "High" when strictly above 80, otherwise "Normal".

    Args:
        num: Anything ``int()`` accepts (the notebook passes digit strings).
    """
    return "High" if int(num) > 80 else "Normal"

# Swap the raw readings for categorical buckets. Values are stringified first,
# then handed to the converters, which parse them back with int().
df['Temp'] = df['Temp'].astype(str).apply(convertTemp)
df['Humidity'] = df['Humidity'].astype(str).apply(convertHumidity)

print(df)

def gini(data):
    """Gini impurity of the "Decision" column of ``data``.

    The previous version read the two class counts positionally from a
    label-indexed Series (deprecated in pandas) and ignored any class beyond
    the first two. This version computes ``1 - sum(p_i ** 2)`` over every
    observed class.

    Args:
        data: DataFrame containing a "Decision" column.

    Returns:
        The impurity as a float; 0 for a pure subset and 0.0 for an empty one.
    """
    class_counts = data["Decision"].value_counts()
    total = class_counts.sum()
    if total == 0:
        # No rows: treat as pure instead of dividing by zero.
        return 0.0
    # Iterate the underlying NumPy values so the result keeps the NumPy float
    # type the original arithmetic produced.
    return 1 - sum((count / total) ** 2 for count in class_counts.to_numpy())

def gini_feature(data,feature):
    """Size-weighted average Gini impurity after splitting ``data`` on ``feature``.

    Args:
        data: DataFrame with the ``feature`` column and the label column
            that ``gini`` reads.
        feature: Name of the column to split on.

    Returns:
        The weighted impurity as a float.
    """
    levels = data[feature].unique()
    subsets = [data[data[feature] == level] for level in levels]
    sizes = [len(subset) for subset in subsets]
    impurities = [gini(subset) for subset in subsets]
    total = sum(sizes)
    weighted = 0
    for size, impurity in zip(sizes, impurities):
        weighted += size / total * impurity
    return weighted

# Weighted Gini index of every candidate attribute on the full dataset.
# (A stray `entropy(df)` used to precede these lines: its value was discarded
# and it made this cell depend on a function defined in another section.)
print(gini_feature(df,"Outlook"))
print(gini_feature(df,"Temp"))
print(gini_feature(df,"Humidity"))
print(gini_feature(df,"Wind"))

     Outlook  Temp Humidity    Wind Decision
0      Sunny   Hot     High    Weak       No
1      Sunny   Hot     High  Strong       No
2   Overcast   Hot   Normal    Weak      Yes
3       Rain  Mild     High    Weak      Yes
4       Rain  Cool   Normal    Weak      Yes
5       Rain  Cool   Normal  Strong       No
6   Overcast  Cool   Normal  Strong      Yes
7      Sunny  Mild     High    Weak       No
8      Sunny  Cool   Normal    Weak      Yes
9       Rain  Mild   Normal    Weak      Yes
10     Sunny  Mild   Normal  Strong      Yes
11  Overcast  Mild     High  Strong      Yes
12  Overcast   Hot   Normal    Weak      Yes
13      Rain  Mild   Normal  Strong       No
0.34285714285714286
0.44047619047619047
0.3936507936507937
0.42857142857142855


In [9]:
class Node:
    """Internal node of a decision tree.

    Attributes (all set in ``__init__``):
        data: Whatever rows the builder passed for this node.
        parent_feature: Feature the parent split on, or None.
        parent_feature_value: Value of ``parent_feature`` on the branch leading here, or None.
        children: Dict, initially empty, filled by the builder with value -> subtree entries.
    """

    def __init__(self, data, parent_feature=None, parent_feature_value=None):
        self.data, self.parent_feature, self.parent_feature_value = (
            data,
            parent_feature,
            parent_feature_value,
        )
        # A fresh dict per instance so nodes never share children.
        self.children = dict()

def build_cart_decision_tree(data, features, target, parent_feature=None, parent_feature_value=None):
    """Recursively grow a tree, splitting on the lowest weighted Gini index.

    Args:
        data: DataFrame with the rows that reached this point of the tree.
        features: Sequence of column labels still available for splitting
            (a pandas Index or a plain list).
        target: Name of the class-label column. It is now honoured for the
            leaf checks; previously it was ignored in favour of a hard-coded
            'Decision'. Split scoring is still delegated to ``gini_feature``.
        parent_feature: Feature the parent split on (None at the root).
        parent_feature_value: Value of ``parent_feature`` leading to this subtree.

    Returns:
        A class label when the rows are pure, or the first mode of the labels
        when no features remain; otherwise a ``Node`` whose ``children`` maps
        each observed value of the chosen feature to a subtree.
    """
    labels = data[target]
    if len(labels.unique()) == 1:
        return labels.values[0]
    if len(features) == 0:
        return labels.mode().values[0]

    gini_indices = {feature: gini_feature(data, feature) for feature in features}
    best_feature = min(gini_indices, key=gini_indices.get)
    # Built without Index.drop so that plain lists are accepted as well.
    remaining = [name for name in features if name != best_feature]

    # Only internal nodes need a Node object; leaves were returned above.
    node = Node(data, parent_feature, parent_feature_value)
    for value in data[best_feature].unique():
        child_data = data[data[best_feature] == value].drop([best_feature], axis=1)
        node.children[value] = build_cart_decision_tree(child_data, remaining, target, best_feature, value)
    return node
# The last column holds the class label; every column before it is a candidate split attribute.
target = "Decision"
features = df.columns[:-1]

decision_tree = build_cart_decision_tree(df, features, target)
def print_cart_decision_tree(node, indent=""):
    """Print a tree produced by the CART builder as indented text.

    Args:
        node: A ``Node`` or a bare class label (leaf).
        indent: Prefix for every line printed at this depth; each level of
            recursion appends two spaces.
    """
    # Leaves are plain labels rather than Node objects.
    if not isinstance(node, Node):
        print(indent + "Decision: " + str(node))
        return

    if node.parent_feature is not None:
        print(indent + "Feature: " + node.parent_feature)
        print(indent + "Parent Feature Value: " + str(node.parent_feature_value))

    if not node.children:
        print(indent + "Decision: " + str(node.data['Decision'].values[0]))
        return

    deeper = indent + "  "
    for value, child_node in node.children.items():
        print(indent + "Value: " + str(value))
        print_cart_decision_tree(child_node, deeper)
        
# Render the CART tree built above as indented text.
print_cart_decision_tree(decision_tree)


Value: Sunny
  Feature: Outlook
  Parent Feature Value: Sunny
  Value: High
    Decision: No
  Value: Normal
    Decision: Yes
Value: Overcast
  Decision: Yes
Value: Rain
  Feature: Outlook
  Parent Feature Value: Rain
  Value: Weak
    Decision: Yes
  Value: Strong
    Decision: No


## C4.5

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 

# Reload the dataset and discard its first column (presumably a row-id column -- confirm against data.csv).
df = pd.read_csv("data.csv").iloc[:, 1:]
def convertTemp(num):
    """Bucket a temperature reading into "Hot", "Mild" or "Cool".

    Args:
        num: Anything ``int()`` accepts (the notebook passes digit strings).

    Returns:
        "Hot" for readings >= 80, "Cool" for readings < 70, "Mild" otherwise.
    """
    degrees = int(num)
    if degrees < 70:
        return "Cool"
    if degrees < 80:
        return "Mild"
    return "Hot"

def convertHumidity(num):
    """Bucket a humidity reading: "High" when strictly above 80, otherwise "Normal".

    Args:
        num: Anything ``int()`` accepts (the notebook passes digit strings).
    """
    return "High" if int(num) > 80 else "Normal"

# Swap the raw readings for categorical buckets. Values are stringified first,
# then handed to the converters, which parse them back with int().
df['Temp'] = df['Temp'].astype(str).apply(convertTemp)
df['Humidity'] = df['Humidity'].astype(str).apply(convertHumidity)

print(df)

def entropy(data):
    """Shannon entropy (base 2) of the "Decision" column of ``data``.

    The previous version read the two class counts positionally from a
    label-indexed Series (deprecated in pandas) and ignored any class beyond
    the first two. This version sums ``-p * log2(p)`` over every observed class.

    Args:
        data: DataFrame containing a "Decision" column.

    Returns:
        The entropy in bits; 0 for a pure subset and 0.0 for an empty one.
    """
    class_counts = data["Decision"].value_counts()
    total = class_counts.sum()
    if total == 0:
        # No rows: nothing to be uncertain about (and avoids dividing by zero).
        return 0.0
    result = 0
    # Iterate the underlying NumPy values so the result keeps the NumPy float
    # type the original arithmetic produced.
    for count in class_counts.to_numpy():
        if count == 0:
            # Unobserved categories (categorical dtype) contribute nothing.
            continue
        ratio = count / total
        result += -ratio * math.log2(ratio)
    return result

def gain_ratio(data,feature):
    """C4.5 gain ratio of splitting ``data`` on ``feature``.

    The ratio is the information gain divided by the split information
    (the entropy of the partition sizes themselves).

    Args:
        data: DataFrame with the ``feature`` column and the label column
            that ``entropy`` reads.
        feature: Name of the column to split on.

    Returns:
        The gain ratio as a float, or 0.0 when the split information is zero
        (``feature`` takes a single value in ``data``). The old code divided
        by zero there, yielding NaN or an error that corrupted the caller's
        ``max()`` selection.
    """
    levels = data[feature].unique()
    subsets = [data[data[feature] == level] for level in levels]
    sizes = [len(subset) for subset in subsets]
    total = sum(sizes)

    conditional_entropy = sum(
        size / total * entropy(subset) for subset, size in zip(subsets, sizes)
    )
    gain = entropy(data) - conditional_entropy
    split_info = sum(-(size / total) * math.log2(size / total) for size in sizes)
    if split_info == 0:
        # A one-valued feature does not partition the rows: no gain to normalise.
        return 0.0
    return gain / split_info



# Gain ratio of every candidate attribute on the full dataset.
# (A bare `entropy(df)` used to precede these lines; its value was discarded.)
print(gain_ratio(df,"Outlook"))
print(gain_ratio(df,"Temp"))
print(gain_ratio(df,"Humidity"))
print(gain_ratio(df,"Wind"))

     Outlook  Temp Humidity    Wind Decision
0      Sunny   Hot     High    Weak       No
1      Sunny   Hot     High  Strong       No
2   Overcast   Hot   Normal    Weak      Yes
3       Rain  Mild     High    Weak      Yes
4       Rain  Cool   Normal    Weak      Yes
5       Rain  Cool   Normal  Strong       No
6   Overcast  Cool   Normal  Strong      Yes
7      Sunny  Mild     High    Weak       No
8      Sunny  Cool   Normal    Weak      Yes
9       Rain  Mild   Normal    Weak      Yes
10     Sunny  Mild   Normal  Strong      Yes
11  Overcast  Mild     High  Strong      Yes
12  Overcast   Hot   Normal    Weak      Yes
13      Rain  Mild   Normal  Strong       No
0.15642756242117528
0.018772646222418813
0.1087366695918781
0.04884861551152082


In [12]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

class Node:
    """Internal node of a decision tree.

    Attributes (all set in ``__init__``):
        data: Whatever rows the builder passed for this node.
        parent_feature: Feature the parent split on, or None.
        parent_feature_value: Value of ``parent_feature`` on the branch leading here, or None.
        children: Dict, initially empty, filled by the builder with value -> subtree entries.
    """

    def __init__(self, data, parent_feature=None, parent_feature_value=None):
        self.data, self.parent_feature, self.parent_feature_value = (
            data,
            parent_feature,
            parent_feature_value,
        )
        # A fresh dict per instance so nodes never share children.
        self.children = dict()

def c45_decision_tree(data, features, target, parent_feature=None, parent_feature_value=None):
    """Recursively grow a C4.5-style tree, splitting on the highest gain ratio.

    Args:
        data: DataFrame with the rows that reached this point of the tree.
        features: Sequence of column labels still available for splitting
            (a pandas Index or a plain list).
        target: Name of the class-label column. It is now honoured for the
            leaf checks; previously it was ignored in favour of a hard-coded
            'Decision'. Split scoring is still delegated to ``gain_ratio``.
        parent_feature: Feature the parent split on (None at the root).
        parent_feature_value: Value of ``parent_feature`` leading to this subtree.

    Returns:
        A class label when the rows are pure, or the first mode of the labels
        when no features remain; otherwise a ``Node`` whose ``children`` maps
        each observed value of the chosen feature to a subtree.
    """
    labels = data[target]
    if len(labels.unique()) == 1:
        return labels.values[0]
    if len(features) == 0:
        return labels.mode().values[0]

    gain_ratios = {feature: gain_ratio(data, feature) for feature in features}
    best_feature = max(gain_ratios, key=gain_ratios.get)
    # Built without Index.drop so that plain lists are accepted as well.
    remaining = [name for name in features if name != best_feature]

    # Only internal nodes need a Node object; leaves were returned above.
    node = Node(data, parent_feature, parent_feature_value)
    for value in data[best_feature].unique():
        child_data = data[data[best_feature] == value].drop([best_feature], axis=1)
        node.children[value] = c45_decision_tree(child_data, remaining, target, best_feature, value)
    return node

# The last column holds the class label; every column before it is a candidate split attribute.
target = "Decision"
features = df.columns[:-1]
decision_tree = c45_decision_tree(df, features, target)

def print_c45_decision_tree(node, indent=""):
    """Print a tree produced by the C4.5 builder as indented text.

    The recursion now calls this function itself. It used to call
    ``print_decision_tree``, a printer defined in a different cell, so this
    cell failed unless that other cell had been run first.

    Args:
        node: A ``Node`` or a bare class label (leaf).
        indent: Prefix for every line printed at this depth; each level of
            recursion appends two spaces.
    """
    if isinstance(node, Node):
        if node.parent_feature is not None:
            print(indent + "Feature: " + node.parent_feature)
            print(indent + "Parent Feature Value: " + str(node.parent_feature_value))

        if len(node.children) == 0:
            print(indent + "Decision: " + str(node.data['Decision'].values[0]))
        else:
            for value, child_node in node.children.items():
                print(indent + "Value: " + str(value))
                print_c45_decision_tree(child_node, indent + "  ")
    else:
        # Leaves are plain labels rather than Node objects.
        print(indent + "Decision: " + str(node))

# Render the C4.5 tree built above as indented text.
print_c45_decision_tree(decision_tree)

Value: Sunny
  Feature: Outlook
  Parent Feature Value: Sunny
  Value: High
    Decision: No
  Value: Normal
    Decision: Yes
Value: Overcast
  Decision: Yes
Value: Rain
  Feature: Outlook
  Parent Feature Value: Rain
  Value: Weak
    Decision: Yes
  Value: Strong
    Decision: No
