In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Question - 1

In [8]:
df = pd.read_csv('weather_data.csv', names=['day', 'outlook', 'temp', 'humidity', 'wind', 'decision'], header=0)
df

Unnamed: 0,day,outlook,temp,humidity,wind,decision
0,1,Sunny,85,85,Weak,No
1,2,Sunny,80,90,Strong,No
2,3,Overcast,83,78,Weak,Yes
3,4,Rain,70,96,Weak,Yes
4,5,Rain,68,80,Weak,Yes
5,6,Rain,65,70,Strong,No
6,7,Overcast,64,65,Strong,Yes
7,8,Sunny,72,95,Weak,No
8,9,Sunny,69,70,Weak,Yes
9,10,Rain,75,80,Weak,Yes


In [19]:
def calc_entropy(df, target_col):
    total = len(df)
    vals = df[target_col].value_counts()
    entropy = 0
    for val in vals:
        entropy += (val / total) * np.log(val / total)
    entropy *= -1
    return entropy

def calc_info_gain(df, predictor_col, target_col):
    total = len(df)
    initial_entropy = calc_entropy(df, target_col)
    cats = df[predictor_col].value_counts()
    entropy_after_split = 0
    for cat in cats.index:
        entropy_after_split += (cats[cat] / total) * calc_entropy(df[df[predictor_col] == cat], target_col)
        
    return initial_entropy - entropy_after_split

def calc_gain_ratio(df, predictor_col, target_col):
    total = len(df)
    info_gain = calc_info_gain(df, predictor_col, target_col)
    normalize = 0
    vals = df[predictor_col].value_counts()
    for val in vals:
        normalize += (val / total) * np.log(val / total)
    normalize *= -1
    
    gain_ratio = info_gain / normalize
    #print(info_gain, normalize)
    
    return gain_ratio
    
calc_gain_ratio(df, 'outlook', 'decision')

0.15642756242117511

In [20]:
for col in df.columns[:-1]:
    gain = calc_gain_ratio(df, col, 'decision')
    print(f"Information Gain for column {col}: {gain}")

Information Gain for column day: 0.24696566984684304
Information Gain for column outlook: 0.15642756242117511
Information Gain for column temp: 0.2264367373502071
Information Gain for column humidity: 0.13529065115207334
Information Gain for column wind: 0.04884861551152079


In [22]:
# Function to recursively build the decision tree using ID3 algorithm
def c45(data, original_data, features, target_attribute, parent_node_class=None):
    # Base case 1: If all target values are the same, return the class
    if len(np.unique(data[target_attribute])) <= 1:
        return np.unique(data[target_attribute])[0]
    
    # Base case 2: If dataset is empty, return the mode target feature value from the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_attribute])[np.argmax(np.unique(original_data[target_attribute], return_counts=True)[1])]
    
    # Base case 3: If the feature space is empty, return the parent node class
    elif len(features) == 0:
        return parent_node_class
    
    # Recursive case: Build the tree
    else:
        # Store the parent node class (the mode target feature value)
        parent_node_class = np.unique(data[target_attribute])[np.argmax(np.unique(data[target_attribute], return_counts=True)[1])]
        
        # Select the feature that best splits the dataset
        item_values = [calc_gain_ratio(data, feature, target_attribute) for feature in features]  # List of information gains
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        # Create the tree structure
        tree = {best_feature: {}}
        
        # Remove the best feature from the feature list
        features = [i for i in features if i != best_feature]
        
        # Grow the tree branch by branch for each feature in the best split
        for value in np.unique(data[best_feature]):
            sub_data = data[data[best_feature] == value]
            subtree = c45(sub_data, original_data, features, target_attribute, parent_node_class)
            tree[best_feature][value] = subtree
        
        return tree
    
    
def c45_predict(tree, data):
    sub_tree = tree
    while isinstance(sub_tree, dict):
        feature = list(sub_tree.keys())[0]
        sub_tree = sub_tree[feature][data[feature]]
    return sub_tree

In [28]:
tree = c45(df, df, df.columns[: -1], 'decision')
tree

{'day': {1: 'No',
  2: 'No',
  3: 'Yes',
  4: 'Yes',
  5: 'Yes',
  6: 'No',
  7: 'Yes',
  8: 'No',
  9: 'Yes',
  10: 'Yes',
  11: 'Yes',
  12: 'Yes',
  13: 'Yes',
  14: 'No'}}

# Question - 2

In [41]:
def calc_gini(df, target_col):
    total = len(df)
    vals = df[target_col].value_counts()
    gini = 1
    for val in vals:
        gini -= (val / total) ** 2
    return gini

def calc_gini_gain(df, predictor_col, target_col):
    total = len(df)
    initial_gini_index = calc_gini(df, target_col)
    cats = df[predictor_col].value_counts()
    gini_index_after_split = 0
    for cat in cats.index:
        gini_index_after_split += (cats[cat] / total) * calc_gini(df[df[predictor_col] == cat], target_col)
    gini_gain = initial_gini_index - gini_index_after_split
    return gini_gain
        

# Function to recursively build the decision tree using ID3 algorithm
def cart(data, original_data, features, target_attribute, parent_node_class=None):
    # Base case 1: If all target values are the same, return the class
    if len(np.unique(data[target_attribute])) <= 1:
        return np.unique(data[target_attribute])[0]
    
    # Base case 2: If dataset is empty, return the mode target feature value from the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_attribute])[np.argmax(np.unique(original_data[target_attribute], return_counts=True)[1])]
    
    # Base case 3: If the feature space is empty, return the parent node class
    elif len(features) == 0:
        return parent_node_class
    
    # Recursive case: Build the tree
    else:
        # Store the parent node class (the mode target feature value)
        parent_node_class = np.unique(data[target_attribute])[np.argmax(np.unique(data[target_attribute], return_counts=True)[1])]
        
        # Select the feature that best splits the dataset
        item_values = [calc_gini_gain(data, feature, target_attribute) for feature in features]  # List of information gains
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        # Create the tree structure
        tree = {best_feature: {}}
        
        # Remove the best feature from the feature list
        features = [i for i in features if i != best_feature]
        
        # Grow the tree branch by branch for each feature in the best split
        for value in np.unique(data[best_feature]):
            sub_data = data[data[best_feature] == value]
            subtree = cart(sub_data, original_data, features, target_attribute, parent_node_class)
            tree[best_feature][value] = subtree
        
        return tree
    
    
def cart_predict(tree, data):
    sub_tree = tree
    while isinstance(sub_tree, dict):
        feature = list(sub_tree.keys())[0]
        sub_tree = sub_tree[feature][data[feature]]
    return sub_tree

In [42]:
for col in df.columns[:-1]:
    gain = calc_gini_index(df, col, 'decision')
    print(f"Gini Gain for column {col}: {gain}")

Gini Gain for column day: 0.4591836734693877
Gini Gain for column outlook: 0.11632653061224485
Gini Gain for column temp: 0.3877551020408163
Gini Gain for column humidity: 0.19727891156462585
Gini Gain for column wind: 0.030612244897959162


In [44]:
tree = cart(df, df, df.columns[1: -1], 'decision')
tree

{'temp': {64: 'Yes',
  65: 'No',
  68: 'Yes',
  69: 'Yes',
  70: 'Yes',
  71: 'No',
  72: {'outlook': {'Overcast': 'Yes', 'Sunny': 'No'}},
  75: 'Yes',
  80: 'No',
  81: 'Yes',
  83: 'Yes',
  85: 'No'}}

# Question - 3

In [62]:
df = pd.read_csv('loan_data.csv')
df

Unnamed: 0,income,credit,approved
0,low,good,yes
1,low,bad,no
2,medium,good,yes
3,medium,bad,yes
4,high,good,yes
5,high,bad,no


In [58]:
from pprint import pprint

tree = c45(df, df, df.columns[: -1], 'approved')
print("C4.5 Algorithm")
pprint(tree)

print("\nCART Algorithm")
tree = cart(df, df, df.columns[: -1], 'approved')
pprint(tree)

C4.5 Algorithm
{'credit': {'bad': {'income': {'high': 'no', 'low': 'no', 'medium': 'yes'}},
            'good': 'yes'}}

CART Algorithm
{'credit': {'bad': {'income': {'high': 'no', 'low': 'no', 'medium': 'yes'}},
            'good': 'yes'}}


In [60]:
cart_predict(tree, {'income': 'medium', 'credit': 'bad'})

'yes'

In [67]:
from sklearn.tree import DecisionTreeClassifier

# Initialize DecisionTreeClassifier (CART with Gini impurity)
clf = DecisionTreeClassifier(criterion='gini', random_state=42)

# Convert categorical variables to numeric using one-hot encoding
df_encoded = pd.get_dummies(df.drop('approved', axis=1))  # Drop target column for one-hot encoding
x = df_encoded
y = df['approved']

print(x)

# Train the model
clf.fit(x, y)

# Make predictions
y_pred = clf.predict([[False, False, True, True, False]])

y_pred

   income_high  income_low  income_medium  credit_bad  credit_good
0        False        True          False       False         True
1        False        True          False        True        False
2        False       False           True       False         True
3        False       False           True        True        False
4         True       False          False       False         True
5         True       False          False        True        False




array(['yes'], dtype=object)