## 1- Training Step 

### Task 1 : `Read csv file`

In [37]:
import pandas as pd 
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [38]:
# Load the golf dataset; no index column is specified, so rows keep the
# default positional index.
file = "data/golf.csv"
data = pd.read_csv(filepath_or_buffer=file)

In [120]:
# Print the name of every column (attribute) of the dataset, one per line
for e in data.columns.tolist():
    print(e)

outlook
temp
humidity
wind
play


In [121]:
# Show the dataset as plain text: one row per observation, one column per
# attribute, with the target column `play` last
print(data)

     outlook  temp humidity   wind play
0      sunny   hot     high  False   no
1      sunny   hot     high   True   no
2   overcast   hot     high  False  yes
3       rain  mild     high  False  yes
4       rain  cool   normal  False  yes
5       rain  cool   normal   True   no
6   overcast  cool   normal   True  yes
7      sunny  mild     high  False   no
8      sunny  cool   normal  False  yes
9       rain  mild   normal  False  yes
10     sunny  mild   normal   True  yes
11  overcast  mild     high   True  yes
12  overcast   hot   normal  False  yes
13      rain  mild     high   True   no


### Task 2 : `Construction of ID3 Tree`

In [130]:
# Shannon entropy of a label column
def calc_entropy(my_target):
    """Return the Shannon entropy (base 2) of the labels in ``my_target``.

    Args:
        my_target: pandas Series of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. A column with a single distinct label gives 0.
    """
    n_samples = len(my_target)

    # Relative frequency of each distinct label
    class_probs = my_target.value_counts() / n_samples

    # Per-class contribution p * log2(p); the entropy is the negated total
    weighted_logs = class_probs * np.log2(class_probs)
    return -sum(weighted_logs)

In [138]:
# define the information_gain function
def calc_information_gain(data, split_attribute_name, target_name):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropy inside each partition of the split attribute::

        gain = H(target) - sum_v (|S_v| / |S|) * H(target | attribute == v)

    Args:
        data: pandas DataFrame containing both columns.
        split_attribute_name: name of the column to split on.
        target_name: name of the column holding the class labels.

    Returns:
        The information gain as a number (0 for an empty frame).
    """
    n_rows = len(data)

    # Entropy of the target before any split
    total_entropy = calc_entropy(data[target_name])

    # groupby yields one sub-frame per distinct attribute value, in sorted
    # order. Rows whose attribute value is missing belong to no partition and
    # are skipped instead of breaking the computation.
    weighted_entropy = sum(
        (len(subset) / n_rows) * calc_entropy(subset[target_name])
        for _, subset in data.groupby(split_attribute_name)
    )

    return total_entropy - weighted_entropy

In [139]:
def best_split(data, features, target_name):
    """Return the feature in ``features`` with the highest information gain.

    Args:
        data: dataset passed through to ``calc_information_gain``.
        features: indexable sequence of candidate column names.
        target_name: name of the class-label column.

    Returns:
        The candidate with the largest gain; on ties, the earliest one in
        ``features`` wins.
    """
    gains = []
    for candidate in features:
        gains.append(calc_information_gain(data, candidate, target_name))

    # argmax gives the position of the first maximum
    winner = np.argmax(gains)
    return features[winner]

In [140]:
def split_data(data, split_attribute_name):
    """Partition ``data`` by the distinct values of one column.

    Args:
        data: pandas DataFrame to partition.
        split_attribute_name: name of the column whose values define the parts.

    Returns:
        Dict mapping each distinct value (in order of first appearance) to the
        rows having that value, with rows containing any missing value dropped.
    """
    column = data[split_attribute_name]
    partitions = {}
    for value in column.unique():
        matching_rows = data[column == value]
        partitions[value] = matching_rows.dropna()
    return partitions

In [141]:
def build_tree(data, features, target_name):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        data: pandas DataFrame with the feature columns and the target column.
        features: list of column names still available for splitting.
        target_name: name of the class-label column.

    Returns:
        Either a class label (leaf) or ``{feature: {value: subtree_or_label}}``.
    """
    labels, label_counts = np.unique(data[target_name], return_counts=True)

    # Leaf: the node is pure
    if len(labels) == 1:
        return labels[0]

    # Leaf: nothing left to split on, fall back to the majority class
    if len(features) == 0:
        return labels[np.argmax(label_counts)]

    # Internal node: split on the most informative feature; each feature is
    # used at most once along a path
    split_on = best_split(data, features, target_name)
    candidates_left = [name for name in features if name != split_on]

    branches = {
        value: build_tree(subset, candidates_left, target_name)
        for value, subset in split_data(data, split_on).items()
    }
    return {split_on: branches}

In [142]:
def predict(query, tree):
    """Classify ``query`` by walking the decision tree.

    Args:
        query: mapping from attribute name to value. Anything exposing
            ``keys()`` and ``[]`` works, e.g. a dict or a pandas Series row.
        tree: nested dict ``{attribute: {value: subtree_or_label}}`` or a bare
            class label. ``build_tree`` returns a bare label when every
            training row has the same class.

    Returns:
        The class label reached, or ``None`` when a node tests an attribute
        that ``query`` does not provide.

    Raises:
        KeyError: if the query's value for a tested attribute has no branch.
    """
    node = tree

    # A non-dict node is a leaf, i.e. the predicted label. Checking the type
    # first avoids running `key in "yes"` as a substring test on leaf labels.
    while isinstance(node, dict):
        for attribute in query.keys():
            if attribute in node:
                # Follow the branch matching the query's value
                node = node[attribute][query[attribute]]
                break
        else:
            # None of the query's attributes is tested at this node
            return None

    return node

### Task 3 : `Displaying the ID3 Tree`

In [143]:
# Define a function in order to convert all ID3 tree keys to strings
def convert_keys_to_strings(dictionary):
    """Return a copy of a nested dict in which every key is a ``str``.

    JSON object keys must be strings, and branch values taken from a pandas
    column can be numpy scalars (e.g. ``np.bool_``).

    Args:
        dictionary: possibly nested dict; nested dicts are converted
            recursively, other values are kept unchanged.

    Returns:
        A new dict with the same structure and string keys.
    """
    new_dict = {}

    for key, value in dictionary.items():
        # Unwrap any numpy scalar (bool, int, float, ...) to the matching
        # Python object. `.item()` replaces `np.asscalar`, which no longer
        # exists in current NumPy releases.
        if isinstance(key, np.generic):
            key = key.item()

        # Recursively convert nested dictionaries
        if isinstance(value, dict):
            value = convert_keys_to_strings(value)

        new_dict[str(key)] = value

    return new_dict


In [144]:
# The last column is assumed to be the target; every column before it is a feature
features = data.columns[:-1].tolist()
target_name = data.columns.tolist()[-1]

# Learn the ID3 tree from the whole dataset
tree = build_tree(data, features, target_name)

# Persist the tree as JSON (keys are stringified first, since JSON object keys
# must be strings)
with open("sample.json", "w") as outfile:
    outfile.write(json.dumps(convert_keys_to_strings(tree)))

# Pretty-print the same tree for inspection
print(json.dumps(convert_keys_to_strings(tree), indent=4))

{
    "outlook": {
        "sunny": {
            "humidity": {
                "high": "no",
                "normal": "yes"
            }
        },
        "overcast": "yes",
        "rain": {
            "wind": {
                "False": "yes",
                "True": "no"
            }
        }
    }
}


In [None]:
def plot_tree(tree, parent_name, level=0):
    """Draw a nested-dict decision tree on the current matplotlib axes.

    Depth grows along the x axis. Every leaf gets its own row on the y axis
    and every internal node sits at the mean row of its children, so nodes do
    not overlap. Each parent is joined to each child by a black line.

    Args:
        tree: nested dict ``{key: subtree_or_leaf}`` (or a bare leaf value).
        parent_name: label drawn for the root node of ``tree``.
        level: x position of the root node; children are drawn at ``level + 1``
            and so on.

    Internal nodes are green. Leaves are labelled ``"<key>: <value>"`` and
    drawn red when the value is ``'yes'`` and blue otherwise.
    """
    ax = plt.gca()
    next_leaf_row = 0  # leaves are stacked downwards, one row each

    def draw(label, node, depth):
        """Draw ``node`` in column ``depth`` and return the row it was placed on."""
        nonlocal next_leaf_row

        if isinstance(node, dict) and node:
            # Place the children first so this node can be centred on them
            child_rows = [draw(key, child, depth + 1) for key, child in node.items()]
            row = sum(child_rows) / len(child_rows)
            for child_row in child_rows:
                # Numeric coordinates at both ends of every edge
                ax.plot([depth, depth + 1], [row, child_row], color='k', zorder=1)
            ax.scatter(depth, row, color='g', zorder=2)
            ax.text(depth, row, str(label), ha='left', va='bottom')
        else:
            row = -next_leaf_row
            next_leaf_row += 1
            colour = 'r' if node == 'yes' else 'b'
            ax.scatter(depth, row, color=colour, zorder=2)
            ax.text(depth, row, f"{label}: {node}", color=colour, ha='left', va='bottom')

        return row

    draw(parent_name, tree, level)
            
# Render the learned tree on a wide canvas; the axis ticks and frame carry no
# meaning for a tree diagram, so they are hidden.
plt.figure(figsize=(10, 5))
plot_tree(tree, parent_name='Root')
plt.gca().set_axis_off()
plt.show()

### Task 4 : `Define the Confusion Matrix denoted as m_app`

In [119]:
def compute_confusion_matrix(data, tree, target_name, positive_label="yes"):
    """Predict every row of ``data`` with ``tree`` and tally a 2x2 confusion matrix.

    Args:
        data: pandas DataFrame holding the attribute columns and the target column.
        tree: decision tree accepted by ``predict``.
        target_name: name of the column with the actual labels.
        positive_label: label counted as the positive class (default ``"yes"``).

    Returns:
        ``np.array([[TP, FP], [FN, TN]])``. A prediction that differs from the
        actual label counts as FP when it equals ``positive_label`` and as FN
        otherwise.

    Side effects:
        Prints the per-row predictions.
    """
    # Make predictions using the tree
    predictions = data.apply(lambda row: predict(row, tree), axis=1)
    print("predictions based of S_app: ")
    print(predictions)
    print("\n")

    tp = tn = fp = fn = 0

    # Pair predictions and actual labels by position. Indexing the prediction
    # Series by label (`predictions[i]`) only matches `data.iloc[i]` when the
    # frame has the default 0..n-1 index.
    actual_labels = data[target_name].to_numpy()
    for predicted, actual in zip(predictions.to_numpy(), actual_labels):
        is_positive = predicted == positive_label
        if predicted == actual:
            if is_positive:
                tp += 1  # True Positive
            else:
                tn += 1  # True Negative
        else:
            if is_positive:
                fp += 1  # False Positive
            else:
                fn += 1  # False Negative

    # Create the confusion matrix
    m_app = np.array([[tp, fp], [fn, tn]])

    return m_app

# Evaluate the tree on `data`, the same rows it was built from, so this is the
# confusion matrix on the training set.
# Layout of the returned array: [[TP, FP], [FN, TN]], with "yes" as the positive class.
m_app = compute_confusion_matrix(data, tree, target_name)

print("Confusion Matrix for Training (m_app):\n", m_app)


predictions based of S_app: 
0      no
1      no
2     yes
3     yes
4     yes
5      no
6     yes
7      no
8     yes
9     yes
10    yes
11    yes
12    yes
13     no
dtype: object


Confusion Matrix for Training (m_app):
 [[9 0]
 [0 5]]
