In [None]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import os

In [11]:
# Cell 2: Entropy Function
def entropy(labels, base=None):
    """Return the Shannon entropy of a collection of class labels.

    Parameters
    ----------
    labels : iterable
        Class labels; anything accepted by ``pd.Series``. Missing values are
        ignored because ``value_counts`` drops them by default.
    base : float, optional
        Logarithm base. ``None`` (the default) means base 2, so the result
        is expressed in bits.

    Returns
    -------
    float
        Entropy of the label distribution; ``0.0`` when every label belongs
        to the same class.
    """
    base = 2 if base is None else base
    probs = pd.Series(labels).value_counts(normalize=True, sort=False)
    # Categorical labels report unobserved categories with probability 0;
    # drop them so that 0 * log(0) never enters the sum as NaN.
    probs = probs[probs > 0]
    # Adding 0.0 turns the IEEE negative zero of a pure label set into a
    # plain 0.0, so the value never prints as "-0.00".
    return -(probs * np.log(probs) / np.log(base)).sum() + 0.0

In [18]:
# Cell 3: Misclassification Function
def misclassification(labels):
    """Misclassification impurity: one minus the share of the majority class.

    Parameters
    ----------
    labels : pd.Series
        Class labels of the samples in a node.

    Returns
    -------
    float
        ``1 - (count of the most frequent class / len(labels))``.

    Raises
    ------
    ValueError
        If ``labels`` is not a pandas Series.
    """
    if not isinstance(labels, pd.Series):
        raise ValueError('Input must be a Pandas Series.')

    # Size of the majority class relative to every row in the Series.
    majority_count = labels.value_counts().max()
    majority_share = majority_count / len(labels)
    return 1 - majority_share

In [13]:
# Cell 4: Gini Function
def gini(y):
    """Gini impurity of a label Series: one minus the sum of squared class shares.

    Parameters
    ----------
    y : pd.Series
        Class labels of the samples in a node.

    Returns
    -------
    float
        ``1 - sum(p_k ** 2)`` where ``p_k`` is each class count divided by
        the number of rows in ``y``.

    Raises
    ------
    ValueError
        If ``y`` is not a pandas Series.
    """
    if not isinstance(y, pd.Series):
        raise ValueError('Input must be a Pandas Series.')

    class_shares = y.value_counts() / y.shape[0]
    squared_total = np.sum(class_shares ** 2)
    return 1 - squared_total

In [None]:
# Cell 5: Gain Function
def calculate_gain(subset_splits, criterion="gini"):
    """Return the impurity reduction achieved by a candidate split.

    Parameters
    ----------
    subset_splits : iterable of (value, labels)
        One pair per branch of the split; ``labels`` is the pandas Series of
        class labels that fall into that branch. ``value`` is not used here.
    criterion : {"gini", "entropy", "misclassification"}
        Impurity measure applied before and after the split.

    Returns
    -------
    float
        Impurity of all labels combined minus the size-weighted average
        impurity of the individual subsets.

    Raises
    ------
    ValueError
        If ``criterion`` is not recognised, or if the split holds no samples.
    """
    # Resolve the impurity function once so the "before" and "after"
    # computations can never use different measures.
    impurity_functions = {
        "gini": gini,
        "entropy": entropy,
        "misclassification": misclassification,
    }
    if criterion not in impurity_functions:
        raise ValueError("Unknown criterion")
    impurity = impurity_functions[criterion]

    # Materialise the labels: they are traversed more than once, which would
    # silently exhaust a generator passed in as ``subset_splits``.
    label_groups = [labels for _, labels in subset_splits]
    total_samples = sum(len(labels) for labels in label_groups)
    if total_samples == 0:
        # Without samples the weights below would divide by zero.
        raise ValueError("Cannot calculate gain for a split with no samples")

    impurity_before = impurity(pd.concat(label_groups))

    impurity_after = 0
    for labels in label_groups:
        if len(labels) == 0:
            # An empty subset has zero weight; skipping it keeps a NaN
            # impurity (e.g. misclassification of no rows) out of the total.
            continue
        weight = len(labels) / total_samples
        impurity_after += weight * impurity(labels)

    # Gain = reduction in impurity obtained by splitting.
    return impurity_before - impurity_after

In [None]:
# Cell 6: Dataset Split Function
def dataset_split_by_feature(dataset, feature, label):
    """Partition the label column by the distinct values of one feature.

    Parameters
    ----------
    dataset : pd.DataFrame
        Table holding both the feature column and the label column.
    feature : hashable
        Name of the column to split on.
    label : hashable
        Name of the column holding the class labels.

    Returns
    -------
    list of (value, pd.Series)
        One pair per distinct feature value, in the order returned by
        ``Series.unique()``. Rows whose feature value is missing form a
        single group of their own.
    """
    column = dataset[feature]
    splits = []
    missing_group_added = False

    for value in column.unique():
        if pd.isna(value):
            # NaN/None never compare equal, so ``column == value`` would
            # select no rows; collect missing values with isna() instead.
            if missing_group_added:
                # Different missing sentinels (None, NaN) share one group.
                continue
            missing_group_added = True
            mask = column.isna()
        else:
            mask = column == value
        # Keep only the labels of the rows that fall into this branch.
        splits.append((value, dataset.loc[mask, label]))

    return splits

In [None]:
# Cell 7: Find Best Split Function
def find_best_split(dataset, label, criterion="gini"):
    """Pick the feature whose split yields the largest gain.

    Parameters
    ----------
    dataset : pd.DataFrame
        Table containing the feature columns and the label column.
    label : hashable
        Name of the label column; every other column is tried as a feature.
    criterion : str
        Impurity measure forwarded to ``calculate_gain``.

    Returns
    -------
    tuple
        ``(best_feature, best_gain)``. When no feature beats the initial
        threshold of -1 (for instance when there are no feature columns),
        the result is ``(None, -1)``.
    """
    winner, top_gain = None, -1

    for column in dataset.columns:
        if column == label:
            # The label itself is never a split candidate.
            continue

        candidate_gain = calculate_gain(
            dataset_split_by_feature(dataset, column, label),
            criterion,
        )

        # Strict comparison: on ties the earliest column is kept.
        if candidate_gain > top_gain:
            winner, top_gain = column, candidate_gain

    return winner, top_gain

In [None]:
if __name__ == '__main__':
    name = "DeLaRiva_Skyye"
    os.makedirs(name=name, exist_ok=True)

    # Load the dataset and preview it so the column names can be checked.
    data = pd.read_csv("agaricus-lepiota.csv")
    print(data.head())
    print(f"\nColumns: {data.columns.tolist()}")

    label = "class"
    criteria = ["gini", "entropy", "misclassification"]

    # The context manager closes the report file even if a split computation
    # raises part-way through the loop.
    with open(f"{name}/mushrooms.txt", "w") as f:
        for criterion in criteria:
            best_feature, best_gain = find_best_split(data, label, criterion)
            print(f"Best feature: {best_feature} Using: {criterion} - Gain: {best_gain:.2f}")
            f.write(f"Best feature: {best_feature} Using: {criterion}-Gain: {best_gain:.2f}\n")