In [16]:
import pandas as pd

In [17]:
# Load the semicolon-separated car data set into the notebook-wide frame `df`.
df = pd.read_csv("gini_data.csv", sep=";")

In [18]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Horsepower,Seats,Color,Sold
0,485,2,silver,fast
1,485,2,silver,fast
2,485,2,white,fast
3,180,4,black,fast
4,485,4,white,fast


In [19]:
def geani_impurity(df, feature, target, num_borders=None):
    """Tabulate the weighted Gini impurity of splitting ``df`` on ``feature``.

    (The function name is a misspelling of "Gini"; it is kept so existing
    callers continue to work.)

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``feature`` and ``target`` columns.
    feature : str
        Column to split on.
    target : str
        Column holding the class labels.
    num_borders : sequence of (low, high) pairs, optional
        When given (and non-empty), ``feature`` is treated as numeric and
        partitioned into the half-open ranges ``low <= value < high``.
        Otherwise every distinct value of ``feature`` forms its own partition.

    Returns
    -------
    pandas.DataFrame
        One row per partition with the columns ``FeatureVal``, one column per
        distinct target value (the share of that class inside the partition),
        ``1-p^2..`` (the partition's Gini impurity) and ``WeightedImpurity``
        (impurity times the partition's share of all rows), followed by a
        ``TOTAL:`` row whose ``WeightedImpurity`` is the sum over all
        partitions. Every row carries index label 0.

    Notes
    -----
    A partition that matches no rows (an empty bin, or a NaN feature value,
    which never compares equal to itself) is reported with all shares,
    impurity and weighted impurity set to 0.0 instead of dividing by zero.
    An empty ``df`` yields just the ``TOTAL:`` row.
    """
    use_bins = bool(num_borders)
    if use_bins:
        feature_categories = num_borders
    else:
        feature_categories = pd.unique(df[feature])
    target_categories = list(pd.unique(df[target]))
    total_values = len(df)

    # The column layout depends only on the target classes, so build it once.
    cols = ["FeatureVal"] + target_categories + ["1-p^2..", "WeightedImpurity"]

    frames = []
    weighted_total = 0.0

    for label in feature_categories:
        if use_bins:
            low, high = label
            label = f"{low} <= {feature} < {high}"
            sub_df = df[(df[feature] >= low) & (df[feature] < high)]
        else:
            sub_df = df[df[feature] == label]
        label_count = len(sub_df)

        if label_count:
            # Share of each target class inside this partition.
            shares = [
                int((sub_df[target] == target_value).sum()) / label_count
                for target_value in target_categories
            ]
            impurity = 1 - sum(share ** 2 for share in shares)
            impurity_weighted = impurity * label_count / total_values
        else:
            # Empty partition: it has zero weight, so it contributes nothing.
            shares = [0.0] * len(target_categories)
            impurity = 0.0
            impurity_weighted = 0.0

        weighted_total += impurity_weighted
        row = [label] + shares + [impurity, impurity_weighted]
        frames.append(pd.DataFrame(data=[row], columns=cols))

    total = ["TOTAL:"] + [""] * len(target_categories) + ["", weighted_total]
    frames.append(pd.DataFrame(data=[total], columns=cols))

    # DataFrame.append was removed in pandas 2.0; a single concat replaces the
    # repeated appends. No ignore_index, so each row keeps index label 0 exactly
    # as the append-based version produced.
    return pd.concat(frames)


In [20]:
# Half-open seat ranges (low <= Seats < high) used to bucket the numeric feature.
bins = [
    (0, 3),
    (3, 4.5),
    (4.5, 100),
]
result = geani_impurity(df, "Seats", "Sold", bins)

In [21]:
# Show the impurity table for the Seats split.
result.head(n=5)

Unnamed: 0,FeatureVal,fast,shelf-warmer,normal,1-p^2..,WeightedImpurity
0,0 <= Seats < 3,1.0,0.0,0.0,0.0,0.0
0,3 <= Seats < 4.5,0.666667,0.333333,0.0,0.444444,0.066667
0,4.5 <= Seats < 100,0.214286,0.357143,0.428571,0.642857,0.45
0,TOTAL:,,,,,0.516667


In [22]:
# Gini impurity when splitting on the categorical Color feature.
result = geani_impurity(df, feature="Color", target="Sold")

In [23]:
# Show the impurity table for the Color split (up to ten rows).
result.head(n=10)

Unnamed: 0,FeatureVal,fast,shelf-warmer,normal,1-p^2..,WeightedImpurity
0,silver,0.5,0.25,0.25,0.625,0.125
0,white,0.333333,0.333333,0.333333,0.666667,0.2
0,black,0.666667,0.333333,0.0,0.444444,0.133333
0,red,0.0,0.25,0.75,0.375,0.075
0,TOTAL:,,,,,0.533333


In [24]:
# Calculate the Gini impurity of Color after the Seats split, for the node Seats >= 4.5.
# The filter uses >= so the node has the same inclusive lower bound as the
# "4.5 <= Seats < 100" bin; a strict > would drop a Seats value of exactly 4.5.
split_df = df[df["Seats"] >= 4.5]
result = geani_impurity(split_df, feature="Color", target="Sold")
result.head()

Unnamed: 0,FeatureVal,fast,shelf-warmer,normal,1-p^2..,WeightedImpurity
0,black,0.75,0.25,0.0,0.375,0.107143
0,red,0.0,0.25,0.75,0.375,0.107143
0,silver,0.0,0.5,0.5,0.5,0.071429
0,white,0.0,0.5,0.5,0.5,0.142857
0,TOTAL:,,,,,0.428571
