In [8]:
import pandas as pd
import numpy as np

In [24]:
df = pd.read_csv("./fish.csv")
df

Unnamed: 0,length,weight,type
0,8.618381,7.848030,tuna
1,2.522046,8.762507,tuna
2,2.035690,7.101971,tuna
3,6.695008,4.953932,salmon
4,9.834069,7.908453,tuna
...,...,...,...
995,3.710186,0.956612,salmon
996,3.749596,9.557450,salmon
997,2.410087,3.355781,tuna
998,0.990773,3.812210,tuna


In [25]:
df["type"].unique()
# There are 2 classes, tuna or salmon

array(['tuna', 'salmon'], dtype=object)

In [26]:
score_dict = {"tuna":0, "salmon":0} # Just a sample of how to keep track of leaves

In [27]:
def gini_impurity(score_dict):
    """
    This function takes input a dictionary in the form ${"tuna":0, "salmon":0}$
    and will return the impurity of the leaf
    """
    gini_tuna, gini_salmon = 0,0
    
    tuna_score = score_dict["tuna"]
    salmon_score = score_dict["salmon"]

    tuna_prob = tuna_score / (tuna_score + salmon_score)
    salmon_prob = salmon_score / (tuna_score + salmon_score)

    gini_impurity = 1 - (tuna_prob)**2 - (salmon_prob)**2
    return gini_impurity

In [28]:
def calc_total_gini(leaf1,leaf2):
    """
    This function takes input 2 dictionary in the form ${"tuna":0, "salmon":0}$
    and will return the total gini impurity of the leaf based on calculation from both
    """

    Gini_impurity_1 = gini_impurity(leaf1)
    Gini_impurity_2 = gini_impurity(leaf2)
    total = sum(leaf1.values()) + sum(leaf2.values()) 
    # Calulate leaf impurity
    Leaf1_impurity = sum(leaf1.values()) / total
    Leaf2_impurity = sum(leaf2.values()) / total

    Total_gini = ((Gini_impurity_1 * Leaf1_impurity) + (Gini_impurity_2 * Leaf2_impurity))
    return Total_gini
    

In [29]:
def make_score(column, thresold):
    """
    This function creates the score_dictionary used in several functions
    The inputs are columns and thresolds
    The scores are calculated from a column based on each value
    """
    score_dict_greater = {"tuna":0, "salmon":0}
    score_dict_smaller = {"tuna":0, "salmon":0}
    for i in range(0,len(df)):
        if df[column][i] < thresold:
            score_dict_smaller[df["type"][i]] += 1
        elif df[column][i] > thresold:
            score_dict_greater[df["type"][i]] += 1 
    return score_dict_greater, score_dict_smaller
    

In [48]:
def Best_Thresold(row_name, df):
    """
    - Takes the row name as input, I have done this since both rows are numerical data
    - Sorts the array and finds the midpoint between 2 adjacent values to figure out the possible thresolds
    - Then it iterates through those thresolds
    - While iteration:
        The algorithm creates 2 leaves
        Calculates the total gini impurity for them
        Keeps track of the lowest gini impurity and the Best thresold
    """
    row = np.array(df[row_name], dtype=np.double)
    row = np.sort(row)
    thresolds = []
    midpoint = 0
    # Making thresolds
    for i in range(1, len(row)):
        midpoint = (row[i] + row[i-1]) / 2
        thresolds.append(midpoint)
    # Making thresolds
    TotalGini = 99999999 # We want to minimise the gini impurity
    BestThresold = 0
    
    for i in range(0,len(thresolds)):
        Thresold = thresolds[i]
        Column = row_name
        Leaf_True, Leaf_False = make_score(Column, Thresold)
        CalculatedGini = calc_total_gini(Leaf_True, Leaf_False)
        if CalculatedGini < TotalGini:
            TotalGini = CalculatedGini
            BestThresold = Thresold # Keeping track of thresold which gives the least gini impurity

    return BestThresold, TotalGini
Best_Thresold("length",df)

(2.996014794, 0.3405252525252525)

In [62]:
def split(dff):
    """
    Finds the best split for each column
    """
    key_columns = ["length", "weight"] 
    for column in key_columns:
        BestThresold, GiniImpurity = Best_Thresold(column,dff)
        print("Impurity: " ,GiniImpurity, "For ", column, "Best split is", BestThresold)
    
split(df)

Impurity:  0.3405252525252525 For  length Best split is 2.996014794
Impurity:  0.3234977144744514 For  weight Best split is 3.989097837


In [63]:
# Okay the best column to split is weight < 2.996014794

In [64]:
make_score("weight",3.989097837 )

({'tuna': 483, 'salmon': 86}, {'tuna': 125, 'salmon': 306})

In [65]:
# Weight > 3.989097837 ?????
# True: {'tuna': 513, 'salmon': 153 
# False: {'tuna': 95, 'salmon': 239}

In [66]:
# Split the true part using the optimal length
df_False = df[df['weight'] < 3.989097837]
# Keep all values where weight is smaller than 3.989097837...
df_True = df[df['weight'] > 3.989097837]
# Keep all values where weight is greater than 3.989097837...


In [67]:
# Now we have 2 sets where we want to find the optimal split 

![Image1](./image/Image1.PNG)  

In [68]:
split(df_True)

Impurity:  0.3405252525252525 For  length Best split is 2.9966957835
Impurity:  0.3244577464788732 For  weight Best split is 3.9989821555000002


In [69]:
Best_Thresold("length", df_True)

(2.9966957835, 0.3405252525252525)