In [1]:
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Toy survey table: one entry per student in every column; "Liked" holds the
# class label that the split search below tries to separate.
data = {
    "Student": list(range(1, 11)),
    "Prior Experience": [
        "Yes", "No", "Yes", "No", "Yes",
        "No", "Yes", "Yes", "Yes", "Yes",
    ],
    "Course": [
        "Programming", "Programming", "History", "Programming", "English",
        "Programming", "Programming", "Mathematics", "Programming", "Programming",
    ],
    "Time": [
        "Day", "Day", "Night", "Night", "Day",
        "Day", "Day", "Night", "Night", "Night",
    ],
    "Liked": [
        "Yes", "No", "No", "Yes", "Yes",
        "No", "No", "Yes", "Yes", "No",
    ],
}

# Column order follows the insertion order of the dict keys above.
df = pd.DataFrame(data)

In [3]:
def gini_impurity(groups, classes, target="Liked"):
    """Return the size-weighted Gini impurity of a candidate split.

    For every non-empty group the impurity is ``1 - sum(p_c * p_c)`` taken
    over ``classes``, where ``p_c`` is the fraction of the group's ``target``
    values equal to class ``c``. The per-group impurities are then combined
    with weights ``len(group) / total number of rows``.

    Parameters
    ----------
    groups : iterable
        The partitions produced by a split. Each element must support
        ``len()`` and ``group[target]`` (e.g. a DataFrame).
    classes : iterable
        Class labels to score against.
    target : str, default "Liked"
        Name of the label column read from each group. The default keeps
        existing two-argument calls working unchanged.

    Returns
    -------
    float
        The weighted Gini impurity; ``0.0`` when every group is empty.
    """
    # Both arguments are traversed more than once below, so materialise them;
    # a generator would otherwise be exhausted after its first pass.
    groups = list(groups)
    classes = list(classes)

    total_samples = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            # An empty partition carries no weight (and would divide by zero).
            continue
        # Convert the label column once per group instead of once per class.
        labels = list(group[target])
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        gini += (1 - score) * (size / total_samples)
    return gini

In [5]:
# Get possible splits for a categorical column
def get_splits(df, column):
    """Return the distinct, non-missing values of ``df[column]``.

    Each returned value is a candidate for an equality split
    (``df[column] == value``). Missing entries (NaN/None) are dropped first:
    an equality comparison against a missing value is never true, so such a
    candidate would always yield an empty partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``column``.
    column : str
        Name of the column to enumerate.

    Returns
    -------
    Array of unique values, in order of first appearance.
    """
    return df[column].dropna().unique()

In [6]:
# Evaluate each split and find the best
def best_split(df):
    """Find the equality split with the lowest weighted Gini impurity.

    Every column except "Liked" (the label) and "Student" is treated as a
    feature. For each candidate value from ``get_splits`` the rows are
    partitioned into ``df[column] == value`` and ``df[column] != value`` and
    scored with ``gini_impurity``.

    Candidates that leave one side empty (for example a column holding a
    single value) are skipped: they do not partition the rows, so reporting
    one as the "best split" would be misleading.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Liked" column; a "Student" column, if present, is
        ignored.

    Returns
    -------
    tuple
        ``(best_column, best_value, best_gini)``. Ties keep the first
        candidate encountered. If no valid split exists the result is
        ``(None, None, 1.0)``.
    """
    classes = df["Liked"].unique()
    best_gini = 1.0
    best_column = None
    best_value = None

    for column in df.columns:
        # Neither the label nor the "Student" column is a candidate feature.
        if column == "Liked" or column == "Student":
            continue
        for value in get_splits(df, column):
            left = df[df[column] == value]
            right = df[df[column] != value]
            # A one-sided "split" separates nothing; do not score it.
            if len(left) == 0 or len(right) == 0:
                continue
            gini = gini_impurity([left, right], classes)
            # Strict comparison: an equally good later candidate does not
            # displace the one already found.
            if gini < best_gini:
                best_gini = gini
                best_column = column
                best_value = value

    return best_column, best_value, best_gini

In [7]:
# Run the split search on the full table and report the winning root split.
root_column, root_value, root_gini = best_split(df)
print(
    f"Root Node Column: {root_column}",
    f"Split Value: {root_value}",
    f"Gini Impurity of this split: {root_gini:.3f}",
    sep="\n",
)

Root Node Column: Course
Split Value: History
Gini Impurity of this split: 0.444
