In [1]:
import pandas as pd
import numpy as np

In [2]:
def gini_impurity(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

    Args:
        y: 1-D array-like of class labels (list, NumPy array or pandas
            Series). Labels may be any values ``np.unique`` can handle,
            e.g. strings or negative integers, not only non-negative ints.

    Returns:
        The impurity as a float: 0.0 when every label is identical, and
        0.0 for empty input (an empty node carries no label uncertainty).
    """
    labels = np.asarray(y)
    if len(labels) == 0:
        # Nothing to be impure about; also avoids dividing by zero below.
        return 0.0

    # np.unique counts only the classes that actually occur, so it is not
    # restricted to non-negative integer labels the way np.bincount is.
    _, class_counts = np.unique(labels, return_counts=True)
    prob = class_counts / len(labels)
    return 1 - np.sum(prob ** 2)

In [3]:
# Load the Social Network Ads data. The absolute /content/ path presumably
# targets a Colab session — adjust it when running elsewhere.
dataset = pd.read_csv('/content/Social_Network_Ads.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [5]:
# Encode Gender as integers (Male -> 0, Female -> 1).
# Series.replace() relied on silently downcasting the object column to int,
# which pandas has deprecated; mapping explicitly and casting keeps the column
# integer-typed without the FutureWarning. Values that are already encoded
# pass through unchanged, so re-running this cell is harmless; any other
# unexpected value makes astype(int) raise instead of slipping through.
gender_codes = {'Male': 0, 'Female': 1}
dataset['Gender'] = dataset['Gender'].map(lambda g: gender_codes.get(g, g)).astype(int)

  dataset['Gender'] = dataset['Gender'].replace({'Male': 0, 'Female': 1})


In [6]:
# Impurity of the Gender column itself, i.e. how evenly its two codes are
# represented. Gender was integer-encoded in the previous cell.
gender = dataset['Gender']

gini = gini_impurity(gender)
print(f'Gini Impurity: {gini}')

Gini Impurity: 0.4998


In [None]:
# Impurity of the target column. When the tree is later trained with
# y = Purchased, this is the parent impurity that gini_gain compares splits
# against at the root.
purchased = dataset['Purchased']

gini = gini_impurity(purchased)
print(f'Gini Impurity: {gini}')

Gini Impurity: 0.45938750000000006


In [7]:
def assign_age_group(age):
    """Map an age to an ordinal age-group code.

    Groups: 1 = 18-30, 2 = 31-40, 3 = 41-50, 4 = 51-60, and 0 for anything
    outside 18-60 (including NaN).

    The bins are contiguous, so a non-integer age that falls between two
    integer boundaries (e.g. 30.5) is placed in the next group instead of
    dropping into the "out of range" code 0. Integer ages are classified
    exactly as the listed ranges.

    Args:
        age: Age as an int or float.

    Returns:
        int: Group code in ``{0, 1, 2, 3, 4}``.
    """
    # Written as a negated range check so NaN (for which every comparison is
    # False) lands in group 0 rather than falling through to the last bin.
    if not (18 <= age <= 60):
        return 0
    if age <= 30:
        return 1
    if age <= 40:
        return 2
    if age <= 50:
        return 3
    return 4

# Add an AgeGroup column holding the code assign_age_group returns for each Age.
dataset['AgeGroup'] = dataset['Age'].apply(assign_age_group)

In [11]:
# The raw Age column is superseded by AgeGroup. Reassigning (instead of
# inplace=True) with errors='ignore' means re-running this cell after the
# column is already gone no longer raises KeyError.
dataset = dataset.drop(columns=['Age'], errors='ignore')
dataset.head()

Unnamed: 0,User ID,Gender,EstimatedSalary,Purchased,AgeGroup
0,15624510,0,19000,0,1
1,15810944,0,20000,0,2
2,15668575,1,43000,0,1
3,15603246,1,57000,0,1
4,15804002,0,76000,0,1


In [9]:
# Impurity of the AgeGroup column itself, i.e. how evenly the rows are spread
# over the age-group codes.
age = dataset['AgeGroup']

gini = gini_impurity(age)
print(f'Gini Impurity: {gini}')

Gini Impurity: 0.7219375


In [12]:
def assign_estimatedsalary_group(salary):
    """Map an estimated salary to an ordinal salary-group code.

    Groups: 1 = 15000-45000, 2 = 45001-75000, 3 = 75001-110000,
    4 = 110001-150000, and 0 for anything outside 15000-150000
    (including NaN).

    The bins are contiguous, so a non-integer salary that falls between two
    integer boundaries (e.g. 45000.5) is placed in the next group instead of
    dropping into the "out of range" code 0. Integer salaries are classified
    exactly as the listed ranges.

    Args:
        salary: Salary as an int or float.

    Returns:
        int: Group code in ``{0, 1, 2, 3, 4}``.
    """
    # Written as a negated range check so NaN (for which every comparison is
    # False) lands in group 0 rather than falling through to the last bin.
    if not (15000 <= salary <= 150000):
        return 0
    if salary <= 45000:
        return 1
    if salary <= 75000:
        return 2
    if salary <= 110000:
        return 3
    return 4

# Add an EstimatedSalaryGroup column holding the code
# assign_estimatedsalary_group returns for each EstimatedSalary.
dataset['EstimatedSalaryGroup'] = dataset['EstimatedSalary'].apply(assign_estimatedsalary_group)

In [13]:
# The raw EstimatedSalary column is superseded by EstimatedSalaryGroup.
# Reassigning (instead of inplace=True) with errors='ignore' means re-running
# this cell after the column is already gone no longer raises KeyError.
dataset = dataset.drop(columns=['EstimatedSalary'], errors='ignore')
dataset.head()

Unnamed: 0,User ID,Gender,Purchased,AgeGroup,EstimatedSalaryGroup
0,15624510,0,0,1,1
1,15810944,0,0,2,1
2,15668575,1,0,1,1
3,15603246,1,0,1,2
4,15804002,0,0,1,3


In [14]:
# Impurity of the EstimatedSalaryGroup column itself, i.e. how evenly the rows
# are spread over the salary-group codes.
estimatedsalary = dataset['EstimatedSalaryGroup']

gini = gini_impurity(estimatedsalary)
print(f'Gini Impurity: {gini}')

Gini Impurity: 0.7309125


In [15]:
# User ID is not needed for prediction, so it is dropped to keep it from being
# picked up as a feature. Reassigning (instead of inplace=True) with
# errors='ignore' means re-running this cell no longer raises KeyError.
dataset = dataset.drop(columns=['User ID'], errors='ignore')
dataset.head()

Unnamed: 0,Gender,Purchased,AgeGroup,EstimatedSalaryGroup
0,0,0,1,1
1,0,0,2,1
2,1,0,1,1
3,1,0,1,2
4,0,0,1,3


In [23]:
def gini_gain(X, y, feature):
    """Gini gain obtained by splitting ``y`` on every distinct value of ``feature``.

    Args:
        X: DataFrame containing the column ``feature``.
        y: Labels, indexable by a boolean mask built from ``X[feature]``.
        feature: Name of the column to split on.

    Returns:
        Impurity of ``y`` minus the size-weighted impurity of the per-value
        subsets of ``y``.
    """
    impurity_before = gini_impurity(y)

    def branch_cost(value):
        # Labels of the rows whose feature equals `value`, weighted by the
        # share of all rows that fall into this branch.
        branch_y = y[X[feature] == value]
        return (len(branch_y) / len(y)) * gini_impurity(branch_y)

    impurity_after = sum(branch_cost(value) for value in np.unique(X[feature]))
    return impurity_before - impurity_after


In [25]:
def best_split(X, y):
    """Return the column of ``X`` with the highest Gini gain on ``y``.

    Ties keep the earliest column. Returns ``None`` when ``X`` has no
    columns (or no column's gain exceeds the initial threshold of -1).
    """
    winner, top_gain = None, -1
    scored = ((column, gini_gain(X, y, column)) for column in X.columns)
    for column, gain in scored:
        # Strict comparison: a later column must beat, not match, the leader.
        if gain > top_gain:
            winner, top_gain = column, gain
    return winner

In [26]:
class DecisionTreeGini:
    """Multiway decision tree for categorical features, split by Gini gain.

    Each internal node splits on one feature and has one child per distinct
    value of that feature seen during ``fit``; a leaf stores its class label
    in ``prediction``.
    """

    def __init__(self, depth=0, max_depth=3):
        """Create an unfitted node.

        Args:
            depth: Depth of this node in the tree (0 for the root).
            max_depth: Depth at which a node is forced to become a leaf.
        """
        self.depth = depth
        self.max_depth = max_depth
        self.feature = None         # column this node splits on (None for a leaf)
        self.children = {}          # feature value -> child DecisionTreeGini
        self.prediction = None      # class label when this node is a leaf
        self.majority_class = None  # most frequent label this node saw in fit

    def fit(self, X, y):
        """Grow the subtree rooted at this node.

        Args:
            X: DataFrame of categorical feature columns.
            y: pandas Series of class labels aligned with ``X``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit DecisionTreeGini on empty data")

        # Reset so that calling fit again does not keep stale children.
        self.feature = None
        self.children = {}
        self.prediction = None
        # Remembered on every node so predict can fall back to it when a
        # sample carries a feature value that was never seen at this node.
        self.majority_class = y.mode()[0]

        # Pure node: every sample has the same label.
        if len(np.unique(y)) == 1:
            self.prediction = y.iloc[0]
            return

        if self.depth >= self.max_depth:
            self.prediction = self.majority_class
            return

        self.feature = best_split(X, y)
        if self.feature is None:
            # No feature columns left to split on.
            self.prediction = self.majority_class
            return

        column = X[self.feature]
        for value in np.unique(column):
            mask = column == value
            if not mask.any():
                # e.g. NaN never compares equal to itself; skip the empty branch.
                continue
            child = DecisionTreeGini(depth=self.depth + 1, max_depth=self.max_depth)
            # The chosen feature is removed so it cannot be reused further down.
            child.fit(X[mask].drop(columns=[self.feature]), y[mask])
            self.children[value] = child

    def predict(self, x):
        """Predict the class label of a single sample.

        Args:
            x: Mapping (dict or Series) from feature name to value.

        Returns:
            The label of the leaf reached. If the sample has a feature value
            with no matching branch, the majority class of the node where the
            walk stopped is returned instead of a hard-coded label.
        """
        if self.prediction is not None:
            return self.prediction

        child = self.children.get(x[self.feature])
        if child is not None:
            return child.predict(x)
        return self.majority_class


# Features are every remaining column except the target; the target is Purchased.
X = dataset.drop(columns=['Purchased'])
y = dataset['Purchased']

# Train on the full dataset with the default max_depth (no hold-out split is made here).
tree = DecisionTreeGini()
tree.fit(X, y)


In [32]:
# One hand-built sample; predict looks values up by feature name, so the keys
# must match the column names the tree was fitted on.
test_sample = {'Gender': 0, 'AgeGroup': 1, 'EstimatedSalaryGroup': 1}
prediction = tree.predict(test_sample)
print("Prediction:", prediction)

Prediction: 0


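For this test sample, the decision tree returned the correct result (0, i.e. not purchased). Note that the sample matches the first row of the training data, so this single check shows the tree reproduces a training example; it does not by itself show that the tree predicts well on unseen data.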