In [1]:
import numpy as np
from numpy.typing import NDArray

In [2]:
class Node:
    """One node of the decision tree.

    An internal node stores a split: samples whose ``feature`` column is
    strictly less than ``value`` follow ``left``, the rest follow ``right``.
    A leaf stores the predicted label in ``answer``.
    """

    def __init__(self,value: np.float64,feature: int,answer: int = -1):
        # Split description (threshold, column index) and the label payload.
        self.value, self.feature, self.answer = value, feature, answer
        # Children are attached later, while the tree is being grown.
        self.left = self.right = None
        

In [3]:
class DecisionTree:
    """Binary classification tree grown greedily with the Gini impurity.

    Training data is a 2-D array whose last column holds the class label and
    whose remaining columns are numeric features. Every internal node tests
    ``x[feature] < value``: rows passing the test go left, the others go right.
    """

    def __init__(self, depth: int, numThresholds: int = 50):
        """Create an untrained tree.

        Args:
            depth: Maximum number of splits on any root-to-leaf path.
            numThresholds: Number of evenly spaced candidate thresholds tried
                per feature, between that feature's minimum and maximum.
        """
        self.depth = depth
        self.numThresholds = numThresholds
        # Placeholder root; fit() fills it in. Its answer of -1 (not NaN)
        # makes predict() return -1 when called before fit().
        self.node = Node(np.nan, -1, -1)

    def GiniCriteria(self, labels: NDArray[np.int32]):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a label array."""
        _, counts = np.unique(labels, return_counts=True)
        return 1 - np.sum((counts / labels.shape[0]) ** 2)

    def lossFunction(self, fPartData: NDArray[np.float64], sPartData: NDArray[np.float64]):
        """Return the size-weighted Gini impurity of the two parts of a split.

        Both arguments are row subsets of the training array, so their last
        column is the label column.
        """
        numberOfExamples = fPartData.shape[0] + sPartData.shape[0]
        return fPartData.shape[0] / numberOfExamples * self.GiniCriteria(fPartData[:, -1]) + \
            sPartData.shape[0] / numberOfExamples * self.GiniCriteria(sPartData[:, -1])

    def __makeLeaf(self, node, data):
        """Turn ``node`` into a leaf predicting the most frequent label in ``data``."""
        labels, counts = np.unique(data[:, -1], return_counts=True)
        node.value = np.nan
        node.feature = -1
        node.answer = labels[np.argmax(counts)]
        node.left = None
        node.right = None

    def __bestSplit(self, data):
        """Return ``(feature, value)`` of the lowest-loss split, or ``None``.

        Candidates that would leave one side without rows are skipped; they
        separate nothing and recursing into the empty side would fail.
        ``None`` means no candidate separates the rows (for example when all
        feature values are identical). Ties keep the first candidate found.
        """
        minLoss = np.inf
        best = None
        for feature in range(data.shape[1] - 1):
            column = data[:, feature]
            candidates = np.linspace(np.min(column), np.max(column),
                                     num=self.numThresholds, dtype=np.float64)
            for value in candidates:
                fPartData = data[column < value]
                sPartData = data[column >= value]
                if fPartData.shape[0] == 0 or sPartData.shape[0] == 0:
                    continue
                loss = self.lossFunction(fPartData, sPartData)
                if loss < minLoss:
                    minLoss = loss
                    best = (feature, value)
        return best

    def __fitRecursion(self, node: "Node", data: NDArray[np.float64], depth):
        """Grow the subtree rooted at ``node`` from ``data`` with ``depth`` splits left."""
        # Stop when the depth budget is used up or every row has the same label.
        if depth <= 0 or np.all(data[:, -1] == data[0, -1]):
            self.__makeLeaf(node, data)
            return

        split = self.__bestSplit(data)
        if split is None:
            # Labels differ but no threshold separates the rows.
            self.__makeLeaf(node, data)
            return

        node.feature, node.value = split
        # NaN marks an internal node; predict() keeps descending while it sees NaN.
        node.answer = np.nan
        node.left = Node(np.nan, -1, -1)
        node.right = Node(np.nan, -1, -1)
        goesLeft = data[:, node.feature] < node.value
        self.__fitRecursion(node.left, data[goesLeft], depth - 1)
        self.__fitRecursion(node.right, data[data[:, node.feature] >= node.value], depth - 1)

    def fit(self, data: NDArray[np.float64]):
        """Train the tree on ``data`` (features in all columns but the last, label last).

        Raises:
            ValueError: If ``data`` has no rows.
        """
        if data.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty data set")
        self.__fitRecursion(self.node, data, self.depth)

    def predict(self, X: NDArray[np.float64]) -> int:
        """Return the label of the leaf reached by the single sample ``X``.

        ``X`` is one feature vector indexed the same way as the training
        columns. The label is returned as stored in the training array, so it
        is a float when the training data was a float array.
        """
        current = self.node
        while np.isnan(current.answer):
            if X[current.feature] < current.value:
                current = current.left
            else:
                current = current.right
        return current.answer
            

In [4]:
from sklearn.datasets import load_iris
import pandas as pd

In [5]:
# Load iris and gather the four measurements plus the integer class code in one frame.
iris = load_iris()
X, y = iris.data, iris.target
df = pd.DataFrame(X, columns=iris.feature_names).assign(species=y)

print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  


In [6]:
# Summary statistics for every column, including the numeric species code.
df.describe()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [7]:
# Keep only the petal measurements and the label. errors='ignore' makes the
# cell safe to re-run once the sepal columns have already been removed.
df = df.drop(columns=['sepal length (cm)','sepal width (cm)'], errors='ignore')

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows, stratified on the species label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['species'],
)

In [10]:
# Tree allowed at most three splits along any path.
dt = DecisionTree(depth=3)

In [11]:
# Show the training frame and the raw array form that is handed to the tree.
print(train_df)
print(train_df.to_numpy())

     petal length (cm)  petal width (cm)  species
8                  1.4               0.2        0
106                4.5               1.7        2
76                 4.8               1.4        1
9                  1.5               0.1        0
89                 4.0               1.3        1
..                 ...               ...      ...
37                 1.4               0.1        0
2                  1.3               0.2        0
33                 1.4               0.2        0
52                 4.9               1.5        1
3                  1.5               0.2        0

[120 rows x 3 columns]
[[1.4 0.2 0. ]
 [4.5 1.7 2. ]
 [4.8 1.4 1. ]
 [1.5 0.1 0. ]
 [4.  1.3 1. ]
 [5.  1.9 2. ]
 [4.2 1.3 1. ]
 [5.1 1.5 2. ]
 [6.1 2.3 2. ]
 [6.7 2.2 2. ]
 [6.6 2.1 2. ]
 [4.5 1.5 1. ]
 [3.5 1.  1. ]
 [4.1 1.  1. ]
 [4.  1.2 1. ]
 [1.6 0.2 0. ]
 [1.5 0.3 0. ]
 [5.8 1.8 2. ]
 [5.5 1.8 2. ]
 [1.5 0.4 0. ]
 [4.9 1.5 1. ]
 [1.6 0.2 0. ]
 [6.4 2.  2. ]
 [1.6 0.2 0. ]
 [3.7 1.  1. ]
 

In [12]:
# Train on the array form of the frame; its last column (species) is the label.
dt.fit(data=train_df.to_numpy())

In [13]:
# Count the test rows per class; the label sits in the last column.
val, counts = np.unique(test_df.to_numpy()[:, -1], return_counts=True)
print(counts)

[10 10 10]


In [14]:
# Predict one row at a time, passing every column except the trailing label,
# so the slice stays correct if the set of feature columns changes.
y_pred = np.array([dt.predict(row) for row in test_df.to_numpy()[:, :-1]])

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(test_df.to_numpy()[:, -1], y_pred)

In [49]:
# Display the accuracy measured on the held-out test rows.
accuracy

0.9666666666666667