### Implement the ClassifyingTree for a two-dimensional dataframe of shape (n, 2), where n is the number of observations.

In [189]:
import numpy as np

class DecisionTree:
    """Binary decision tree over a single feature, split by Gini impurity.

    Every dataframe handled by this class is a 2-D array whose column 0 is
    the feature used for thresholding and whose column 1 is the class label.

    Nodes are addressed by a ``(depth, index)`` tuple: the root is ``(1, 1)``
    and the children of ``(d, i)`` are ``(d + 1, 2 * i - 1)`` (left) and
    ``(d + 1, 2 * i)`` (right).

    Attributes:
        n: Leaf-size threshold; a node holding ``n`` rows or fewer is not
            split any further.
        df: The dataframe handed to the constructor. It is only stored here;
            ``Train`` and ``Classify`` work on the dataframe passed to them.
        attributes_values: Maps a node location to the threshold chosen for
            that node by ``Train``.
    """

    def __init__(self, n: int, df: np.ndarray):
        self.n = n
        self.df = df
        self.attributes_values = {}

    def _is_leaf(self, df: np.ndarray) -> bool:
        """Return True when ``df`` must not be split any further.

        ``Train`` and ``Classify`` have to agree on this test, otherwise
        ``Classify`` would look up thresholds that ``Train`` never stored.
        """
        return df.shape[0] <= self.n or len(np.unique(df[:, 0])) == 1

    def GiniCoeff(self, df: np.ndarray) -> float:
        """Return the Gini impurity of the labels (column 1) of ``df``.

        Computed as ``1 - sum(p_k ** 2)`` where ``p_k`` is the share of rows
        carrying class ``k``.
        """
        class_counts = np.unique(df[:, 1], return_counts=True)[1]
        sum_of_squared_probab = np.sum((class_counts / df.shape[0]) ** 2)
        return 1 - sum_of_squared_probab

    def LeftSplit(self, df: np.ndarray, a: float) -> np.ndarray:
        """Return the rows of ``df`` whose feature (column 0) is ``<= a``."""
        return df[df[:, 0] <= a, :]

    def RightSplit(self, df: np.ndarray, a: float) -> np.ndarray:
        """Return the rows of ``df`` whose feature (column 0) is ``> a``."""
        return df[df[:, 0] > a, :]

    def ComputeGeneralGiniCoeff(self, df: np.ndarray, a: float) -> float:
        """Return the size-weighted Gini impurity of splitting ``df`` at ``a``.

        The impurities of the left (``<= a``) and right (``> a``) partitions
        are averaged, each weighted by its number of rows.
        """
        df_left = self.LeftSplit(df, a)
        df_right = self.RightSplit(df, a)
        n_left = df_left.shape[0]
        n_right = df_right.shape[0]

        gini_left = self.GiniCoeff(df_left)
        gini_right = self.GiniCoeff(df_right)

        return (n_left * gini_left + n_right * gini_right) / (n_left + n_right)

    def Train(self, df: np.ndarray, loc: tuple[int, int] = (1, 1)):
        """Recursively pick split thresholds for ``df`` and store them.

        For every non-leaf node the threshold with the smallest weighted Gini
        impurity is stored in ``self.attributes_values[loc]``; on ties the
        smallest threshold wins. The two partitions are then trained as the
        left and right child nodes.

        Args:
            df: Rows reaching the node at ``loc``.
            loc: ``(depth, index)`` address of the node being trained.
        """
        if self._is_leaf(df):
            return

        # np.unique returns sorted values. The largest one is left out on
        # purpose: splitting there sends every row to the left and none to the
        # right, so it never separates anything, and if rounding ever made it
        # the "best" candidate the recursion below would never terminate.
        # A non-leaf node has at least two distinct values, so at least one
        # candidate always remains.
        candidates = np.unique(df[:, 0])[:-1]

        best_threshold = None
        best_gini = None
        for a in candidates:
            gini = self.ComputeGeneralGiniCoeff(df, a)
            # Strict comparison keeps the first (smallest) threshold on ties.
            if best_gini is None or gini < best_gini:
                best_threshold, best_gini = a, gini

        self.attributes_values[loc] = best_threshold

        depth, index = loc
        self.Train(self.LeftSplit(df, best_threshold), (depth + 1, 2 * index - 1))
        self.Train(self.RightSplit(df, best_threshold), (depth + 1, 2 * index))

    def Classify(self, df, record: int, loc: tuple[int, int] = (1, 1)):
        """Return the prediction for feature value ``record``.

        ``df`` is split again along the thresholds stored by ``Train`` until a
        leaf is reached, so it should be the dataframe the tree was trained
        on; a node without a stored threshold raises ``KeyError``.

        Args:
            df: Rows reaching the node at ``loc``.
            record: Feature value to classify.
            loc: ``(depth, index)`` address of the current node.

        Returns:
            The mean of the labels (column 1) of the rows in the reached leaf,
            or ``np.nan`` when that leaf holds no rows.
        """
        if self._is_leaf(df):
            if df.shape[0]:
                return np.mean(df[:, 1])
            return np.nan

        threshold = self.attributes_values[loc]
        depth, index = loc
        if record <= threshold:
            return self.Classify(
                self.LeftSplit(df, threshold), record, loc=(depth + 1, 2 * index - 1)
            )
        return self.Classify(
            self.RightSplit(df, threshold), record, loc=(depth + 1, 2 * index)
        )

            









41.0
{(1, 1): 10, (2, 1): 1, (3, 1): 0, (3, 2): 6, (4, 3): 2, (5, 6): 3, (6, 12): 5, (7, 23): 4, (4, 4): 9, (5, 7): 7, (2, 2): 76, (3, 3): 52, (4, 5): 50, (5, 9): 32, (6, 17): 26, (7, 33): 24, (8, 65): 22, (9, 129): 19, (10, 257): 11, (11, 514): 16, (12, 1027): 15, (13, 2053): 12, (14, 4106): 13, (15, 8212): 14, (12, 1028): 17, (13, 2056): 18, (10, 258): 20, (9, 130): 23, (8, 66): 25, (7, 34): 29, (8, 67): 27, (9, 134): 28, (8, 68): 30, (9, 136): 31, (6, 18): 37, (7, 35): 34, (8, 69): 33, (8, 70): 36, (9, 139): 35, (7, 36): 48, (8, 71): 47, (9, 141): 42, (10, 281): 40, (11, 561): 38, (12, 1122): 39, (11, 562): 41, (10, 282): 45, (11, 563): 43, (12, 1126): 44, (11, 564): 46, (8, 72): 49, (5, 10): 51, (4, 6): 56, (5, 11): 54, (6, 21): 53, (6, 22): 55, (5, 12): 63, (6, 23): 60, (7, 45): 57, (8, 90): 58, (9, 180): 59, (7, 46): 61, (8, 92): 62, (6, 24): 65, (7, 47): 64, (7, 48): 74, (8, 95): 73, (9, 189): 66, (10, 378): 68, (11, 755): 67, (11, 756): 72, (12, 1511): 69, (13, 3022): 70, (14, 