In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
from sklearn.datasets import make_regression
x,y = make_regression(n_features=2,n_samples=100,random_state=2, noise=5)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=2)

In [58]:
def variance(y):
        y_mean = y.mean()
        return np.mean((y-y_mean)**2)
def varRed(S,y1,y2,y):
    return S - np.sum([variance(y1)*(len(y1)/len(y)),variance(y2)*(len(y2)/len(y))])
class TreeNode:
    def __init__(self,val=None,left=None,right=None,fea=None,thres=None) -> None:
        self.value = val
        self.left = left
        self.right = right
        self.fea = fea
        self.thres = thres
class DTree:
    def __init__(self,maxDepth):
        self.root = None
        self.max = maxDepth

    def fit(self,x,y):
        self.root = self._growTree(x,y,self.max,0)
    def _growTree(self,x,y,max,depth):
        if(len(y)==0):
            return None
        if(depth==max or len(set(y))==1):
            return TreeNode(np.mean(y))
        if(variance(y)==0):
            return TreeNode(np.mean(y))
        xo = x.T
        x1 = [[(row_idx, val) for val in row] for row_idx, row in enumerate(xo)]
        #2nd method to find info gain by splitting at each value instead of bins
        #but bins are better because splitting at each value consumes lot of computational resource and can cause overfitting
        #info = np.array(list(map(lambda z:
            #np.array(list(map(lambda k:
                #varRed(S=variance(y),y1=y[x[:,k[0]]<=k[1]],y2=y[x[:,k[0]]>k[1]],y=y) 
                #,z))),x1)))
        mids = np.array(list(map(lambda z:
           (z[1:]+z[:-1])/2
            ,xo)))
        mid = [[(row_idx, val) for val in row] for row_idx, row in enumerate(mids)]
        info = np.array(list(map(lambda z:
             np.array(list(map(lambda k:
                varRed(S=variance(y),y1=y[x[:,k[0]]<=k[1]],y2=y[x[:,k[0]]>k[1]],y=y) 
                ,z)))
            ,mid)))
        #print(info.max(axis=1))
        fea = info.max(axis=1).argmax()
        thresIndex = info.argmax(axis=1)[fea]
        thres = mids[int(fea)][int(thresIndex)]
        
        leftchild = self._growTree(x[x[:,fea]<=thres],y[x[:,fea]<=thres],max,depth+1)
        rightchild = self._growTree(x[x[:,fea]>thres],y[x[:,fea]>thres],max,depth+1)
        return TreeNode(None,leftchild,rightchild,fea,thres)
        
    def _leftchildValue(x,y,thres,fea):
        x1 = np.where(x[fea]<thres,x)
        y1 = y[np.all(x==x1,axis=1)]
        return x1,y1
    def _rightchildValue(x,y,thres,fea):
        x2 = np.where(x[fea]<thres,x)
        y2 = y[np.all(x==x2,axis=1)]
        return x2,y2
    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])
    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.fea] <= node.thres:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
        

In [59]:
dt = DTree(maxDepth=7)
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)

In [60]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error
print("R2-Score: ",r2_score(y_pred,y_test))
print("MAPE: ",100-mean_absolute_percentage_error(y_pred,y_test))

R2-Score:  0.7768602573103143
MAPE:  99.37439262648137


In [61]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
dtg = DecisionTreeRegressor(random_state=2)
params = {
    'max_depth':[2,3,5,7,9]
}
dtg = GridSearchCV(dtg, param_grid=params,cv=5, n_jobs=-1)
dtg.fit(x_train,y_train)
y_pred = dtg.best_estimator_.predict(x_test)
print("Best params: ",dtg.best_params_)
print("R2-Score: ",r2_score(y_pred,y_test))
print("MAPE: ",100-mean_absolute_percentage_error(y_pred,y_test))

Best params:  {'max_depth': 7}
R2-Score:  0.6730378805791968
MAPE:  99.37726751746726
