In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [96]:
from sklearn.datasets import make_regression
x,y = make_regression(n_features=2,n_samples=100,random_state=2, noise=5)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=2)

In [97]:
def variance(y):
        y_mean = y.mean()
        return np.mean((y-y_mean)**2)
def varRed(S,y1,y2,y):
    return S - np.sum([variance(y1)*(len(y1)/len(y)),variance(y2)*(len(y2)/len(y))])
class TreeNode:
    def __init__(self,val=None,left=None,right=None,fea=None,thres=None) -> None:
        self.value = val
        self.left = left
        self.right = right
        self.fea = fea
        self.thres = thres
class DTree:
    def __init__(self,maxDepth):
        self.root = None
        self.max = maxDepth

    def fit(self,x,y):
        self.root = self._growTree(x,y,self.max,0)
    def _growTree(self,x,y,maxdp,depth):
        if(len(y)==0):
            return None
        if(depth==maxdp or len(set(y))==1 or len(y)<5):
            return TreeNode(np.mean(y))
        if(variance(y)==0):
            return TreeNode(np.mean(y))
        xo = x.T
        mids = np.array(list(map(lambda z:
           (z[1:]+z[:-1])/2
            ,xo)))
        for row in mids:
            row[row == np.max(row)] -= 1e-6
            row[row == np.min(row)] +=1e-6
        mid = [[(row_idx, val) for val in row] for row_idx, row in enumerate(mids)]
        info = np.array(list(map(lambda z:
             np.array(list(map(lambda k:
                varRed(S=variance(y),y1=y[x[:,k[0]]<=k[1]],y2=y[x[:,k[0]]>k[1]],y=y) 
                ,z)))
            ,mid)))
        fea = info.max(axis=1).argmax()
        thresIndex = info.argmax(axis=1)[fea]
        thres = mids[int(fea)][int(thresIndex)]
        if(thres==np.array(mids[int(fea)]).max()):
            thres= thres -1e-6
        if(thres==np.array(mids[int(fea)]).min()):
            thres+=1e-6
        leftchild = self._growTree(x[x[:,fea]<=thres],y[x[:,fea]<=thres],maxdp,depth+1)
        rightchild = self._growTree(x[x[:,fea]>thres],y[x[:,fea]>thres],maxdp,depth+1)
        return TreeNode(None,leftchild,rightchild,fea,thres)
        
    def _leftchildValue(x,y,thres,fea):
        x1 = np.where(x[fea]<thres,x)
        y1 = y[np.all(x==x1,axis=1)]
        return x1,y1
    def _rightchildValue(x,y,thres,fea):
        x2 = np.where(x[fea]<thres,x)
        y2 = y[np.all(x==x2,axis=1)]
        return x2,y2
    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])
    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.fea] <= node.thres:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

In [98]:
n_samples = len(x_train)
r = np.random.choice(np.arange(n_samples),size=(1,n_samples))

In [99]:
dt = DTree(2)
dt.fit(x_train[r[0]],y_train[r[0]])
y_pred = dt.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_pred,y_test)

0.6871917934659522

In [100]:
class RandomForest:
    def __init__(self,n,maxdp):
        self.n = n
        self.maxdp = maxdp
    def fit(self,x_train,y_train):
        n_samples = len(x_train)
        r = np.random.choice(np.arange(n_samples),size=(self.n,n_samples))
        self.dt = []
        for i in range(self.n):
            Dtree = DTree(maxDepth=self.maxdp)
            Dtree.fit(x_train[r[i]],y_train[r[i]])
            self.dt.append(Dtree)
    def predict(self,x_test):
        self.y_pred = []
        for i in range(self.n):
            self.y_pred.append(self.dt[i].predict(x_test))
        return np.array(self.y_pred).mean(axis=0)

In [109]:
rf = RandomForest(n=20,maxdp=7)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

In [110]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error
print("R2-Score: ",r2_score(y_pred,y_test))
print("MAPE: ",100-mean_absolute_percentage_error(y_pred,y_test))

R2-Score:  0.8971386161598611
MAPE:  99.65128624184769


In [112]:
from sklearn.ensemble import RandomForestRegressor
rfg = RandomForestRegressor(n_estimators=20,max_depth=7,random_state=2,n_jobs=-1,oob_score=True)
rfg.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print("OOB score: ",rfg.oob_score_)
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"Mean Squared Error: {100-mean_absolute_percentage_error(y_test, y_pred)}")

OOB score:  0.9217440351532646
R^2 Score: 0.850422412188348
Mean Squared Error: 99.6039658144269


In [117]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rfg = RandomForestRegressor(random_state=2,n_jobs=-1)
params = {
    'max_depth':[2,3,4,7,9,11],
    'n_estimators':[5,10,15,20],
    'oob_score':[True],
}
rfg = GridSearchCV(rfg,param_grid=params,n_jobs=-1)
rfg.fit(x_train,y_train)
y_pred = rfg.best_estimator_.predict(x_test)
print("Best params: ",rfg.best_params_)
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"Mean Squared Error: {100-mean_absolute_percentage_error(y_test, y_pred)}")
print("OOB score: ",rfg.best_estimator_.oob_score_)


Best params:  {'max_depth': 7, 'n_estimators': 20, 'oob_score': True}
R^2 Score: 0.9060122501341771
Mean Squared Error: 99.66539895027516
OOB score:  0.9217440351532646
