In [4]:
import numpy as np
import pandas as pd

In [5]:
df = pd.read_csv('abalone.txt', names=['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'y'], sep='\t')
df.head(5)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,y
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [6]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

X.shape, Y.shape

((4177, 8), (4177,))

In [121]:
trainSize = round(X.shape[0] * 0.7)

XTrain = X[:trainSize]
XDev = X[trainSize:]

YTrain = Y[:trainSize]
YDev = Y[trainSize:]

XTrain.shape, XDev.shape

((2924, 8), (1253, 8))

In [129]:
class CartTree():
    
    def __init__(self, tolS=1, tolN=4):
        self._tolS = tolS
        self._tolN = tolN
    
    def binSplitDataSet(self, X, Y, feature, value):
        ltBools = X[:, feature] > value
        leftX = X[ltBools]
        leftY = Y[ltBools]
        
        stBools = X[:, feature] <= value
        rightX = X[stBools]
        rightY = Y[stBools]
        
        return leftX, leftY, rightX, rightY
    
    def regLeaf(self, Y):
        return np.mean(Y)
    
    def regErr(self, Y):
        m = Y.shape[0]
        return np.var(Y) * m
    
    def chooseBestSplit(self, X, Y, tolS=1, tolN=4):
        if len(set(Y)) == 1:
            return None, self.regLeaf(Y)
        
        m, n = X.shape
        S = self.regErr(Y)
        
        bestS = float('inf')
        bestIndex = 0
        bestValue = 0
        
        for featIndex in range(n):
            for splitVal in set(X[:, featIndex]):
                leftX, leftY, rightX, rightY = self.binSplitDataSet(X, Y, featIndex, splitVal)
                
                if leftX.shape[0] < tolS or rightX.shape[0] < tolS:
                    continue
                    
                newS = self.regErr(leftY) + self.regErr(rightY)
                if newS < bestS:
                    bestIndex = featIndex
                    bestValue = splitVal
                    bestS = newS
                    bestLeftX, bestLeftY, bestRightX, bestRightY = (leftX, leftY, rightX, rightY)
                    
            if (S - bestS) < tolS:
                return None, self.regLeaf(Y)
            
            if bestLeftX.shape[0] < tolS or bestRightX.shape[0] < tolS:
                return None, self.regLeaf(Y)
            
        return bestIndex, bestValue
    
    def createTree(self, X, Y, tolS=1, tolN=4):
        feat, val = self.chooseBestSplit(X, Y, tolS, tolN)
                
        if feat is None:
            return val
        
        regTree = {}
        regTree['spIndex'] = feat
        regTree['spValue'] = val
        
        leftX, leftY, rightX, rightY = self.binSplitDataSet(X, Y, feat, val)
        
        regTree['left'] = self.createTree(leftX, leftY, tolS, tolN)
        regTree['right'] = self.createTree(rightX, rightY, tolS, tolN)
        
        return regTree
    
    def isTree(self, obj):
        return isinstance(obj, dict)
    
    def getMean(self, tree):
        if self.isTree(tree['right']): 
            tree['right'] = self.getMean(tree['right'])
        
        if self.isTree(tree['left']):
            tree['left'] = self.getMean(tree['left'])
            
        return (tree['left'] + tree['right']) / 2.0
    
    def prune(self, tree, devX, devY):        
        if devX.shape[0] == 0:
            return self.getMean(tree)
        
        if self.isTree(tree['right']) or self.isTree(tree['left']):
            leftX, leftY, rightX, rightY = self.binSplitDataSet(devX, devY, tree['spIndex'], tree['spValue'])
            
            if self.isTree(tree['left']):
                tree['left'] = self.prune(tree['left'], leftX, leftY)
        
            if self.isTree(tree['right']):
                tree['right'] = self.prune(tree['right'], rightX, rightY)
        
        if not self.isTree(tree['left']) and not self.isTree(tree['right']):
            leftX, leftY, rightX, rightY = self.binSplitDataSet(devX, devY, tree['spIndex'], tree['spValue'])
            errorNoMerge = np.sum((leftY - tree['left']) ** 2) + np.sum((rightY - tree['right']) ** 2)
            treeMean = (tree['left'] + tree['right']) / 2.0
            errorMerge = np.sum((devY - treeMean) ** 2)
            
            if errorMerge < errorNoMerge:
                return treeMean
            else:
                return tree
            
        else:
            return tree
    
    def predictWithTree(self, tree, x):
        if not isinstance(tree, dict):
            return tree
        
        spIndex = tree['spIndex']
        spValue = tree['spValue']
        
        if x[spIndex] > spValue:
            return self.predictWithTree(tree['left'], x)
        else:
            return self.predictWithTree(tree['right'], x)
    
    def fit(self, X, Y, devX, devY):
        self.tree = self.createTree(X, Y, self._tolS, self._tolN)
        self.tree = self.prune(self.tree, devX, devY)
        
    def predict(self, X):
        Y = []
        
        for x in X:
            y = self.predictWithTree(self.tree, x)
            Y.append(y)
            
        return np.array(Y)
    
    def r2Score(self, Y, Y_pre):
        return 1 - (np.sum(np.square(Y - Y_pre)) / np.sum(np.square(Y - np.mean(Y))))

In [130]:
regTree = CartTree(tolS=0.1, tolN=1)
regTree.fit(XTrain, YTrain, XDev, YDev)

In [131]:
Y_pre = regTree.predict(XDev)

In [132]:
regTree.r2Score(YDev, Y_pre)

0.25538520832252876