In [47]:
import random
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [59]:
class Node:
    """One node of a regression tree; the whole subtree is grown on construction.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table. Rows are always addressed by position via ``x.values``.
    y : pandas.Series or numpy.ndarray
        Targets, aligned with ``x`` by row position.
    indexes : numpy.ndarray of int
        Row positions of ``x``/``y`` that belong to this node.
    min_leaf : int, default 7
        Minimum number of rows each side of a split must keep.

    Attributes
    ----------
    val : mean target of the node's rows (the prediction if the node is a leaf).
    score : best split score found so far; stays ``inf`` when no split is valid.
    var_idx, split : column position and threshold of the best split
        (only set once a valid split has been found).
    lhs, rhs : child nodes (only set on non-leaf nodes).
    """

    def __init__(self, x, y, indexes, min_leaf=7):
        self.x = x
        self.y = y
        self.indexes = indexes
        self.min_leaf = min_leaf
        self.row_count = len(indexes)
        self.col_count = x.shape[1]
        self.val = np.mean(self._targets(indexes))
        self.score = float('inf')
        self.find_varsplit()

    def _targets(self, indexes):
        """Return the targets at the given row *positions*.

        ``y[indexes]`` on a pandas Series is a label lookup, whereas features
        are read positionally through ``x.values``. Using ``iloc`` keeps both
        aligned even when ``y`` carries a shuffled or non-default index.
        """
        if hasattr(self.y, 'iloc'):
            return self.y.iloc[indexes]
        return self.y[indexes]

    def find_varsplit(self):
        """Pick the best split over all columns and recursively build children."""
        for c in range(self.col_count):
            self.find_better_split(c)

        # No column offered a split that respects min_leaf: stay a leaf.
        if self.is_leaf:
            return

        x = self.split_col
        # Positions are relative to this node's rows; map them back through
        # self.indexes to get positions in the full table.
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(self.x, self.y, self.indexes[lhs], self.min_leaf)
        self.rhs = Node(self.x, self.y, self.indexes[rhs], self.min_leaf)

    def find_better_split(self, var_idx):
        """Try every observed value of column ``var_idx`` as a threshold.

        Updates ``var_idx``, ``score`` and ``split`` whenever a candidate
        beats the best score seen so far (strictly lower wins, so the first
        of several equally good candidates is kept).
        """
        x = self.x.values[self.indexes, var_idx]

        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]
            # Reject splits that would leave either side with too few rows.
            if rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf: continue

            curr_score = self.find_score(lhs, rhs)
            if curr_score < self.score:
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]

    def find_score(self, lhs, rhs):
        """Score a split: std of each side's targets weighted by its row count.

        ``lhs`` and ``rhs`` are boolean masks over this node's rows.
        Lower is better.
        """
        y = self._targets(self.indexes)
        lhs_std = y[lhs].std()
        rhs_std = y[rhs].std()
        return lhs_std * lhs.sum() + rhs_std * rhs.sum()

    @property
    def split_col(self):
        """Values of the chosen split column for this node's rows."""
        return self.x.values[self.indexes, self.var_idx]

    @property
    def is_leaf(self):
        """True when no valid split was found (score never left ``inf``)."""
        return self.score == float('inf')

    def predict(self, x):
        """Predict one value per row of ``x`` (an iterable of feature rows)."""
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        """Walk down the tree for a single feature row and return the leaf mean."""
        if self.is_leaf: return self.val
        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)

In [60]:
class DecisionTreeRegressor:
    """Estimator-style wrapper around a tree of ``Node`` objects."""

    def fit(self, X, y, min_leaf = 5):
        """Grow the tree on every row of ``X``/``y`` and return ``self``.

        ``min_leaf`` is forwarded to the root ``Node`` as its minimum
        rows-per-side constraint.
        """
        every_row = np.arange(len(y))
        self.dtree = Node(X, y, every_row, min_leaf)
        return self

    def predict(self, X):
        """Return the fitted tree's predictions for the rows of ``X``.

        ``X`` must expose ``.values`` (e.g. a pandas DataFrame); the raw
        array is what the tree walks.
        """
        feature_matrix = X.values
        return self.dtree.predict(feature_matrix)

In [61]:
# Load the train and test CSVs (paths are relative to the notebook's working directory).
df_train = pd.read_csv("./Datasets/q3/train.csv")
df_test = pd.read_csv("./Datasets/q3/test.csv")
print(df_train.shape, df_test.shape)
print(df_train.columns, df_test.columns)

# Shuffle with a fixed seed so the split, and every metric computed from it,
# is the same after Restart & Run All.
shuffled = df_train.sample(frac=1, random_state=42)
cut = int(.75 * len(shuffled))

# Explicit positional slices instead of np.split on a DataFrame.
train = shuffled.iloc[:cut]      # first 75% of the shuffled rows
test = shuffled.iloc[cut:]       # remaining 25%
# Kept for compatibility with the original three-way unpacking; the split
# points (75%, 100%) leave no rows for it, so it is always empty.
validate = shuffled.iloc[len(shuffled):]

(1000, 81) (460, 80)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'Gar

In [62]:
# Columns fed to the model, and the subset of them that is categorical.
FEATURE_COLUMNS = ['LotArea', 'MSZoning', 'LotFrontage', 'LandContour', 'Utilities', 'BldgType',
                   'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
                   'Foundation', '1stFlrSF', '2ndFlrSF', 'GarageArea', 'WoodDeckSF', 'MoSold',
                   'YrSold']
CATEGORICAL_COLUMNS = ['MSZoning', 'LandContour', 'Utilities', 'BldgType', 'HouseStyle', 'Foundation']

# Fit on the `train` split only. `test` is carved out of df_train as well, so
# fitting on all of df_train would evaluate the tree on rows it has memorised.
# reset_index(drop=True) gives features and target a 0..n-1 index, so label
# and positional row lookups agree after the shuffle.
x_train = pd.get_dummies(
    train[FEATURE_COLUMNS].reset_index(drop=True),
    columns=CATEGORICAL_COLUMNS,  # one-hot encoded, prefixed with the column name
)
y_train = train['SalePrice'].reset_index(drop=True)

In [63]:
# Show the dtype of every column of the encoded training frame.
x_train.dtypes

LotArea                int64
LotFrontage          float64
OverallQual            int64
OverallCond            int64
YearBuilt              int64
YearRemodAdd           int64
MasVnrArea           float64
1stFlrSF               int64
2ndFlrSF               int64
GarageArea             int64
WoodDeckSF             int64
MoSold                 int64
YrSold                 int64
MSZoning_C (all)       uint8
MSZoning_FV            uint8
MSZoning_RH            uint8
MSZoning_RL            uint8
MSZoning_RM            uint8
LandContour_Bnk        uint8
LandContour_HLS        uint8
LandContour_Low        uint8
LandContour_Lvl        uint8
Utilities_AllPub       uint8
Utilities_NoSeWa       uint8
BldgType_1Fam          uint8
BldgType_2fmCon        uint8
BldgType_Duplex        uint8
BldgType_Twnhs         uint8
BldgType_TwnhsE        uint8
HouseStyle_1.5Fin      uint8
HouseStyle_1.5Unf      uint8
HouseStyle_1Story      uint8
HouseStyle_2.5Fin      uint8
HouseStyle_2.5Unf      uint8
HouseStyle_2St

In [64]:
# Grow the tree on the encoded training data; min_leaf is left at fit()'s default.
regressor = DecisionTreeRegressor().fit(x_train, y_train)



In [50]:
# Same feature selection and one-hot encoding that was applied to the training data.
FEATURE_COLUMNS = ['LotArea', 'MSZoning', 'LotFrontage', 'LandContour', 'Utilities', 'BldgType',
                   'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
                   'Foundation', '1stFlrSF', '2ndFlrSF', 'GarageArea', 'WoodDeckSF', 'MoSold',
                   'YrSold']
CATEGORICAL_COLUMNS = ['MSZoning', 'LandContour', 'Utilities', 'BldgType', 'HouseStyle', 'Foundation']

x_test = pd.get_dummies(test[FEATURE_COLUMNS], columns=CATEGORICAL_COLUMNS)
# The tree addresses features by column position, so the test frame must have
# exactly the training columns in the training order. Categories absent from
# `test` become all-zero columns; categories unseen in training are dropped.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)
y_test = test['SalePrice']

predictions = regressor.predict(x_test)

In [None]:
# R^2 of the tree's predictions against the SalePrice values held in y_test.
metrics.r2_score(y_test, predictions)

In [None]:
# Root-mean-squared error of the predictions against y_test.
sqrt(mean_squared_error(y_test, predictions))