In [70]:
import numpy as np
import pandas as pd
# from sklearn.model_selection import train_test_split

In [2]:
class Node:
    """One vertex of a binary tree.

    The constructor stores its five arguments unchanged. A node counts as a
    leaf exactly when ``value`` is not ``None``; otherwise ``feature`` and
    ``threshold`` describe a split and ``left`` / ``right`` hold the children.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description (used by internal nodes).
        self.feature, self.threshold = feature, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Stored prediction (used by leaves).
        self.value = value

    def is_leaf_node(self):
        """Return ``True`` when this node stores a value, ``False`` otherwise."""
        return not (self.value is None)

In [105]:
# Decision Tree Regressor Class
class RegressionTree:
    """Binary regression tree grown by exhaustive search for the lowest-MSE split.

    Every node considers, for every feature, the midpoints between consecutive
    distinct feature values as candidate thresholds and keeps the one whose
    two-sided mean prediction gives the smallest mean squared error. Leaves
    predict the mean target of the training rows that reached them.

    Parameters
    ----------
    n_feats : int or None
        Stored on the instance, but ``fit`` always overwrites it with the
        number of columns of the training frame (no feature sub-sampling).
    max_depth : int
        A node at this depth (root is depth 0) is always turned into a leaf.
    min_samples_split : int
        A node holding this many rows or fewer is always turned into a leaf.
    """

    def __init__(self,n_feats = None,max_depth = 15,min_samples_split = 10):
        self.root = None
        self.n_feats = n_feats
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, Y):
        """Grow the tree from DataFrame ``X`` and target ``Y`` and store it in ``self.root``.

        Also records the training column order in ``self.col`` (used by
        ``traverse_tree`` to locate a feature inside a row) and prints the
        feature count and column index.
        """
        self.n_feats = X.shape[1]
        print('N_feats: ',self.n_feats)
        self.col = list(X.columns)
        print('Columns: ',X.columns)
        self.root = self.growTree(X, Y)

    @staticmethod
    def _aligned_target(X, Y):
        """Return ``Y`` as a float ndarray ordered like the rows of ``X``.

        A Series whose index differs from ``X.index`` is re-indexed onto it
        (the same alignment a DataFrame column assignment would perform);
        anything else is taken positionally.

        Raises
        ------
        ValueError
            If the number of target values differs from the number of rows.
        """
        if isinstance(Y, pd.Series):
            if not Y.index.equals(X.index):
                Y = Y.reindex(X.index)
            y = Y.to_numpy(dtype=float)
        else:
            y = np.asarray(Y, dtype=float)
        if len(y) != len(X):
            raise ValueError(
                'X and Y must have the same number of rows '
                '(got %d and %d)' % (len(X), len(y))
            )
        return y

    def growTree(self, X, Y, depth = 0):
        """Recursively build and return the subtree for the rows in ``X``.

        Returns a leaf ``Node`` (mean of ``Y``) when ``depth`` reached
        ``max_depth``, when the node has ``min_samples_split`` rows or fewer,
        or when no candidate split lowers the node's MSE. Otherwise returns an
        internal ``Node`` whose children are grown from the rows on each side
        of the best threshold.
        """
        y = self._aligned_target(X, Y)
        n_sample = X.shape[0]
        leaf_value = np.mean(y)
        node_mse = self.get_mse(y, leaf_value)
        # Kept only because earlier versions exposed this attribute; the split
        # search below receives the value explicitly instead of reading it.
        self.mse = node_mse

        # stopping criteria
        if depth >= self.max_depth or n_sample <= self.min_samples_split:
            return Node(value=leaf_value)

        best_feat, best_thresh = self.best_criteria(X, y, node_mse)
        if best_feat is None:
            # No threshold improves on predicting the node mean (e.g. constant
            # target or constant features). Indexing the frame with a None
            # feature would raise KeyError, so stop here with a leaf.
            return Node(value=leaf_value)

        # The target travels as a separate array, so a feature column that
        # happens to be called 'y' is never overwritten.
        feature_values = X[best_feat].to_numpy()
        goes_left = feature_values <= best_thresh
        goes_right = feature_values > best_thresh
        left = self.growTree(X[goes_left], y[goes_left], depth + 1)
        right = self.growTree(X[goes_right], y[goes_right], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    # find out best criteria
    def best_criteria(self, X, Y, mse_base=None):
        """Return ``(feature, threshold)`` of the split with the lowest MSE.

        Parameters
        ----------
        X : DataFrame of candidate features.
        Y : target values for the rows of ``X``.
        mse_base : float, optional
            MSE a split has to beat. Defaults to the MSE of predicting the
            mean of ``Y`` for every row.

        Returns
        -------
        tuple
            ``(None, None)`` when no candidate is strictly better than
            ``mse_base``.
        """
        y = self._aligned_target(X, Y)
        if mse_base is None:
            mse_base = self.get_mse(y, np.mean(y))
        best_feature = None
        best_thresh = None
        for feat in X.columns:
            column = X[feat].to_numpy()
            distinct = np.unique(column)
            if distinct.size < 2:
                # A constant feature cannot separate the rows.
                continue
            # Midpoints between consecutive distinct values.
            for value in self.moving_average(distinct, 2):
                left_y = y[column <= value]
                right_y = y[column > value]
                if left_y.size == 0 or right_y.size == 0:
                    # One-sided "split": would create an empty child.
                    continue
                r = np.concatenate(
                    (left_y - np.mean(left_y), right_y - np.mean(right_y)),
                    axis=None,
                )
                mse_split = np.sum(r ** 2) / len(r)
                if mse_split < mse_base:
                    mse_base = mse_split
                    best_feature = feat
                    best_thresh = value
        return (best_feature, best_thresh)

    def get_mse(self, y_true, y_hat):
        """Return the mean of ``(y_true - y_hat) ** 2``.

        ``y_true`` may be any array-like; ``y_hat`` is a scalar or something
        broadcastable against it. An empty ``y_true`` yields ``nan``.
        """
        y_true = np.asarray(y_true, dtype=float)
        n = len(y_true)
        return np.sum((y_true - y_hat) ** 2) / n

    def moving_average(self, x, window):
        """Return the mean of every run of ``window`` consecutive items of ``x``.

        Note: ``x`` must contain at least ``window`` items; for shorter input
        ``np.convolve`` does not produce a sliding mean, so callers guard
        against that case.
        """
        return np.convolve(x, np.ones(window), 'valid') / window

    def predict(self, X):
        """Return an ndarray with one prediction per row of DataFrame ``X``.

        Rows are read positionally, so the columns of ``X`` must be in the
        same order as the frame passed to ``fit``.
        """
        rows = X.to_numpy().tolist()
        return np.array([self.traverse_tree(row, self.root) for row in rows])

    def traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for the single row ``x``; return the leaf value."""
        while node.value is None:
            index = self.col.index(node.feature)
            node = node.left if x[index] <= node.threshold else node.right
        return node.value

In [106]:
# Read 'city_day.csv' (path relative to the current working directory) into a DataFrame.
data = pd.read_csv('city_day.csv')

In [107]:
# Remove rows that contain missing values. This rebinds `data`, so the
# unfiltered frame is no longer reachable without re-running the load cell.
data = data.dropna()

In [6]:
# The columns shown in this notebook's outputs contain no 'Y'; the label used
# everywhere else is 'AQI'. Selecting 'Y' raised KeyError on a fresh kernel,
# so use the real label name to keep Restart & Run All working.
X = data.drop(columns=['AQI'])
y = data['AQI']

In [108]:
# Drop the 'City', 'Date' and 'AQI_Bucket' columns. The 'AQI' label column is
# still part of X here; train_test_split() splits it off via `label_name`.
X = data.drop(columns=['City','Date','AQI_Bucket'])


In [110]:
# Display (n_rows, n_columns) of X as the cell output.
X.shape

(6236, 13)

In [121]:
def train_test_split(features, label_name, test_size, random_state):
    """Shuffle ``features`` and split it into train/test features and labels.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the feature columns and the label column.
    label_name : str
        Name of the label column inside ``features``.
    test_size : float
        Fraction of rows, between 0 and 1, assigned to the test split
        (``int(test_size * len(features))`` rows).
    random_state : int or None
        Seed for the shuffle, so the same value reproduces the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The X frames exclude
        ``label_name``; the y objects are the corresponding label Series.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError('test_size must be between 0 and 1, got %r' % (test_size,))

    # Shuffle every row exactly once. Sampling WITHOUT replacement keeps the
    # train and test splits disjoint; `random_state` seeds the shuffle (it
    # used to be passed as `frac`, i.e. it controlled the sample size).
    shuffled = features.sample(frac=1, replace=False, random_state=random_state)

    n_test = int(test_size * len(features))
    test_part = shuffled.iloc[:n_test]
    train_part = shuffled.iloc[n_test:]

    y_train = train_part[label_name]
    y_test = test_part[label_name]
    X_train = train_part.drop(columns=[label_name])
    X_test = test_part.drop(columns=[label_name])
    return X_train, X_test, y_train, y_test

In [123]:
# Split X into train/test parts, using 'AQI' as the label column.
# test_size=0.95 puts 95% of the rows into the test split.
X_train, X_test, y_train, y_test = train_test_split(
    features=X,
    label_name='AQI',
    test_size=0.95,
    random_state=1,
)

In [124]:
# Display (n_rows, n_columns) of the training features as the cell output.
X_train.shape

(312, 12)

In [125]:
# Print the dtype of the test labels.
print(y_test.dtypes)

float64


In [126]:
# Fit the hand-written regression tree on the training split, then predict
# the held-out rows.
DRT = RegressionTree(
    max_depth=15,
    min_samples_split=20,
)
DRT.fit(X_train, y_train)
y_pred = DRT.predict(X_test)
# 11m23.9s  (author's timing note; presumably the wall-clock time of this cell)

N_feats:  12
Columns:  Index(['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
       'Benzene', 'Toluene', 'Xylene'],
      dtype='object')
mse_base:  2670.076663614164
mse_base:  1040.6567119155354
mse_base:  328.88742630748607
mse_base:  126.64467285587976
mse_base:  87.55375874125875
mse_base:  75.11525600835945
mse_base:  48.18062397372742
mse_base:  40.59206349206349
mse_base:  275.2623745191857
mse_base:  229.328146374829
mse_base:  154.8466223698782
mse_base:  115.75261324041813
mse_base:  82.66097560975611
mse_base:  65.89891774891774
mse_base:  47.81622678396872
mse_base:  41.08555399719495
mse_base:  17.22946859903381
mse_base:  166.6130681818182
mse_base:  121.37714285714287
mse_base:  91.21776315789474
mse_base:  827.6110702163335
mse_base:  450.9570614035088
mse_base:  331.6245144110276
mse_base:  338.30263157894734
mse_base:  196.32371794871796
mse_base:  152.82244071717753
mse_base:  589.1999999999999
mse_base:  2015.7789037811924
mse_base:  781.0090301003

In [127]:
# Predict the test rows again with the already-fitted tree (the same call that
# ends the fitting cell); rebinds `y_pred`.
y_pred = DRT.predict(X_test)

>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>


In [117]:
# Side-by-side table of true labels and the custom tree's predictions;
# the first rows are shown as the cell output.
finalData_1 = pd.DataFrame(
    data=dict(Actual=y_test, Predicted=y_pred),
)
finalData_1.head()

Unnamed: 0,Actual,Predicted
10557,376.0,352.736842
14982,149.0,127.142857
11403,224.0,277.0625
29049,81.0,90.083333
28912,236.0,277.0625


In [118]:
# Mean squared error of the custom tree on the test split:
# sum of squared residuals divided by the number of test rows.
n = len(y_test)
mse = np.sum((y_test - y_pred) ** 2) / n
print('Mean Squared Error :', mse)

Mean Squared Error : 978.4887572186694


In [100]:
from sklearn.tree import DecisionTreeRegressor

In [119]:
# Reference model: scikit-learn's tree with the same depth / split settings
# as the hand-written RegressionTree above.
DTR = DecisionTreeRegressor(
    max_depth=15,
    min_samples_split=20,
)
DTR.fit(X_train, y_train)

In [120]:
# Predict with the scikit-learn tree and tabulate true vs. predicted labels;
# the first rows are shown as the cell output.
y_pred_2 = DTR.predict(X_test)
finalData_2 = pd.DataFrame(
    data=dict(Actual=y_test, Predicted=y_pred_2),
)
finalData_2.head()

Unnamed: 0,Actual,Predicted
10557,376.0,352.736842
14982,149.0,127.142857
11403,224.0,277.0625
29049,81.0,90.083333
28912,236.0,277.0625
