In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
class Node:
    """One vertex of a binary decision tree.

    An internal node stores the split (``feature``, ``threshold``) and its two
    children (``left``, ``right``); a leaf stores only ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description (meaningful for internal nodes).
        self.feature, self.threshold = feature, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Prediction payload; any non-None value marks the node as a leaf.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node carries a prediction value."""
        return not (self.value is None)

In [3]:
# Decision Tree Regressor Class
class RegressionTree:
    """Regression tree that splits on the threshold minimising the MSE.

    Features are given as a pandas DataFrame; candidate thresholds for a
    feature are the midpoints between its consecutive sorted unique values.
    Leaves predict the mean target of the training rows that reached them.

    Parameters
    ----------
    max_depth : int
        A node at this depth is always turned into a leaf.
    min_samples_split : int
        A node holding this many samples or fewer is turned into a leaf.
    """

    def __init__(self,max_depth = 15,min_samples_split = 10):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, Y):
        """Grow the tree from feature frame ``X`` and targets ``Y``.

        ``Y`` is matched to the rows of ``X`` by position.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``Y`` has a different length than ``X``.
        """
        y = np.asarray(Y, dtype=float).ravel()
        if X.shape[0] == 0:
            raise ValueError("cannot fit a RegressionTree on an empty dataset")
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                "X and Y have different lengths: %d != %d" % (X.shape[0], y.shape[0])
            )
        self.n_feats = X.shape[1]
        self.col = list(X.columns)
        self.root = self.growTree(X, y)

    def growTree(self, X, Y, depth = 0):
        """Recursively build the subtree for the samples ``(X, Y)``.

        Returns a leaf ``Node`` when a stopping rule fires or when no split
        lowers the MSE; otherwise an internal ``Node`` with two children.
        """
        y = np.asarray(Y, dtype=float).ravel()
        n_sample = X.shape[0]
        # stopping criteria
        if depth >= self.max_depth or n_sample <= self.min_samples_split:
            return Node(value=np.mean(y))
        best_feat, best_thresh = self.best_criteria(X, y)
        # No threshold improves on the unsplit node (e.g. constant target or
        # constant features): stop here instead of indexing with a None feature.
        if best_feat is None:
            return Node(value=np.mean(y))
        x = X[best_feat].to_numpy()
        goes_left = x <= best_thresh
        goes_right = x > best_thresh
        left = self.growTree(X[goes_left], y[goes_left], depth+1)
        right = self.growTree(X[goes_right], y[goes_right], depth+1)
        return Node(best_feat, best_thresh, left, right)

    # find out best criteria
    def best_criteria(self, X, Y):
        """Return ``(feature, threshold)`` of the split with the lowest MSE.

        Only splits that are strictly better than leaving the node unsplit and
        that put at least one sample on each side are considered. Returns
        ``(None, None)`` when no such split exists.
        """
        y = np.asarray(Y, dtype=float).ravel()
        if y.size == 0:
            return (None, None)
        # Baseline is computed here rather than read from instance state, so
        # the method gives the right answer no matter who calls it.
        mse_base = self.get_mse(y, np.mean(y))
        best_feature = best_thresh = None
        for feat in X.columns:
            x = X[feat].to_numpy()
            for value in self.moving_average(np.unique(x), 2):
                left_y = y[x <= value]
                right_y = y[x > value]
                # A one-sided "split" separates nothing (also covers NaN thresholds).
                if left_y.size == 0 or right_y.size == 0:
                    continue
                residuals = np.concatenate(
                    (left_y - np.mean(left_y), right_y - np.mean(right_y)), axis=None
                )
                mse_split = np.sum(residuals**2) / len(residuals)
                if mse_split < mse_base:
                    mse_base = mse_split
                    best_feature = feat
                    best_thresh = value
        return (best_feature, best_thresh)

    def get_mse(self, y_true, y_hat):
        """Mean squared error between ``y_true`` and ``y_hat``.

        ``y_hat`` may be a scalar or an array-like broadcastable to ``y_true``;
        values are compared by position.
        """
        residuals = np.asarray(y_true, dtype=float) - np.asarray(y_hat, dtype=float)
        return np.sum(residuals**2) / len(residuals)

    def moving_average(self, x, window):
        """Averages of every run of ``window`` consecutive entries of ``x``.

        Returns an empty array when ``x`` has fewer than ``window`` entries.
        Without this guard ``np.convolve(..., 'valid')`` would swap its
        operands and return scaled copies of ``x`` instead of averages.
        """
        x = np.asarray(x)
        if len(x) < window:
            return np.array([])
        return np.convolve(x, np.ones(window), 'valid') / window

    def predict(self, X):
        """Predict a target for every row of DataFrame ``X``.

        Columns are selected by name in the order seen during ``fit``, so the
        caller's column order does not matter.
        """
        rows = X[self.col].to_numpy().tolist()
        return np.array([self.traverse_tree(row, self.root) for row in rows])

    def traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value.

        ``x`` is a sequence of feature values ordered like ``self.col``.
        """
        if node.value is not None:
            return node.value
        index = self.col.index(node.feature)
        if x[index] <= node.threshold:
            return self.traverse_tree(x, node.left)
        return self.traverse_tree(x, node.right)

In [4]:
import mysql.connector as mysql
from mysql.connector import Error
import sqlite3

In [5]:
# Load the preprocessed data from MySQL.
# The password is read from the AIR_QUALITY_DB_PASSWORD environment variable so
# that no credential is stored in the notebook; host and user can be overridden
# through AIR_QUALITY_DB_HOST / AIR_QUALITY_DB_USER.

connection = None
try:
    connection = mysql.connect(
        host=os.environ.get('AIR_QUALITY_DB_HOST', 'localhost'),
        database='air_quality',
        user=os.environ.get('AIR_QUALITY_DB_USER', 'root'),
        password=os.environ['AIR_QUALITY_DB_PASSWORD'],
    )
    cursor = connection.cursor()
    try:
        sql = "select * from preprocessed_air_quality"
        cursor.execute(sql)
        record = cursor.fetchall()
    finally:
        cursor.close()
    print(len(record))
except Error as e:
    print('Error',e)
    # Re-raise: later cells need `record`, so continuing would only fail there
    # with a less helpful NameError.
    raise
finally:
    # Always release the connection, including on failure.
    if connection is not None and connection.is_connected():
        connection.close()

10960


In [6]:
# Turn the fetched rows into a DataFrame and discard the surrogate ID column.
# (An earlier variant read 'IOT/city_day.csv' with pd.read_csv instead.)
COLUMN_NAMES = ['ID','PM2.5','PM10','NO','NO2','NOx','NH3','CO','O3','Toluene','AQI']
df = pd.DataFrame(record, columns=COLUMN_NAMES).drop(columns=['ID'])

In [9]:
def train_test_split(features, label_name, test_size, random_state):
    """Shuffle ``features`` and split it into train/test features and labels.

    Parameters
    ----------
    features : DataFrame holding both the feature columns and the label column.
    label_name : name of the label column.
    test_size : fraction of the rows assigned to the test set (truncated to
        a whole number of rows).
    random_state : seed passed to ``DataFrame.sample`` for the shuffle.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    n_test = int(test_size * len(features))
    shuffled = features.sample(frac=1, random_state=random_state)
    # The first n_test shuffled rows form the test set, the remainder the training set.
    test_part, train_part = shuffled[:n_test], shuffled[n_test:]
    return (
        train_part.drop(columns=[label_name]),
        test_part.drop(columns=[label_name]),
        train_part[label_name],
        test_part[label_name],
    )

In [13]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    label_name='AQI',
    test_size=0.25,
    random_state=1,
)

In [14]:
# Sanity check: (rows, feature columns) of the training split.
X_train.shape

(8220, 9)

In [15]:
# Train the hand-written regression tree.
# Note: this fit was recorded as taking about 5m0.7s; prediction happens in the next cell.
DRT = RegressionTree(min_samples_split=100, max_depth=10)
DRT.fit(X_train, y_train)

In [16]:
# Predict the held-out rows with the hand-written tree and compare with the true AQI.
y_pred = DRT.predict(X_test)
finalData_1 = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
finalData_1.head()


Unnamed: 0,Actual,Predicted
2492,120.0,117.271605
619,68.0,23.80597
9921,116.0,117.271605
9844,78.0,86.283951
8496,78.0,94.636364


In [17]:
# Mean squared error of the hand-written tree on the test split.
n = len(y_test)
squared_errors = (y_test - y_pred) ** 2
mse = np.sum(squared_errors) / n
print('Mean Squared Error :', mse)

Mean Squared Error : 771.261521293409


In [18]:
from sklearn.tree import DecisionTreeRegressor

In [19]:
# Reference model: scikit-learn's tree with the same hyper-parameters as the hand-written one.
DTR = DecisionTreeRegressor(min_samples_split=100, max_depth=10)
DTR.fit(X_train, y_train)

In [20]:
# Predict the same held-out rows with the scikit-learn tree for comparison.
y_pred_2 = DTR.predict(X_test)
finalData_2 = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred_2))
finalData_2.head()

Unnamed: 0,Actual,Predicted
2492,120.0,117.271605
619,68.0,23.80597
9921,116.0,117.271605
9844,78.0,86.283951
8496,78.0,94.636364
