In [65]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

In [66]:
def mean_squared_error(y_test, y_pred):
    y_test = y_test.to_numpy()
    # y_pred = y_pred.to_numpy()
    # print(y_test)
    # print(y_pred.shape[0])
    result = 0
    n = y_pred.shape[0]
    for i in range(n):
        result += math.pow((y_test[i] - y_pred[i][0]), 2)
    result /= n
    return result

In [67]:
def r2_score(y_test, y_pred):
    y_test = y_test.to_numpy()
    y_test_avg = np.mean(y_test)
    result = 0
    n = y_pred.shape[0]
    numerator = 0
    denominator = 0
    for i in range(n):
        numerator += math.pow((y_test[i] - y_pred[i][0]), 2)
        denominator += math.pow((y_test[i] - y_test_avg), 2)
    result = 1 - (numerator / denominator)
    return result

In [68]:
class Node:
    
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf_node(self):
        return self.value is not None

In [69]:
# Decision Tree Regressor Class
class RegressionTree:
    def __init__(self,max_depth = 15,min_samples_split = 10):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
    
    def fit(self, X, Y):
        self.n_feats = X.shape[1]
        self.col = list(X.columns)
        self.root = self.growTree(X, Y)

    def growTree(self, X, Y, depth = 0):
        df = X.copy()
        df['y'] = Y
        ymean = np.mean(Y)
        self.mse = self.get_mse(Y, ymean)
        n_sample = X.shape[0]
        # stopping criteria
        if depth >= self.max_depth or n_sample <= self.min_samples_split:
            leaf_value = np.mean(Y)
            return Node(value=leaf_value)
        best_feat, best_thresh = self.best_criteria(X, Y)
        left_df, right_df = df[df[best_feat]<=best_thresh].copy(), df[df[best_feat]>best_thresh].copy()
        left = self.growTree(left_df.drop('y', axis=1), left_df['y'].values.tolist(), depth+1)
        right = self.growTree(right_df.drop('y', axis=1), right_df['y'].values.tolist(), depth+1)
        return Node(best_feat, best_thresh, left, right)
    
    # find out best criteria
    def best_criteria(self, X, Y):
        df = X.copy()
        df['y'] = Y
        mse_base = self.mse
        best_feature = best_thresh = None
        for feat in X.columns:
            x_mean = self.moving_average(np.unique(df[feat]), 2)
            for value in x_mean:
                left_y = df[df[feat] <= value]['y'].values
                right_y = df[df[feat] > value]['y'].values
                left_mean = right_mean = 0
                if len(left_y) > 0:
                    left_mean = np.mean(left_y)
                if len(right_y) > 0:
                    right_mean = np.mean(right_y)
                
                res_left, res_right = left_y - left_mean, right_y - right_mean
                r = np.concatenate((res_left, res_right), axis=None)
                n = len(r)
                r = np.sum(r**2)
                mse_split = r / n
                if mse_split < mse_base:
                    mse_base = mse_split
                    best_feature = feat
                    best_thresh = value
        return (best_feature, best_thresh)
    
    def get_mse(self, y_true, y_hat):
        n = len(y_true)
        r = np.sum((y_true - y_hat)**2)
        return r / n
    
    def moving_average(self, x, window):
        return np.convolve(x, np.ones(window), 'valid') / window 
    
    def predict(self, X):
        X = X.to_numpy().tolist()
        return np.array([self.traverse_tree(x, self.root) for x in X])

    def traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        fr = node.feature
        index = self.col.index(fr)
        if x[index] <= node.threshold:
            return self.traverse_tree(x, node.left)
        return self.traverse_tree(x, node.right)

In [70]:
class RandomForest:
    def __init__(self, trees, n_trees, max_feature, prediction_aggrigation_calculation):
        self.n_estimators = n_trees
        self.max_features = max_feature
        self.tree_feature_indexes = []
        self.prediction_aggrigation_calculation = prediction_aggrigation_calculation 
        self.trees = trees

    def _make_random_suset(self, X, y, n_subsets, replasment=True):
        subset = []
        # use 100% of data when replacement is true , use 50% otherwise.
        # sử dụng 100% dữ liệu khi thay thế là đúng, sử dụng 50% nếu không.
        sample_size = (X.shape[0] if replasment else (X.shape[0] // 2))
        
        X = X.to_numpy()
        y = y.to_numpy()
        y = np.reshape(y, (len(y),1))
        # đổi y sang ma trận ...
        # First concadinate the X and y datasets in order to make a choice.
        # Đầu tiên ghép các tập dữ liệu X và y để đưa ra lựa chọn.
        Xy = np.concatenate((X, y), axis=1)
        # print(Xy)
        
        # Đổi các hàng cho nhau
        np.random.shuffle(Xy)
        # Select randome subset of data with replacement.
        # Chọn tập hợp con dữ liệu ngẫu nhiên có thay thế.
        for i in range(n_subsets):
            index = np.random.choice(range(sample_size), size=np.shape(range(sample_size)), replace=replasment)
            X = Xy[index][:, :-1]
            y = Xy[index][: , -1]
            subset.append({"X" : X, "y": y})
        # print(subset)
        return subset

    def train(self, X, y):
        # if the max_features is not given then select it as square root of no on feature availabe.
        # nếu max_features không được cung cấp thì hãy chọn nó làm căn bậc hai của tính năng không có sẵn.
        n_features = X.shape[1]
        name_columns = list(X.columns)
        # print('selected column names: ',name_columns)
        if self.max_features == None:
            self.max_features = int(math.sqrt(n_features))

        # Split the dataset into number of subsets equal to n_estimators.
        # Chia tập dữ liệu thành số tập con bằng n_estimators.
        subsets = self._make_random_suset(X, y, self.n_estimators)
        # print(subsets)

        for i, subset in enumerate(subsets):
            X_subset , y_subset = subset["X"], subset["y"]
            # select a random sucset of features for each tree. This is called feature bagging.
            # chọn một nhóm đặc điểm ngẫu nhiên cho mỗi cây. Điều này được gọi là đóng bao tính năng.
            idx = np.random.choice(range(n_features), size=self.max_features, replace=False)
            # track this for prediction.
            # theo dõi điều này để dự đoán.
            self.tree_feature_indexes.append(idx)
            # Get the X with the selected features only.
            # Chỉ nhận X với các tính năng đã chọn. X_subset= [[1,2,3], [3,4,5]]
            X_subset = X_subset[:, idx]
            selected_name_columns = []
            # print(idx)
            for j in range(len(idx)):
                selected_name_columns.append(name_columns[idx[j]])
            selected_name_columns.append('Y') #cột nhãn
            # change the y_subet to i dimentional array.
            # thay đổi mạng con thành mảng thứ i.
            y_subset = np.expand_dims(y_subset, axis =1)
            # print(y_subset)
            
            # build the model with selected features and selected random subset from dataset.
            # xây dựng mô hình với các tính năng được chọn và tập hợp con ngẫu nhiên được chọn từ tập dữ liệu.
            Xy_subset = np.concatenate((X_subset, y_subset), axis=1)
            Xy_subset = pd.DataFrame(Xy_subset, columns=selected_name_columns)
            print(Xy_subset.head())
            X_set = Xy_subset.drop(columns=['Y'])
            y_set = Xy_subset['Y']
            self.trees[i].fit(X_set, y_set)

    def predict(self, test_X):
        """
        Predict the new samples.

        :param test_X: Depentant variables for prediction.
        Các biến phụ thuộc để dự đoán.
        """
        # predict each sample one by one.
        # dự đoán từng mẫu một.
        y_preds = np.empty((test_X.shape[0], self.n_estimators))
        # print(y_preds)
        # find the prediction from each tree for eeach samples
        # tìm dự đoán từ mỗi cây cho mỗi mẫu
        for i, tree in enumerate(self.trees):
            features_index = self.tree_feature_indexes[i]
            col_name = list(test_X.columns)
            selected_col_name = []
            for j in features_index:
                selected_col_name.append(col_name[j])
            # print(selected_col_name)
            
            X_selected_features = test_X[selected_col_name]
            print(X_selected_features)
            # print(y_preds[:, i])
            y_preds[:, i] = tree.predict(X_selected_features)
            # print(y_preds[:, i])
            
        # find the arrgrecated output.
        # tìm đầu ra được phân bổ.
        y_pred = self.prediction_aggrigation_calculation(y_preds)

        return y_pred

In [71]:
class RandomForestRegression(RandomForest):
    """Rnadom forest for classification task."""
    def __init__(self, max_feature, max_depth, n_trees=100, min_sample_split=10):
        """
        :param max_depth: Int - Max depth of each tree.
        Độ sâu tối đa của mỗi cây.
        
        :param n_trees: Int - Number of trees/estimetors.
        Số cây
        
        :param min_sample_split: Int - minimum samples for a node to have before going for split.
        minimum samples for a node to have before going for split.
        các mẫu tối thiểu để một nút có trước khi chia tách.
        
        :param min_impurity: Int - Min inpurity a node can have.
        """
        self.prediction_aggrigation_calculation = self._mean_calculation
        
        # Initializing the trees.
        # Khởi tạo cây.
        self.trees = []
        for _ in range(n_trees):
            self.trees.append(RegressionTree(min_samples_split=min_sample_split, max_depth=max_depth))

        super().__init__(trees=self.trees, n_trees=n_trees,max_feature=max_feature,
                         prediction_aggrigation_calculation=self.prediction_aggrigation_calculation)
    
    def _mean_calculation(self, y_preds):
        """
        Find mean prediction of all tree prediction for each sampple.
        Tìm dự đoán trung bình của tất cả dự đoán cây cho từng mẫu.

        :param y_preds: Prediction value from number of estimators trees.
        Giá trị dự đoán từ số lượng cây ước tính.
        """
        # create a empty array to store the prediction.
        # tạo một mảng trống để lưu dự đoán.
        y_pred = np.empty((y_preds.shape[0], 1))
        # print(y_pred)
        # iterate over all the data samples.
        # lặp qua tất cả các mẫu dữ liệu.
        for i, sample_predictions in enumerate(y_preds):
            # print(sample_predictions)
            y_pred[i] = np.mean(sample_predictions)
            # print('ok')
            # print(y_pred)

        return y_pred

In [72]:
data = pd.read_table('../IOT/diabetes.tab.txt')

In [73]:
X = data.drop(columns=['Y'])
y = data['Y']
print(X.columns)
print(y)

Index(['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6'], dtype='object')
0      151
1       75
2      141
3      206
4      135
      ... 
437    178
438    104
439    132
440    220
441     57
Name: Y, Length: 442, dtype: int64


In [74]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1) 
# print(X_test)

In [75]:
random_forest_reg = RandomForestRegression(n_trees=10, max_feature=3, min_sample_split=10, max_depth=15)
# Train the model.
random_forest_reg.train(X_train, y_train) 

    BMI    S6    S4      Y
0  25.6  78.0  3.00   31.0
1  25.4  83.0  3.00   25.0
2  26.0  97.0  3.37  198.0
3  23.1  60.0  3.00  125.0
4  21.3  90.0  3.00   42.0
       BP   AGE      S5      Y
0   98.00  35.0  4.2047  200.0
1  103.00  51.0  4.8978  292.0
2   83.00  55.0  4.5218   53.0
3   82.00  50.0  4.0775  136.0
4  123.33  41.0  4.3175  257.0
       S5     S1   AGE      Y
0  4.8122  186.0  79.0  168.0
1  5.3083  186.0  64.0  150.0
2  3.8501  155.0  28.0  116.0
3  4.5850  171.0  46.0  167.0
4  4.2485  255.0  66.0   63.0
      S6    S3     S2      Y
0  109.0  48.0  112.4  274.0
1   88.0  56.0   90.6  121.0
2   86.0  42.0   96.8  225.0
3   82.0  93.0  108.8   87.0
4  103.0  38.0  151.6  192.0
      S1     S6    S4      Y
0  210.0  124.0  6.00  245.0
1  189.0   91.0  3.05   72.0
2  157.0   96.0  6.00  144.0
3  182.0   84.0  3.00  216.0
4  183.0   92.0  2.00   45.0
      S1    S4      BP      Y
0  209.0  5.00   98.00   83.0
1  238.0  4.96  110.67  190.0
2  185.0  5.00   93.00  245.0
3  1

In [76]:
# Predict the values.
y_pred = random_forest_reg.predict(X_test)
print(y_pred)
#Root mean square error.
# score = r2_score(y_test, y_pred)
# print("The r2_score of the trained model", score)


# result = pd.DataFrame({'Actual':y_test, 'Predict':y_pred})

print('MSE: ',mean_squared_error(y_test, y_pred))
print('r2: ', r2_score(y_test, y_pred))

      BMI   S6    S4
246  23.4   77  3.80
425  22.6   79  2.00
293  35.0   91  4.08
31   20.3   81  2.00
359  26.9  106  5.00
..    ...  ...   ...
277  20.9   95  2.00
132  24.4   97  4.00
213  19.8   93  3.00
286  21.3   90  2.00
256  41.3   94  5.00

[89 rows x 3 columns]
         BP  AGE      S5
246   76.67   60  5.1358
425   71.00   27  4.4188
293   98.33   29  4.0431
31    71.00   42  4.2341
359  104.00   59  4.8040
..      ...  ...     ...
277   95.00   39  4.4067
132   92.00   53  4.4998
213   88.00   49  4.3944
286   72.00   38  4.4308
256   81.00   35  4.9488

[89 rows x 3 columns]
         S5   S1  AGE
246  5.1358  247   60
425  4.4188  116   27
293  4.0431  204   29
31   4.2341  161   42
359  4.8040  194   59
..      ...  ...  ...
277  4.4067  150   39
132  4.4998  214   53
213  4.3944  188   49
286  4.4308  165   38
256  4.9488  168   35

[89 rows x 3 columns]
      S6    S3     S2
246   77  65.0  148.0
425   79  56.0   43.4
293   91  50.0  142.6
31    81  66.0   81.2
359  

In [77]:
print(y_test)
print(y_pred)

246     78
425    152
293    200
31      59
359    311
      ... 
277     64
132    107
213     49
286     60
256    346
Name: Y, Length: 89, dtype: int64
[[150.55      ]
 [113.27361111]
 [115.02416667]
 [ 91.31507937]
 [135.32416667]
 [157.89095238]
 [164.92361111]
 [ 92.20698413]
 [186.55555556]
 [123.5574359 ]
 [192.07825397]
 [238.85190476]
 [149.9724359 ]
 [ 98.5168254 ]
 [216.7615873 ]
 [136.97166667]
 [205.01920635]
 [ 99.78611111]
 [174.21079365]
 [198.42873016]
 [157.75333333]
 [107.64678571]
 [114.22960317]
 [121.42460317]
 [131.57138889]
 [136.71444444]
 [ 89.22035714]
 [ 96.00444444]
 [146.01468254]
 [174.52190476]
 [167.7868254 ]
 [138.53507937]
 [173.06968254]
 [ 89.08150794]
 [162.04603175]
 [180.975     ]
 [151.65777778]
 [170.53357143]
 [152.03333333]
 [171.10198413]
 [148.88309524]
 [152.15603175]
 [141.61178571]
 [111.60988095]
 [127.56309524]
 [193.72746032]
 [150.1968254 ]
 [131.40563492]
 [146.00777778]
 [200.96277778]
 [108.48055556]
 [149.77539683]
 [107.3169597

In [78]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [79]:
regressor = RandomForestRegressor(max_depth=15, random_state=0)
regressor.fit(X_train,y_train)

RandomForestRegressor(max_depth=15, random_state=0)

In [80]:
predicted_y = regressor.predict(X_test)
result = pd.DataFrame({'Actual':y_test, 'Predict':predicted_y})
print(result)
print('MSE: ',mean_squared_error(y_test, predicted_y))
print('r2: ', r2_score(y_test, predicted_y))

     Actual     Predict
246      78  131.440000
425     152  104.337143
293     200  165.410000
31       59   73.620882
359     311  165.753333
..      ...         ...
277      64   93.135714
132     107   95.575238
213      49   86.892610
286      60   88.229494
256     346  166.060000

[89 rows x 2 columns]
MSE:  3827.715235726259
r2:  0.28171579175052086
