In [1]:
import numpy as np
import pandas as pd
import math

# from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.tree import DecisionTreeRegressor

In [122]:
# def mean_squared_error(y_test, y_pred):
#     y_test = y_test.to_numpy()
#     # y_pred = y_pred.to_numpy()
#     # print(y_test)
#     # print(y_pred.shape[0])
#     result = 0
#     for i in range(y_pred.shape[0]):
#         print(y_test[i])
#         print(y_pred[i][0])
#         print('---------')
#         result += 
#     return result

In [2]:
class Node:
    
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf_node(self):
        return self.value is not None

In [3]:
# Decision Tree Regressor Class
class RegressionTree:
    def __init__(self,max_depth = 15,min_samples_split = 10):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
    
    def fit(self, X, Y):
        self.n_feats = X.shape[1]
        self.col = list(X.columns)
        self.root = self.growTree(X, Y)

    def growTree(self, X, Y, depth = 0):
        df = X.copy()
        df['y'] = Y
        ymean = np.mean(Y)
        self.mse = self.get_mse(Y, ymean)
        n_sample = X.shape[0]
        # stopping criteria
        if depth >= self.max_depth or n_sample <= self.min_samples_split:
            leaf_value = np.mean(Y)
            return Node(value=leaf_value)
        best_feat, best_thresh = self.best_criteria(X, Y)
        left_df, right_df = df[df[best_feat]<=best_thresh].copy(), df[df[best_feat]>best_thresh].copy()
        left = self.growTree(left_df.drop('y', axis=1), left_df['y'].values.tolist(), depth+1)
        right = self.growTree(right_df.drop('y', axis=1), right_df['y'].values.tolist(), depth+1)
        return Node(best_feat, best_thresh, left, right)
    
    # find out best criteria
    def best_criteria(self, X, Y):
        df = X.copy()
        df['y'] = Y
        mse_base = self.mse
        best_feature = best_thresh = None
        for feat in X.columns:
            x_mean = self.moving_average(np.unique(df[feat]), 2)
            for value in x_mean:
                left_y = df[df[feat] <= value]['y'].values
                right_y = df[df[feat] > value]['y'].values
                left_mean = right_mean = 0
                if len(left_y) > 0:
                    left_mean = np.mean(left_y)
                if len(right_y) > 0:
                    right_mean = np.mean(right_y)
                
                res_left, res_right = left_y - left_mean, right_y - right_mean
                r = np.concatenate((res_left, res_right), axis=None)
                n = len(r)
                r = np.sum(r**2)
                mse_split = r / n
                if mse_split < mse_base:
                    mse_base = mse_split
                    best_feature = feat
                    best_thresh = value
        return (best_feature, best_thresh)
    
    def get_mse(self, y_true, y_hat):
        n = len(y_true)
        r = np.sum((y_true - y_hat)**2)
        return r / n
    
    def moving_average(self, x, window):
        return np.convolve(x, np.ones(window), 'valid') / window 
    
    def predict(self, X):
        X = X.to_numpy().tolist()
        return np.array([self.traverse_tree(x, self.root) for x in X])

    def traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        fr = node.feature
        index = self.col.index(fr)
        if x[index] <= node.threshold:
            return self.traverse_tree(x, node.left)
        return self.traverse_tree(x, node.right)

In [4]:
class RandomForest:
    def __init__(self, trees, n_trees, max_feature, prediction_aggrigation_calculation):
        self.n_estimators = n_trees
        self.max_features = max_feature
        self.tree_feature_indexes = []
        self.prediction_aggrigation_calculation = prediction_aggrigation_calculation 
        self.trees = trees

    def _make_random_suset(self, X, y, n_subsets, replasment=True):
        subset = []
        # use 100% of data when replacement is true , use 50% otherwise.
        # sử dụng 100% dữ liệu khi thay thế là đúng, sử dụng 50% nếu không.
        sample_size = (X.shape[0] if replasment else (X.shape[0] // 2))
        
        X = X.to_numpy()
        y = y.to_numpy()
        y = np.reshape(y, (len(y),1))
        # đổi y sang ma trận ...
        # First concadinate the X and y datasets in order to make a choice.
        # Đầu tiên ghép các tập dữ liệu X và y để đưa ra lựa chọn.
        Xy = np.concatenate((X, y), axis=1)
        # print(Xy)
        
        # Đổi các hàng cho nhau
        np.random.shuffle(Xy)
        # Select randome subset of data with replacement.
        # Chọn tập hợp con dữ liệu ngẫu nhiên có thay thế.
        for i in range(n_subsets):
            index = np.random.choice(range(sample_size), size=np.shape(range(sample_size)), replace=replasment)
            X = Xy[index][:, :-1]
            y = Xy[index][: , -1]
            subset.append({"X" : X, "y": y})
        # print(subset)
        return subset

    def train(self, X, y):
        # if the max_features is not given then select it as square root of no on feature availabe.
        # nếu max_features không được cung cấp thì hãy chọn nó làm căn bậc hai của tính năng không có sẵn.
        n_features = X.shape[1]
        name_columns = list(X.columns)
        # print('selected column names: ',name_columns)
        if self.max_features == None:
            self.max_features = int(math.sqrt(n_features))

        # Split the dataset into number of subsets equal to n_estimators.
        # Chia tập dữ liệu thành số tập con bằng n_estimators.
        subsets = self._make_random_suset(X, y, self.n_estimators)
        # print(subsets)

        for i, subset in enumerate(subsets):
            X_subset , y_subset = subset["X"], subset["y"]
            # select a random sucset of features for each tree. This is called feature bagging.
            # chọn một nhóm đặc điểm ngẫu nhiên cho mỗi cây. Điều này được gọi là đóng bao tính năng.
            idx = np.random.choice(range(n_features), size=self.max_features, replace=False)
            # track this for prediction.
            # theo dõi điều này để dự đoán.
            self.tree_feature_indexes.append(idx)
            # Get the X with the selected features only.
            # Chỉ nhận X với các tính năng đã chọn. X_subset= [[1,2,3], [3,4,5]]
            X_subset = X_subset[:, idx]
            selected_name_columns = []
            # print(idx)
            for j in range(len(idx)):
                selected_name_columns.append(name_columns[idx[j]])
            selected_name_columns.append('Y') #cột nhãn
            # change the y_subet to i dimentional array.
            # thay đổi mạng con thành mảng thứ i.
            y_subset = np.expand_dims(y_subset, axis =1)
            # print(y_subset)
            
            # build the model with selected features and selected random subset from dataset.
            # xây dựng mô hình với các tính năng được chọn và tập hợp con ngẫu nhiên được chọn từ tập dữ liệu.
            Xy_subset = np.concatenate((X_subset, y_subset), axis=1)
            Xy_subset = pd.DataFrame(Xy_subset, columns=selected_name_columns)
            print(Xy_subset.head())
            X_set = Xy_subset.drop(columns=['Y'])
            y_set = Xy_subset['Y']
            self.trees[i].fit(X_set, y_set)

    def predict(self, test_X):
        """
        Predict the new samples.

        :param test_X: Depentant variables for prediction.
        Các biến phụ thuộc để dự đoán.
        """
        # predict each sample one by one.
        # dự đoán từng mẫu một.
        y_preds = np.empty((test_X.shape[0], self.n_estimators))
        # print(y_preds)
        # find the prediction from each tree for eeach samples
        # tìm dự đoán từ mỗi cây cho mỗi mẫu
        for i, tree in enumerate(self.trees):
            features_index = self.tree_feature_indexes[i]
            col_name = list(test_X.columns)
            selected_col_name = []
            for j in features_index:
                selected_col_name.append(col_name[j])
            # print(selected_col_name)
            
            X_selected_features = test_X[selected_col_name]
            print(X_selected_features)
            # print(y_preds[:, i])
            y_preds[:, i] = tree.predict(X_selected_features)
            # print(y_preds[:, i])
            
        # find the arrgrecated output.
        # tìm đầu ra được phân bổ.
        y_pred = self.prediction_aggrigation_calculation(y_preds)

        return y_pred

In [5]:
class RandomForestRegression(RandomForest):
    """Rnadom forest for classification task."""
    def __init__(self, max_feature, max_depth, n_trees=100, min_sample_split=10):
        """
        :param max_depth: Int - Max depth of each tree.
        Độ sâu tối đa của mỗi cây.
        
        :param n_trees: Int - Number of trees/estimetors.
        Số cây
        
        :param min_sample_split: Int - minimum samples for a node to have before going for split.
        minimum samples for a node to have before going for split.
        các mẫu tối thiểu để một nút có trước khi chia tách.
        
        :param min_impurity: Int - Min inpurity a node can have.
        """
        self.prediction_aggrigation_calculation = self._mean_calculation
        
        # Initializing the trees.
        # Khởi tạo cây.
        self.trees = []
        for _ in range(n_trees):
            self.trees.append(RegressionTree(min_samples_split=min_sample_split, max_depth=max_depth))

        super().__init__(trees=self.trees, n_trees=n_trees,max_feature=max_feature,
                         prediction_aggrigation_calculation=self.prediction_aggrigation_calculation)
    
    def _mean_calculation(self, y_preds):
        """
        Find mean prediction of all tree prediction for each sampple.
        Tìm dự đoán trung bình của tất cả dự đoán cây cho từng mẫu.

        :param y_preds: Prediction value from number of estimators trees.
        Giá trị dự đoán từ số lượng cây ước tính.
        """
        # create a empty array to store the prediction.
        # tạo một mảng trống để lưu dự đoán.
        y_pred = np.empty((y_preds.shape[0], 1))
        # print(y_pred)
        # iterate over all the data samples.
        # lặp qua tất cả các mẫu dữ liệu.
        for i, sample_predictions in enumerate(y_preds):
            # print(sample_predictions)
            y_pred[i] = np.mean(sample_predictions)
            # print('ok')
            # print(y_pred)

        return y_pred

In [6]:
data = pd.read_table('D:\\Backup Data from Drive C\\Desktop\\Nhập môn KHDL\\Machine Learning\\IOT\\diabetes.tab.txt')

In [7]:
X = data.drop(columns=['Y'])
y = data['Y']
print(X.columns)
print(y)

Index(['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6'], dtype='object')
0      151
1       75
2      141
3      206
4      135
      ... 
437    178
438    104
439    132
440    220
441     57
Name: Y, Length: 442, dtype: int64


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1) 
# print(X_test)

In [10]:
random_forest_reg = RandomForestRegression(n_trees=10, max_feature=3, min_sample_split=10, max_depth=15)
# Train the model.
random_forest_reg.train(X_train, y_train) 

   SEX    S6      S5      Y
0  1.0  94.0  4.7449  154.0
1  2.0  99.0  4.9628  164.0
2  1.0  60.0  3.8918   65.0
3  1.0  73.0  4.2905  118.0
4  2.0  78.0  4.2047   77.0
      S6   AGE  SEX      Y
0   90.0  34.0  2.0  181.0
1  108.0  34.0  2.0   42.0
2   80.0  51.0  2.0   91.0
3   88.0  37.0  2.0  142.0
4   82.0  67.0  1.0   90.0
      S2   BMI  SEX      Y
0  103.2  21.6  1.0   75.0
1   83.8  20.9  1.0   86.0
2  122.4  26.6  2.0  245.0
3   91.8  18.9  1.0   72.0
4  101.6  26.5  1.0  258.0
      S6      BP   AGE      Y
0   93.0   84.00  37.0  128.0
1   94.0   94.67  35.0   58.0
2   92.0   87.00  67.0  127.0
3  100.0  110.00  48.0   65.0
4   99.0   97.00  53.0   49.0
      S1      BP      S5      Y
0  218.0   91.33  4.9053  259.0
1  156.0   82.00  3.9890  134.0
2  253.0  123.00  5.4250  252.0
3  194.0  108.00  5.3471  246.0
4  161.0  101.00  4.2047   44.0
      S1   AGE     S6      Y
0  162.0  25.0   87.0   49.0
1  182.0  32.0   89.0  129.0
2  180.0  29.0   88.0  310.0
3  261.0  51.0   93.

In [11]:
# Predict the values.
y_pred = random_forest_reg.predict(X_test)
print(y_pred)
#Root mean square error.
# score = r2_score(y_test, y_pred)
# print("The r2_score of the trained model", score)


# result = pd.DataFrame({'Actual':y_test, 'Predict':y_pred})

print('MSE: ',mean_squared_error(y_test, y_pred))
# print('r2: ', r2_score(y_test, y_pred))

     SEX   S6      S5
246    1   77  5.1358
425    1   79  4.4188
293    1   91  4.0431
31     1   81  4.2341
359    2  106  4.8040
..   ...  ...     ...
277    1   95  4.4067
132    2   97  4.4998
213    1   93  4.3944
286    1   90  4.4308
256    1   94  4.9488

[89 rows x 3 columns]
      S6  AGE  SEX
246   77   60    1
425   79   27    1
293   91   29    1
31    81   42    1
359  106   59    2
..   ...  ...  ...
277   95   39    1
132   97   53    2
213   93   49    1
286   90   38    1
256   94   35    1

[89 rows x 3 columns]
        S2   BMI  SEX
246  148.0  23.4    1
425   43.4  22.6    1
293  142.6  35.0    1
31    81.2  20.3    1
359  126.6  26.9    2
..     ...   ...  ...
277   65.6  20.9    1
132  146.0  24.4    2
213  114.8  19.8    1
286   60.2  21.3    1
256  102.8  41.3    1

[89 rows x 3 columns]
      S6      BP  AGE
246   77   76.67   60
425   79   71.00   27
293   91   98.33   29
31    81   71.00   42
359  106  104.00   59
..   ...     ...  ...
277   95   95.00   39

In [12]:
print(y_test)
print(y_pred)

246     78
425    152
293    200
31      59
359    311
      ... 
277     64
132    107
213     49
286     60
256    346
Name: Y, Length: 89, dtype: int64
[[138.60208333]
 [114.31333333]
 [120.62896825]
 [ 86.99777778]
 [195.16277778]
 [167.6631746 ]
 [195.29444444]
 [160.55166667]
 [191.22670635]
 [118.02690476]
 [177.88277778]
 [138.14638889]
 [111.12583333]
 [102.92968254]
 [174.24111111]
 [145.58027778]
 [175.18055556]
 [ 89.06468254]
 [156.385     ]
 [162.00666667]
 [179.36892857]
 [100.87690476]
 [124.43607143]
 [126.2727381 ]
 [121.48420635]
 [198.99555556]
 [135.10079365]
 [154.555     ]
 [166.91833333]
 [167.90678571]
 [153.08559524]
 [164.69934524]
 [143.75230159]
 [104.92337302]
 [162.19134921]
 [197.26305556]
 [ 95.28388889]
 [190.41194444]
 [154.37055556]
 [150.29440476]
 [161.46170635]
 [167.23083333]
 [105.13194444]
 [123.73083333]
 [153.41777778]
 [235.27492063]
 [123.43055556]
 [133.40214286]
 [151.4625    ]
 [156.77083333]
 [102.71722222]
 [159.95305556]
 [129.3918650

In [13]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
regressor = RandomForestRegressor(max_depth=15, random_state=0)
regressor.fit(X_train,y_train)

In [15]:
predicted_y = regressor.predict(X_test)
result = pd.DataFrame({'Actual':y_test, 'Predict':predicted_y})
print(result)
print('MSE: ',mean_squared_error(y_test, predicted_y))
print('r2: ', r2_score(y_test, predicted_y))

     Actual     Predict
246      78  131.440000
425     152  104.337143
293     200  165.410000
31       59   73.620882
359     311  165.753333
..      ...         ...
277      64   93.135714
132     107   95.575238
213      49   86.892610
286      60   88.229494
256     346  166.060000

[89 rows x 2 columns]
MSE:  3827.715235726259
r2:  0.28171579175052086
