# Курсовой проект

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline

In [2]:
train = pd.read_csv('./train.csv', index_col='Id')
test =  pd.read_csv('./test.csv', index_col='Id')

TARGET = 'mean_exam_points'

In [3]:
X = train.drop(columns=[TARGET])
y = train[TARGET]

## Обзор данных

In [4]:
X.describe()

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,45.878,1.9868,1699.105,1.7195,0.375,0.1329,0.1096,0.0537,0.0321,0.0194
std,8.043929,1.772213,524.886654,0.792264,0.484147,0.339484,0.312406,0.225436,0.176274,0.137933
min,23.0,0.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,0.0,1300.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,46.0,2.0,1500.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,3.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
max,68.0,10.0,3950.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
train.corr()[TARGET]

age                   -0.007646
years_of_experience    0.205417
lesson_price           0.721179
qualification          0.755963
physics                0.187726
chemistry              0.017825
biology                0.023022
english                0.013174
geography              0.014401
history               -0.000113
mean_exam_points       1.000000
Name: mean_exam_points, dtype: float64

## Создание алгоритма случайного леса

In [31]:
# R2 score
def r2score(y_true, y_pred):
    ym = y_true.mean()
    tot = np.sum( (y_true - ym) ** 2 )
    reg = np.sum( (y_true - y_pred) ** 2 )
    return 1-reg/tot


def print_score(model, X_train, y_train, X_test, y_test):
    y_pred = model.predict(X_train)
    train_score = r2score(y_train, y_pred)
    
    y_pred = model.predict(X_test)
    test_score = r2score(y_test, y_pred)

    print(f'Train Score {train_score:0.4f}\nTest Score {test_score:0.4f}')
    return test_score


class Node:
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index  
        self.t = t  
        self.true_branch = true_branch 
        self.false_branch = false_branch 
        
class Leaf:
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.prediction = np.mean(self.y) 


# Decision Tree Regressor
class DTR:
    def __init__(self, min_samples_leaf=5, max_depth=None, max_features=None):
        self.min_samples_leaf = min_samples_leaf
        self.max_depth = max_depth
        self.max_features = max_features
        
    def dispersion(self, values):
        m = values.mean()
        return np.mean((values - m) ** 2)
    
    def quality(self, left_values, right_values, current_dispersion):
        p = float(left_values.shape[0]) / (left_values.shape[0] + right_values.shape[0])
        return current_dispersion - p * self.dispersion(left_values) - (1 - p) * self.dispersion(right_values)
    
    def split(self, x, y, index, t):
        left = np.where(x[:, index] <= t)
        right = np.where(x[:, index] > t)

        true_data = x[left]
        false_data = x[right]
        true_y = y[left]
        false_y = y[right]
        
        return true_data, false_data, true_y, false_y
    
    def get_features(self):
        return range(self.n_features)
        
    def find_best_split(self, x, y):
        current_dispersion = self.dispersion(y)

        best_t = None
        best_index = None
        best_quality = 0

        for index in self.get_features():            
            t_values = x[:, index]
            
            for t in t_values:
                true_data, false_data, true_y, false_y = self.split(x, y, index, t)

                if len(true_data) < self.min_samples_leaf or len(false_data) < self.min_samples_leaf:
                    continue

                current_quality = self.quality(true_y, false_y, current_dispersion)
                if current_quality > best_quality:
                    best_quality, best_t, best_index = current_quality, t, index

        return best_quality, best_t, best_index    
  
    def build_tree(self, x, y, depth=0):
        quality, t, index = self.find_best_split(x, y)
        if quality == 0 or (self.max_depth and depth > self.max_depth):
            return Leaf(x, y)

        true_data, false_data, true_y, false_y = self.split(x, y, index, t)
        true_branch = self.build_tree(true_data, true_y, depth+1)
        false_branch = self.build_tree(false_data, false_y, depth+1)

        return Node(index, t, true_branch, false_branch)

    def print_tree(self):
        def print_node(node, spacing=""):
            if isinstance(node, Leaf):
                print(spacing + "Прогноз:", node.prediction)
                return

            print(spacing + 'Индекс', str(node.index))
            print(spacing + 'Порог', str(node.t))

            print (spacing + '--> True:')
            print_node(node.true_branch, spacing + "  ")

            print (spacing + '--> False:')
            print_node(node.false_branch, spacing + "  ")
        
        print_node(self.tree_)
    
    def fit(self, X, y):
        if self.max_features and self.max_features < X.shape[1]:
            self.n_features = self.max_features
        else:
            self.n_features = X.shape[1]

        self.tree_ = self.build_tree(X, y)
        return self

    def predict(self, X):
        def predict_object(obj, node):
            if isinstance(node, Leaf):
                return node.prediction
            if obj[node.index] <= node.t:
                return predict_object(obj, node.true_branch)
            else:
                return predict_object(obj, node.false_branch)

        y = []
        for obj in X:
            prediction = predict_object(obj, self.tree_)
            y.append(prediction)
        return y


# Random Forest Regressor
class RFR(DTR):
    
    def __init__(self, n_trees=1, min_samples_leaf=5, max_depth=None, max_features=None, random_state=42):
        super().__init__(min_samples_leaf, max_depth, max_features)
        self.n_trees = n_trees
        self.random = np.random.RandomState(random_state)

    def get_bootstrap(self, X, y):
        bootstrap = []
        n_samples = X.shape[0]
        for i in range(self.n_trees):
            bX = np.zeros(X.shape)
            by = np.zeros(y.shape)
            for j in range(n_samples):
                ind = self.random.randint(0, n_samples)
                bX[ind] = X[ind]
                by[ind] = y[ind]
            bootstrap.append((bX, by))
        return bootstrap
    
    def get_features(self):
        sample_indexes = [i for i in range(self.n_features)]
        len_subsample = self.n_features // 3
        subsample = []
        self.random.shuffle(sample_indexes)
        for _ in range(len_subsample):
            subsample.append(sample_indexes.pop())
        return subsample

    def fit(self, X, y):
        if self.max_features and self.max_features < X.shape[1]:
            self.n_features = self.max_features
        else:
            self.n_features = X.shape[1]

        self.forest = []
        bootstrap = self.get_bootstrap(X, y)
        for bX, by in bootstrap:
            self.forest.append(self.build_tree(bX, by))
        return self

    def predict(self, X):
        def predict_object(obj, node):
            if isinstance(node, Leaf):
                return node.prediction
            if obj[node.index] <= node.t:
                return predict_object(obj, node.true_branch)
            else:
                return predict_object(obj, node.false_branch)

        y = []
        for obj in X:
            prediction = []
            for tree in self.forest:
                prediction.append(predict_object(obj, tree))
            y.append(np.mean(prediction))
        return y

## Создание и тестирование модели

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.3, random_state=42)

In [53]:
ntrees = [5, 10]
mleafs = [5, 7]
mdepths = [9, 11]

best = 0
best_params = {}
for nt in ntrees:
    for ml in mleafs:
        for md in mdepths:
            print(f'NTrees = {nt}, MinLeaf = {ml}, MaxDepth = {md}')
            model = RFR(n_trees=nt, min_samples_leaf=ml, max_depth=md)
            model.fit(X_train, y_train)
            score = print_score(model, X_train, y_train, X_test, y_test)
            if score > best:
                best_params['n_trees'] = nt
                best_params['min_samples_leaf'] = ml
                best_params['max_depth'] = md
                best = score

NTrees = 5, MinLeaf = 5, MaxDepth = 9
Train Score 0.7607
Test Score 0.7434
NTrees = 5, MinLeaf = 5, MaxDepth = 11
Train Score 0.7716
Test Score 0.7442
NTrees = 5, MinLeaf = 7, MaxDepth = 9
Train Score 0.7611
Test Score 0.7401
NTrees = 5, MinLeaf = 7, MaxDepth = 11
Train Score 0.7708
Test Score 0.7479
NTrees = 10, MinLeaf = 5, MaxDepth = 9
Train Score 0.7799
Test Score 0.7646
NTrees = 10, MinLeaf = 5, MaxDepth = 11
Train Score 0.7908
Test Score 0.7671
NTrees = 10, MinLeaf = 7, MaxDepth = 9
Train Score 0.7667
Test Score 0.7499
NTrees = 10, MinLeaf = 7, MaxDepth = 11
Train Score 0.7728
Test Score 0.7519


In [54]:
print('BEST PARAMS:')
for k in best_params:
    print(f' {k} = {best_params[k]}')

BEST PARAMS:
 n_trees = 10
 min_samples_leaf = 5
 max_depth = 11


## Предсказание

In [56]:
model = RFR(**best_params)
model.fit(X.values, y.values)

test[TARGET] = model.predict(test.values)
test.loc[:,[TARGET]].to_csv('predictions.csv')