In [4]:
import random
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
def load_tutors_data(TUTOR_PATH):
    csv_path = os.path.join(TUTOR_PATH)
    return pd.read_csv(csv_path)

In [6]:
tutors_train = load_tutors_data("train.csv")
tutors_test = load_tutors_data("test.csv")

In [7]:
tutors_train['Id'].value_counts()

2047    1
9518    1
7481    1
5432    1
9526    1
       ..
2716    1
8857    1
4759    1
6806    1
0       1
Name: Id, Length: 10000, dtype: int64

In [8]:
tutors_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   10000 non-null  int64  
 1   age                  10000 non-null  float64
 2   years_of_experience  10000 non-null  float64
 3   lesson_price         10000 non-null  float64
 4   qualification        10000 non-null  float64
 5   physics              10000 non-null  float64
 6   chemistry            10000 non-null  float64
 7   biology              10000 non-null  float64
 8   english              10000 non-null  float64
 9   geography            10000 non-null  float64
 10  history              10000 non-null  float64
 11  mean_exam_points     10000 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 937.6 KB


In [9]:
tutors_train.describe()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4999.5,45.878,1.9868,1699.105,1.7195,0.375,0.1329,0.1096,0.0537,0.0321,0.0194,64.3408
std,2886.89568,8.043929,1.772213,524.886654,0.792264,0.484147,0.339484,0.312406,0.225436,0.176274,0.137933,13.536823
min,0.0,23.0,0.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0
25%,2499.75,40.0,0.0,1300.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0
50%,4999.5,46.0,2.0,1500.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0
75%,7499.25,51.0,3.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0
max,9999.0,68.0,10.0,3950.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0


In [10]:
tutor_train_copy = tutors_train.copy()
tutor_train_copy = tutor_train_copy.drop("mean_exam_points", axis=1)
tutor_train_copy = tutor_train_copy.drop("Id", axis=1)

In [11]:
tutor_test_copy = tutors_test.copy()
tutor_test_copy = tutor_test_copy.drop("Id", axis=1)

In [12]:
def StandartScaler(data):
    columns = data.columns
    for col in columns:
        min_val = np.min(data[col])
        max_val = np.max(data[col])
        for i in range(data.shape[0]):
            data[col].iloc[i] = (data[col].iloc[i] - min_val) / (max_val - min_val)
    return data

In [13]:
# tutor_train_copy = StandartScaler(tutor_train_copy)

In [14]:
target = tutors_train["mean_exam_points"]

In [15]:
tutor_train_copy.head()

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history
0,40.0,0.0,1400.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,48.0,4.0,2850.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0
2,39.0,0.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,46.0,5.0,1400.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,43.0,1.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# tutor_test_copy = StandartScaler(tutor_train_copy)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(tutor_train_copy, target, test_size=0.3, random_state=42)

#### В качестве модели мной будет использован случайный лес

In [18]:
random.seed(42)

def get_bootstrap(data, labels, N):
    n_samples = data.shape[0]
    bootstrap = []
    
    for i in range(N):
        b_data = np.zeros(data.shape)
        b_labels = np.zeros(labels.shape)
        
        for j in range(n_samples):
            sample_index = random.randint(0, n_samples-1)
            b_data[j] = data.iloc[sample_index]
            b_labels[j] = labels.iloc[sample_index]
        bootstrap.append((b_data, b_labels))
        
    return bootstrap

In [19]:
def get_subsample(len_sample):
    # будем сохранять не сами признаки, а их индексы
    sample_indexes = [i for i in range(len_sample)]
    
    len_subsample = int(np.sqrt(len_sample))
    subsample = []
    
    random.shuffle(sample_indexes)
    for _ in range(len_subsample):
        subsample.append(sample_indexes.pop())
        
    return subsample

In [20]:
# Реализуем класс узла

class Node:
    
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index  # индекс признака, по которому ведется сравнение с порогом в этом узле
        self.t = t  # значение порога
        self.true_branch = true_branch  # поддерево, удовлетворяющее условию в узле
        self.false_branch = false_branch  # поддерево, не удовлетворяющее условию в узле

In [21]:
# И класс терминального узла (листа)

class Leaf:
    
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        self.prediction = self.predict()
        
    def predict(self):
        prediction = np.mean(self.labels)
        return prediction        

In [22]:
def varience(labels):
    return np.var(labels)

In [23]:
def quality(left_labels, right_labels, current_var):

    # доля выбоки, ушедшая в левое поддерево
    p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
    
    return current_var - p * varience(left_labels) - (1 - p) * varience(right_labels)

In [24]:
def split(data, labels, index, t):
    
    left = np.where(data[:, index] <= t)
    right = np.where(data[:, index] > t)
        
    true_data = data[left]
    false_data = data[right]
    true_labels = labels[left]
    false_labels = labels[right]
        
    return true_data, false_data, true_labels, false_labels

In [25]:
def get_values(t_val, n):
    t_val = np.sort(t_val)
    new_value = []
    temp = []
    for i, item in enumerate(t_val):
        if (i%n == 0) or (i == len(t_val) - 1):
            new_value.append(np.mean(temp))
            temp = []
        else:
            temp.append(item)

    return new_value      

In [43]:
def find_best_split(data, labels, leaf_min):
    
    #  обозначим минимальное количество объектов в узле
    min_leaf = leaf_min

    current_var = varience(labels)

    best_quality = 0
    best_t = None
    best_index = None
    
    n_features = data.shape[1]
    
    # выбор индекса из подвыборки длиной sqrt(n_features)
    subsample = get_subsample(n_features)
    
    for index in subsample:
        # будем проверять только уникальные значения признака, исключая повторения
        t_values = [row[index] for row in data]
        
        # выбор t_value, исходя из средних значений подвыборок
        for line in range(10, 511, 50):
            t_values = get_values(t_values, line)
        
            for t in t_values:
                true_data, false_data, true_labels, false_labels = split(data, labels, index, t)
                #  пропускаем разбиения, в которых в узле остается менее 5 объектов
                if len(true_data) < min_leaf or len(false_data) < min_leaf:
                    continue
            
                current_quality = quality(true_labels, false_labels, current_var)
            
                #  выбираем порог, на котором получается максимальный прирост качества
                if current_quality > best_quality:
                    best_quality, best_t, best_index = current_quality, t, index

    return best_quality, best_t, best_index

In [68]:
def build_tree(data, labels, leaf_min):

    quality, t, index = find_best_split(data, labels, leaf_min)

    #  Базовый случай - прекращаем рекурсию, когда нет прироста в качества
    if quality == 0:
        return Leaf(data, labels)

    true_data, false_data, true_labels, false_labels = split(data, labels, index, t)

    # Рекурсивно строим два поддерева
    true_branch = build_tree(true_data, true_labels, leaf_min)
    false_branch = build_tree(false_data, false_labels, leaf_min)

    # Возвращаем класс узла со всеми поддеревьями, то есть целого дерева
    return Node(index, t, true_branch, false_branch)

In [75]:
def random_forest(data, labels, n_trees, leaf_min):
    forest = []
    bootstrap = get_bootstrap(data, labels, n_trees)
    
    for b_data, b_labels in bootstrap:
        forest.append(build_tree(b_data, b_labels, leaf_min))
        
    return forest

In [76]:
def classify_object(obj, node):

    #  Останавливаем рекурсию, если достигли листа
    if isinstance(node, Leaf):
        answer = node.prediction
        return answer

    if obj[node.index] <= node.t:
        return classify_object(obj, node.true_branch)
    else:
        return classify_object(obj, node.false_branch)

In [77]:
def predict(data, tree):
    
    classes = []
    for obj in range(data.shape[0]):
        prediction = classify_object(data.iloc[obj], tree)
        classes.append(prediction)
    return classes

In [78]:
def tree_vote(forest, data):

    # добавим предсказания всех деревьев в список
    predictions = []
    for tree in forest:
        predictions.append(predict(data, tree))
    
    # сформируем список с предсказаниями для каждого объекта
    predictions_per_object = list(zip(*predictions))
    
    # выберем в качестве итогового предсказания для каждого объекта то,
    # за которое проголосовало большинство деревьев
    voted_predictions = []
    for obj in predictions_per_object:
        voted_predictions.append(np.mean(obj))
        
    return voted_predictions

In [79]:
from sklearn.metrics import r2_score
# def r2(predict, actual):
#     return (1 - np.(actual) / np.var(actual - predict))

In [80]:
n_trees = 40
min_leaf_n = 6
optimal_leaf = 0
score = 0
best_score = 0
my_forest = 0


for leaf_min in range(2, min_leaf_n):
    my_forest_1 = random_forest(X_train, y_train, n_trees, leaf_min)
    train_answers = tree_vote(my_forest_1, X_train)
    if r2_score(train_answers, y_train) > score:
        my_forest = my_forest_1
        score = r2_score(train_answers, y_train)
        optimal_leaf = leaf_min
        print(score)
        
print(f'Оптимальные параметры модели: max_depth = {optimal_depth}, min_leaf = {optimal_leaf}, best_score = {score}')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


0.7840382087354661


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Оптимальные параметры модели: max_depth = 8, min_leaf = 2, best_score = 0.7840382087354661


In [81]:
# Получим ответы для обучающей выборки 
train_answers = tree_vote(my_forest, X_train)

In [82]:
r2_score(train_answers, y_train)

0.7840382087354661

In [83]:
test_answers = tree_vote(my_forest, X_test)

In [84]:
r2_score(test_answers, y_test)

0.689389506806874

In [85]:
test_answers = tree_vote(my_forest, tutor_test_copy)

In [86]:
print(test_answers[:5])

[56.5273906645884, 63.035239529712314, 48.9154336744778, 89.98527070517203, 88.81123956734763]


In [87]:
df_test_2 = pd.DataFrame(test_answers, columns=['mean_exam_points'])
df_test_2

Unnamed: 0,mean_exam_points
0,56.527391
1,63.035240
2,48.915434
3,89.985271
4,88.811240
...,...
9995,42.661614
9996,79.045209
9997,53.689442
9998,65.027513


In [88]:
df_test_1 = pd.read_csv("submission_example.csv", delimiter=',')
df_test_1['Id']

0       10000
1       10001
2       10002
3       10003
4       10004
        ...  
9995    19995
9996    19996
9997    19997
9998    19998
9999    19999
Name: Id, Length: 10000, dtype: int64

In [89]:
df_test = pd.concat([df_test_1['Id'], df_test_2['mean_exam_points']], axis=1)
df_test.to_csv('submission.csv', index=False)