Решение задачи для соревнования: https://www.kaggle.com/c/choose-tutors/overview

### i. Инициализация зависимостей

Инициализируем импорты

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
import matplotlib.pyplot as plt
import seaborn as sns

### ii. Инициализация классов и функций

Блок классов и функций

In [29]:
def get_bootstrap(data, labels, N):
    n_samples = data.shape[0]
    bootstrap = []
    
    for i in range(N):
        b_data = np.zeros(data.shape)
        b_labels = np.zeros(labels.shape)
        
        for j in range(n_samples):
            sample_index = random.randint(0, n_samples-1)
            b_data[j] = data.loc[sample_index]
            b_labels[j] = labels[sample_index]
        bootstrap.append((b_data, b_labels))
        
    return bootstrap

In [27]:
def get_subsample(len_sample):
    sample_indexes = [i for i in range(len_sample)]
    
    len_subsample = int(np.sqrt(len_sample))
    subsample = []
    
    random.shuffle(sample_indexes)
    for _ in range(len_subsample):
        subsample.append(sample_indexes.pop())
        
    return subsample

In [3]:
class Node:
    
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index
        self.t = t
        self.true_branch = true_branch
        self.false_branch = false_branch

In [4]:
class Leaf:
    
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        self.prediction = self.predict()
        
    def predict(self):
        classes = {}
        for label in self.labels:
            if label not in classes:
                classes[label] = 0
            classes[label] += 1    
        prediction = max(classes, key=classes.get)
        return prediction

In [5]:
def gini(labels):
    classes = {}
    for label in labels:
        if label not in classes:
            classes[label] = 0
        classes[label] += 1
    impurity = 1
    for label in classes:
        p = classes[label] / len(labels)
        impurity -= p ** 2
        
    return impurity

In [6]:
def quality(left_labels, right_labels, current_gini):
    p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])  
    return current_gini - p * gini(left_labels) - (1 - p) * gini(right_labels)

In [7]:
def split(data, labels, index, t):
    
    left = np.where(data[:, index] <= t)
    right = np.where(data[:, index] > t)
        
    true_data = data[left]
    false_data = data[right]
    true_labels = labels[left]
    false_labels = labels[right]
        
    return true_data, false_data, true_labels, false_labels

In [8]:
def find_best_split(data, labels):    
    min_leaf = 15

    current_gini = gini(labels)

    best_quality = 0
    best_t = None
    best_index = None
    
    n_features = data.shape[1]
    
    for index in range(n_features):
        t_values = [row[index] for row in data]
        
        for t in t_values:
            true_data, false_data, true_labels, false_labels = split(data, labels, index, t)
            if len(true_data) < min_leaf or len(false_data) < min_leaf:
                continue
            
            current_quality = quality(true_labels, false_labels, current_gini)
            
            if current_quality > best_quality:
                best_quality, best_t, best_index = current_quality, t, index

    return best_quality, best_t, best_index

In [9]:
def build_tree(data, labels):

    quality, t, index = find_best_split(data, labels)

    if quality == 0:
        return Leaf(data, labels)

    true_data, false_data, true_labels, false_labels = split(data, labels, index, t)

    true_branch = build_tree(true_data, true_labels)
    false_branch = build_tree(false_data, false_labels)

    return Node(index, t, true_branch, false_branch)

In [10]:
def random_forest(data, labels, n_trees):
    forest = []
    bootstrap = get_bootstrap(data, labels, n_trees)
    
    for b_data, b_labels in bootstrap:
        forest.append(build_tree(b_data, b_labels))
        
    return forest

In [11]:
def classify_object(obj, node):

    if isinstance(node, Leaf):
        answer = node.prediction
        return answer

    if obj[node.index] <= node.t:
        return classify_object(obj, node.true_branch)
    else:
        return classify_object(obj, node.false_branch)

In [12]:
def predict(data, tree):
    classes = []
    for obj in data.iterrows(): # for obj in data:
        prediction = classify_object(obj[1], tree)
        classes.append(prediction)
    return classes

In [13]:
def tree_vote(forest, data):

    predictions = []
    for tree in forest:
        predictions.append(predict(data, tree))
    
    predictions_per_object = list(zip(*predictions))
    
    voted_predictions = []
    for obj in predictions_per_object:
        voted_predictions.append(max(set(obj), key=obj.count))
        
    return voted_predictions

In [14]:
def custom_StandardScaler(x):
    res = (x - x.mean()) / x.std()
    return res

### iii. Обработка тренировочных данных

1. Откроем csv с тренировочным датасетом

In [15]:
train_dataset = pd.read_csv('./data/train.csv', sep=',')
train_dataset.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0


2. Проверим корректность типов данных

In [16]:
train_dataset.dtypes

Id                       int64
age                    float64
years_of_experience    float64
lesson_price           float64
qualification          float64
physics                float64
chemistry              float64
biology                float64
english                float64
geography              float64
history                float64
mean_exam_points       float64
choose                   int64
dtype: object

Типы в норме

3. Проверим датасет на наличие выбросов

In [17]:
train_dataset.describe()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4999.5,45.8009,1.9748,1702.44,1.7243,0.3706,0.1215,0.1172,0.0591,0.0277,0.018,64.4352,0.1109
std,2886.89568,8.030274,1.766883,523.789062,0.798845,0.48299,0.326724,0.321675,0.235824,0.16412,0.132958,13.595024,0.314024
min,0.0,23.0,0.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0
25%,2499.75,40.0,0.0,1300.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0
50%,4999.5,46.0,2.0,1550.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,0.0
75%,7499.25,51.0,3.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0
max,9999.0,68.0,9.0,3950.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,1.0


Выбросов и потерь данных нет

4. Отбросим из датасета целевой признак и столбец ID

In [18]:
data = train_dataset.drop(['Id' , 'choose'], axis=1) 
data

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0
1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0
2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0
3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0
4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0
5,37.0,3.0,1050.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0
6,54.0,3.0,800.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,52.0
7,32.0,2.0,2750.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,95.0
8,56.0,3.0,1300.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0
9,44.0,4.0,2350.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0


5. Стандартизируем полученный датасет

In [19]:
data = custom_StandardScaler(data)
data

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,-1.345023,-1.117674,0.854466,0.345123,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,0.703552
1,0.771966,0.014262,-0.863783,0.345123,1.303134,-0.371874,2.744389,-0.250611,-0.168779,7.385804,-0.546906
2,-2.092195,0.580231,0.090800,-0.906684,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,0.115101
3,-1.594080,0.580231,-1.245616,-0.906684,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,0.115101
4,0.024794,0.580231,1.045383,0.345123,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,0.629995
5,-1.095965,0.580231,-1.245616,0.345123,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,-0.767575
6,1.021024,0.580231,-1.722907,-0.906684,-0.767304,2.688816,-0.364343,-0.250611,-0.168779,-0.135381,-0.914688
7,-1.718609,0.014262,1.999965,1.596931,1.303134,-0.371874,2.744389,-0.250611,-0.168779,-0.135381,2.248234
8,1.270081,0.580231,-0.768325,0.345123,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,-0.473350
9,-0.224264,1.146199,1.236299,0.345123,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,0.482883


6. Определим целевой столбец

In [20]:
labels = train_dataset['choose']

7. Разобьем датасет на тестовый и на тренировочный части

In [21]:
task_train_data, task_test_data, task_train_labels, task_test_labels = tts(data, 
                                                                                     labels, 
                                                                                     test_size = 0.3,
                                                                                     random_state = 1)


В ходе разбития поломались индесы. Исправим их.

In [22]:
task_train_data = task_train_data.reset_index(drop=True)
task_train_labels = task_train_labels.reset_index(drop=True)

8. Подытожим результат

In [23]:
task_train_data

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,1.270081,-0.551706,-0.386491,-0.906684,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,0.556439
1,-0.722379,0.580231,1.713591,1.596931,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,1.733340
2,0.273851,-1.117674,-0.863783,-0.906684,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,-0.473350
3,-0.722379,0.014262,-0.291033,-0.906684,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,-1.870920
4,2.017254,-0.551706,-0.959241,-0.906684,-0.767304,2.688816,-0.364343,-0.250611,-0.168779,-0.135381,-0.841131
5,2.141782,-1.117674,0.090800,0.345123,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,0.262214
6,0.149322,0.580231,-0.100117,-0.906684,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,-0.473350
7,-0.099735,1.146199,2.668173,1.596931,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,1.954009
8,0.149322,-0.551706,1.236299,1.596931,1.303134,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,1.586227
9,-0.348793,-0.551706,-0.672866,-0.906684,-0.767304,-0.371874,-0.364343,-0.250611,-0.168779,-0.135381,-0.546906


In [25]:
task_train_labels

0       0
1       0
2       0
3       0
4       0
       ..
6995    0
6996    0
6997    0
6998    0
6999    0
Name: choose, Length: 7000, dtype: int64

### iv. Расчет модели

1. Проведем обучение модели на частях тренировочных данных

In [30]:
%%time
import random
n_trees = 1
my_forest_1 = random_forest(task_train_data, task_train_labels,n_trees)

Wall time: 11min 8s


In [84]:
task_test_answers_1 = tree_vote(my_forest_1, task_test_data)

2. Обратимся к благам цивилизации, и проверим свою модель по roc-auc метрике

In [85]:
from sklearn.metrics import roc_auc_score as rocauc

In [86]:
auc1 = rocauc(task_test_labels, task_test_answers_1)
auc1

0.6116175306032032

### v. Расчет на тестовом датасете

1. Импортируем тестовый датасет

In [71]:
test_dataset = pd.read_csv('./data/test.csv', sep=',')
test_dataset.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,10000,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,10001,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,10002,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,10003,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,10004,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0


2. Отдельно составим Series с ID-шниками

In [77]:
ID = test_dataset['Id']
ID

0       10000
1       10001
2       10002
3       10003
4       10004
        ...  
9995    19995
9996    19996
9997    19997
9998    19998
9999    19999
Name: Id, Length: 10000, dtype: int64

3. Подготавливаем датасет к обработке

In [74]:
test_dataset = test_dataset.drop(['Id'], axis=1)
test_dataset.head()

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0


In [75]:
test_data = custom_StandardScaler(test_dataset)
test_data

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,-1.733633,0.008074,1.900372,1.643400,-0.769773,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,1.904968
1,-1.360126,2.266408,0.190191,0.377006,1.298954,2.608778,-0.361874,-0.226979,-0.173422,-0.123814,0.504831
2,-0.239605,0.008074,-0.949930,-0.889389,-0.769773,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,-1.411147
3,-0.239605,1.137241,2.375422,1.643400,1.298954,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,2.052351
4,-0.986619,0.572657,-0.569889,0.377006,1.298954,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,-0.453158
5,1.005419,1.137241,0.665241,1.643400,1.298954,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,2.199734
6,-0.737614,-1.121094,-0.569889,-0.889389,-0.769773,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,-0.305775
7,-2.107140,-1.121094,-1.044940,-0.889389,1.298954,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,-0.305775
8,-0.239605,2.266408,2.280412,0.377006,-0.769773,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,0.283756
9,-0.115102,1.701824,-0.759909,-0.889389,1.298954,-0.383283,-0.361874,-0.226979,-0.173422,-0.123814,0.357448


4. Получаем результат

In [76]:
task_test_answers_1 = tree_vote(my_forest_1, test_data)

### 6. Экспорт результатов в файл

1. Формируем датафрейм из ID-шников и результатов расчета

In [82]:
result = pd.DataFrame(data={'Id' : ID, 'choose' : np.array(task_test_answers_1, dtype=int)})
result

Unnamed: 0,Id,choose
0,10000,0
1,10001,0
2,10002,0
3,10003,0
4,10004,0
5,10005,1
6,10006,0
7,10007,0
8,10008,0
9,10009,1


2. Формируем итоговую csv

In [83]:
result.to_csv('results.csv', index=False)