# Построение модели и предсказаний

In [0]:
#!pip install imblearn
#!pip install xgboost

In [10]:
import pandas as pd
import numpy as np

%pylab inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier

Populating the interactive namespace from numpy and matplotlib


In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Дообработка обучающей выборки

In [0]:
reaction_df = pd.read_csv('/content/drive/My Drive/tinkoff_data/refined_data/final_train.csv')
reaction_test = pd.read_csv('/content/drive/My Drive/tinkoff_data/refined_data/final_test.csv')

In [14]:
reaction_test.isna().sum()

age                0
children_cnt       0
children_cnt_na    0
g_F                0
g_M                0
                  ..
wednesday          0
thursday           0
friday             0
saturday           0
sunday             0
Length: 93, dtype: int64

In [0]:
X, y = reaction_df.drop(columns=['event']), reaction_df['event']

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.2)

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train, y_train)

X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled = pd.DataFrame(y_resampled, columns=['target'])['target']
X_train, y_train = X_resampled, y_resampled

## Функции для моделирования

In [0]:
#Простая кросс-валидация
def test_modelling(Model):
    scores = cross_val_score(Model, X, y, cv=3)
    print(np.mean(scores))

In [0]:
#Переход от классификации реакции к оценке
def transform_results(predictions, alpha, is_rough=False): 
    is_good = predictions[:,1] + predictions[:,3] #Вероятность, что лайк + вероятность, что посмотрят
    is_bad = predictions[:,0] + predictions[:,2] #Вероятность, что дизлайк + вероятность, что пропустят
    result = is_good - is_bad 
    if is_rough:
        result[result == 0] = -1
    result[result > alpha] = 1
    result[result < -alpha] = -1
    return result

In [0]:
#Проверка с помощью оверсэмплинга на основе матрицы значений
def over_sampling_check(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_sample(X_train, y_train)

    X_resampled = pd.DataFrame(X_resampled, columns=X_train.columns)
    y_resampled = pd.DataFrame(y_resampled, columns=['target'])['target']
    X_train, y_train = X_resampled, y_resampled
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    m = np.array(confusion_matrix(y_test, predictions, labels=['like', 'view', 'skip', 'dislike']))
    print(m)
    return (m[0:2, 0:2].sum() + m[2:4, 2:4].sum())/m.sum()

In [0]:
#Словарь значений, которые использовали организаторы при оценке
label_to_vc_value = {'dislike':-10, 'like':0.5, 'skip':-0.1, 'view':0.1}

#Проверка работы алгоритма при разных способах перехода от классификации реакции к бинарной классификации
def testing_model_by_vc_grader(model, X_train, y_train, X_valid, y_valid,
                               alpha=0.2, beta=1, transforming='classic'):
    y_true = np.array([label_to_vc_value[val] for val in y_valid])
    model.fit(X_train, y_train)
    predictions = model.predict_proba(X_valid)
    result = (predictions[:,1]*beta + predictions[:,3]) - (predictions[:,0]*beta + predictions[:,2])
    if transforming == 'classic':
        result[result >= 0] = 1
        result[result <= 0] = -1
    elif transforming == 'thresh':
        result[result >= alpha] = 1
        result[result <= -alpha] = -1
    elif transforming == 'multiple':
        results = []
        for value in alpha:
            result = (predictions[:,1]*beta + predictions[:,3]) - (predictions[:,0]*beta + predictions[:,2])
            result[result >= value] = 1
            result[result <= -value] = -1
            results.append([value, sum(y_true*result)/sum(abs(y_true))])
        return results
    elif transforming == 'multiple_thresh':
        results = []
        for value in alpha:
            result = (predictions[:,1]*beta + predictions[:,3]) - (predictions[:,0]*beta + predictions[:,2])
            result[result >= value] = 1
            result[result <= value] = -1
            results.append([value, sum(y_true*result)/sum(abs(y_true))])
        return results
    return sum(y_true*result)/sum(abs(y_true))

In [0]:
label_to_value = {'dislike':-1, 'like':1, 'skip':-1, 'view':1}
#Вспомогательная функция для создания пользовательского scorer'а

def score_func_tovalue(y, y_pred, transforming='classic', alpha=0, beta=1):
    y_true = np.array([label_to_value[val] for val in y])
    result = (y_pred[:,1]*beta + y_pred[:,3]) - (y_pred[:,0]*beta + y_pred[:,2])
    result[result >= alpha] = 1
    result[result <= -alpha] = -1
    return mean_squared_error(y_true, result)

In [0]:
#Пользовательские scorer'ы

new_scorer = make_scorer(score_func_tovalue, needs_proba=True)
new_scorer2 = make_scorer(score_func_tovalue, needs_proba=True, greater_is_better=False, transforming='thresh')

## Моделирование

In [0]:
#Подбор гиперпараметров на XGBoost

params = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)],
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
}
model = XGBClassifier(learning_rate=0.1, max_depth=6, min_child_weight=4, gamma=0.1, n_estimators=400, kvargs={'tree_method':'gpu_hist'})
clf = RandomizedSearchCV(model, params, n_iter=5, cv=2, scoring=new_scorer2, verbose=3)
clf.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] subsample=0.85, reg_alpha=0.01, colsample_bytree=0.8 ............
[CV]  subsample=0.85, reg_alpha=0.01, colsample_bytree=0.8, score=-1.114, total=34.9min
[CV] subsample=0.85, reg_alpha=0.01, colsample_bytree=0.8 ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 34.9min remaining:    0.0s


[CV]  subsample=0.85, reg_alpha=0.01, colsample_bytree=0.8, score=-1.092, total=35.2min
[CV] subsample=0.85, reg_alpha=0.001, colsample_bytree=0.8 ...........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 70.1min remaining:    0.0s


[CV]  subsample=0.85, reg_alpha=0.001, colsample_bytree=0.8, score=-1.114, total=35.5min
[CV] subsample=0.85, reg_alpha=0.001, colsample_bytree=0.8 ...........
[CV]  subsample=0.85, reg_alpha=0.001, colsample_bytree=0.8, score=-1.085, total=36.3min
[CV] subsample=0.85, reg_alpha=0.001, colsample_bytree=0.85 ..........
[CV]  subsample=0.85, reg_alpha=0.001, colsample_bytree=0.85, score=-1.112, total=37.6min
[CV] subsample=0.85, reg_alpha=0.001, colsample_bytree=0.85 ..........
[CV]  subsample=0.85, reg_alpha=0.001, colsample_bytree=0.85, score=-1.085, total=36.8min
[CV] subsample=0.8, reg_alpha=0.005, colsample_bytree=0.85 ...........
[CV]  subsample=0.8, reg_alpha=0.005, colsample_bytree=0.85, score=-1.102, total=37.7min
[CV] subsample=0.8, reg_alpha=0.005, colsample_bytree=0.85 ...........
[CV]  subsample=0.8, reg_alpha=0.005, colsample_bytree=0.85, score=-1.085, total=37.8min
[CV] subsample=0.85, reg_alpha=0.05, colsample_bytree=0.8 ............
[CV]  subsample=0.85, reg_alpha=0.05, 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 361.6min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0.1,
                                           kvargs={'tree_method': 'gpu_hist'},
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=6, min_child_weight=4,
                                           missing=None, n_estimators=400,
                                           n_jobs=1, nthread=None,
                                           objective='binary:...
                                           verbosity=1),
                   iid='warn', n_iter=5, n_jobs=None,
                   param_distributions={'colsample_bytree': [0.75, 0.8, 0.85],
                                

In [0]:
(clf.best_params_, clf.best_score_)

({'colsample_bytree': 0.85, 'reg_alpha': 0.005, 'subsample': 0.8},
 -1.0932464045586627)

In [0]:
n_estimators = [135, 140, 145, 150]
max_depth = [36, 38, 40, 42]
min_samples_split = [3, 5]
min_samples_leaf = [2, 5]
hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

In [0]:
#Подбор гиперпараметров на RandomForest

model = RandomForestClassifier()
clf = RandomizedSearchCV(model, hyperF, cv=3, n_iter=10, scoring=new_scorer2, verbose=3)
clf.fit(X_train, y_train)

(clf.best_params_, clf.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] n_estimators=150, min_samples_split=3, min_samples_leaf=5, max_depth=36 
[CV]  n_estimators=150, min_samples_split=3, min_samples_leaf=5, max_depth=36, score=-0.706, total= 8.2min
[CV] n_estimators=150, min_samples_split=3, min_samples_leaf=5, max_depth=36 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.2min remaining:    0.0s


[CV]  n_estimators=150, min_samples_split=3, min_samples_leaf=5, max_depth=36, score=-0.701, total= 8.1min
[CV] n_estimators=150, min_samples_split=3, min_samples_leaf=5, max_depth=36 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 16.3min remaining:    0.0s


[CV]  n_estimators=150, min_samples_split=3, min_samples_leaf=5, max_depth=36, score=-0.601, total= 7.5min
[CV] n_estimators=140, min_samples_split=3, min_samples_leaf=2, max_depth=40 
[CV]  n_estimators=140, min_samples_split=3, min_samples_leaf=2, max_depth=40, score=-0.605, total= 7.3min
[CV] n_estimators=140, min_samples_split=3, min_samples_leaf=2, max_depth=40 
[CV]  n_estimators=140, min_samples_split=3, min_samples_leaf=2, max_depth=40, score=-0.598, total= 7.1min
[CV] n_estimators=140, min_samples_split=3, min_samples_leaf=2, max_depth=40 
[CV]  n_estimators=140, min_samples_split=3, min_samples_leaf=2, max_depth=40, score=-0.427, total= 7.3min
[CV] n_estimators=135, min_samples_split=3, min_samples_leaf=5, max_depth=42 
[CV]  n_estimators=135, min_samples_split=3, min_samples_leaf=5, max_depth=42, score=-0.699, total= 6.6min
[CV] n_estimators=135, min_samples_split=3, min_samples_leaf=5, max_depth=42 
[CV]  n_estimators=135, min_samples_split=3, min_samples_leaf=5, max_depth=

Параметры BaggingClassifier были подобраны аналогично RandomForestClassifier.

In [0]:
#Объединение предыдущих трёх моделей в VotingClassifier

model1 = BaggingClassifier(DecisionTreeClassifier(max_depth=40), n_estimators=120, max_samples=200000, max_features=0.6)
model2 = XGBClassifier(learning_rate=0.005, gamma=0.1, n_estimators=400, max_depth=6, min_child_weight=4, subsample=0.8, colsample_bytree=0.85, kvargs={'tree_method':'gpu_hist'})
model3 = RandomForestClassifier(n_estimators=150, min_samples_split=3, min_samples_leaf=2, max_depth=42)
model4 = VotingClassifier(estimators=[('rf', model3), 
                                       ('gb', model2),
                                     ('bc', model1)], voting='soft', weights=[1, 1, 1])

In [19]:
#Проверка работы

%%time
testing_model_by_vc_grader(model4,
                           X_train, y_train, X_valid, y_valid, 
                           alpha=np.arange(0,1,0.1), transforming='multiple')

CPU times: user 1h 55min 43s, sys: 5.27 s, total: 1h 55min 48s
Wall time: 1h 55min 54s


[[0.0, 0.2602035579532531],
 [0.1, 0.2625371986525398],
 [0.2, 0.24480689378734768],
 [0.30000000000000004, 0.22262944576608315],
 [0.4, 0.19419545881097047],
 [0.5, 0.17460195323910171],
 [0.6000000000000001, 0.1546384462358769],
 [0.7000000000000001, 0.14044802968341943],
 [0.8, 0.13361943033579157],
 [0.9, 0.130997637771487]]

## Сборка ответа

In [20]:
%%time
model4.fit(X, y)

CPU times: user 1h 1min 59s, sys: 10.1 s, total: 1h 2min 9s
Wall time: 1h 2min 14s


VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=42,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=2,
                                                     min_samples_split=3,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=150,
                                                     n_jobs=None,
            

In [29]:
reaction_test = reaction_test.drop(columns=['answer_id'])
predictions = model4.predict_proba(reaction_test)
predictions

array([[0.0036697 , 0.19037236, 0.24773926, 0.55821866],
       [0.00905989, 0.18622493, 0.16862265, 0.63609252],
       [0.0027843 , 0.15074268, 0.26865843, 0.57781458],
       ...,
       [0.00696839, 0.0332706 , 0.57441983, 0.38534118],
       [0.00631807, 0.04526558, 0.53827998, 0.41013637],
       [0.00670679, 0.08561341, 0.42737048, 0.48030933]])

In [0]:
def transform_results(predictions): 
    is_good = predictions[:,1] + predictions[:,3]
    is_bad = predictions[:,0] + predictions[:,2]
    result = is_good - is_bad
    result[result >= 0.2] = 1
    result[result <= 0] = -1
    return result

In [31]:
transformed = transform_results(predictions)
transformed

array([ 1.        ,  1.        ,  1.        , ..., -1.        ,
       -1.        ,  0.13184547])

In [32]:
df_transformed = pd.DataFrame(transformed)
df_transformed.columns=['score']
df_transformed.index.name = 'answer_id'
df_transformed.head()

Unnamed: 0_level_0,score
answer_id,Unnamed: 1_level_1
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [0]:
df_transformed.to_csv("submission.csv", sep=',', index=True)