# Import

In [1]:
# Basic
import pandas as pd
import numpy as np

# ML Toolkit
from robusta.crossval import *

%load_ext memory_profiler

# Binary Classification

## Data

In [2]:
from catboost.datasets import amazon

# Fetch the Amazon train/test frames through catboost's dataset helper.
X_train, X_test = amazon()

# Split the 'ACTION' target column off the training features in one step.
y_train = X_train.pop('ACTION')

# Key both frames by 'id': the train index is simply named 'id',
# while the test frame promotes its existing 'id' column to the index.
X_train.index.name = 'id'
X_test = X_test.set_index('id')

X_train

Unnamed: 0_level_0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,42680,5905,117929,117930,119569,119323,123932,19793,119325
...,...,...,...,...,...,...,...,...,...
32764,23497,16971,117961,118300,119993,118321,240983,290919,118322
32765,25139,311198,91261,118026,122392,121143,173805,249618,121145
32766,34924,28805,117961,118327,120299,124922,152038,118612,124924
32767,80574,55643,118256,118257,117945,280788,280788,292795,119082


## Task

In [3]:
# Task settings shared by the crossval_predict calls in this section.
cv = 5                # passed as the `cv` argument; the fold logs report folds 0-4
scoring = 'roc_auc'   # metric name passed as `scoring`

## Model

In [4]:
from lightgbm import LGBMClassifier

# LightGBM classifier built with all constructor defaults; this single
# instance is handed to every crossval_predict call in the section.
model = LGBMClassifier()

## Predict Probability

### Averaging (used by default)

In [5]:
# Cross-validated class probabilities for X_test. No `avg_type` is passed,
# so the library's default way of combining per-fold predictions is used.
# The first returned value is not needed here and is discarded.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    verbose=2,
    n_jobs=-1,
)

y_pred

[07:23:19]  LGBMClassifier

[07:23:21]  FOLD  0:   0.8288
[07:23:21]  FOLD  1:   0.8555
[07:23:21]  FOLD  2:   0.8260
[07:23:21]  FOLD  3:   0.8286
[07:23:21]  FOLD  4:   0.8533

[07:23:22]  AVERAGE:   [33m0.8384[0m ± 0.0131



id
1        0.916153
2        0.971869
3        0.971752
4        0.968436
5        0.986602
           ...   
58917    0.971509
58918    0.923720
58919    0.942652
58920    0.955126
58921    0.986097
Name: ACTION, Length: 58921, dtype: float64

In [6]:
# Same call as with the defaults, but with avg_type='auto' spelled out.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    avg_type='auto',
    verbose=2,
    n_jobs=-1,
)

y_pred  # identical to the result obtained with avg_type left unset

[07:23:22]  LGBMClassifier

[07:23:24]  FOLD  0:   0.8288
[07:23:24]  FOLD  1:   0.8555
[07:23:24]  FOLD  2:   0.8260
[07:23:24]  FOLD  3:   0.8286
[07:23:24]  FOLD  4:   0.8533

[07:23:24]  AVERAGE:   [33m0.8384[0m ± 0.0131



id
1        0.916153
2        0.971869
3        0.971752
4        0.968436
5        0.986602
           ...   
58917    0.971509
58918    0.923720
58919    0.942652
58920    0.955126
58921    0.986097
Name: ACTION, Length: 58921, dtype: float64

In [7]:
# Explicitly request avg_type='mean' for combining the per-fold probabilities.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    avg_type='mean',
    verbose=2,
    n_jobs=-1,
)

y_pred  # identical to the default and 'auto' results

[07:23:24]  LGBMClassifier

[07:23:27]  FOLD  0:   0.8288
[07:23:27]  FOLD  1:   0.8555
[07:23:27]  FOLD  2:   0.8260
[07:23:27]  FOLD  3:   0.8286
[07:23:27]  FOLD  4:   0.8533

[07:23:27]  AVERAGE:   [33m0.8384[0m ± 0.0131



id
1        0.916153
2        0.971869
3        0.971752
4        0.968436
5        0.986602
           ...   
58917    0.971509
58918    0.923720
58919    0.942652
58920    0.955126
58921    0.986097
Name: ACTION, Length: 58921, dtype: float64

### Rank Averaging

In [8]:
# avg_type='rank' switches to rank averaging; the fold scores are unchanged,
# but the returned values differ from the plain-averaged probabilities.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    method='predict_proba',
    avg_type='rank',
    verbose=2,
    n_jobs=-1,
)

y_pred

[07:23:27]  LGBMClassifier

[07:23:30]  FOLD  0:   0.8288
[07:23:30]  FOLD  1:   0.8555
[07:23:30]  FOLD  2:   0.8260
[07:23:30]  FOLD  3:   0.8286
[07:23:30]  FOLD  4:   0.8533

[07:23:30]  AVERAGE:   [33m0.8384[0m ± 0.0131



id
1        0.181859
2        0.544492
3        0.548884
4        0.507771
5        0.852270
           ...   
58917    0.534672
58918    0.229480
58919    0.271974
58920    0.357107
58921    0.846123
Name: ACTION, Length: 58921, dtype: float64

## Predict Classes

### Soft Vote (used by default)

In [9]:
# No `method` argument here, so class labels (not probabilities) come back.
# `avg_type` is also left unset, i.e. the default voting scheme applies.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    verbose=2,
    n_jobs=-1,
)

y_pred.value_counts()

[07:23:31]  LGBMClassifier

[07:23:33]  FOLD  0:   0.8288
[07:23:33]  FOLD  1:   0.8555
[07:23:33]  FOLD  2:   0.8260
[07:23:33]  FOLD  3:   0.8286
[07:23:33]  FOLD  4:   0.8533

[07:23:33]  AVERAGE:   [33m0.8384[0m ± 0.0131



1    58512
0      409
dtype: int64

In [10]:
# Class prediction with soft voting requested explicitly.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    avg_type='soft',
    verbose=2,
    n_jobs=-1,
)

y_pred.value_counts()  # same class counts as with avg_type left unset

[07:23:33]  LGBMClassifier

[07:23:36]  FOLD  0:   0.8288
[07:23:36]  FOLD  1:   0.8555
[07:23:36]  FOLD  2:   0.8260
[07:23:36]  FOLD  3:   0.8286
[07:23:36]  FOLD  4:   0.8533

[07:23:36]  AVERAGE:   [33m0.8384[0m ± 0.0131



1    58512
0      409
dtype: int64

In [11]:
# Class prediction with avg_type='auto' spelled out.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    avg_type='auto',
    verbose=2,
    n_jobs=-1,
)

y_pred.value_counts()  # same class counts as the default and 'soft' runs

[07:23:36]  LGBMClassifier

[07:23:39]  FOLD  0:   0.8288
[07:23:39]  FOLD  1:   0.8555
[07:23:39]  FOLD  2:   0.8260
[07:23:39]  FOLD  3:   0.8286
[07:23:39]  FOLD  4:   0.8533

[07:23:40]  AVERAGE:   [33m0.8384[0m ± 0.0131



1    58512
0      409
dtype: int64

### Hard Vote

In [12]:
# Class prediction with hard voting instead of the default soft voting.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    avg_type='hard',
    verbose=2,
    n_jobs=-1,
)

y_pred.value_counts()  # class counts differ from the soft-vote runs

[07:23:40]  LGBMClassifier

[07:23:43]  FOLD  0:   0.8288
[07:23:43]  FOLD  1:   0.8555
[07:23:43]  FOLD  2:   0.8260
[07:23:43]  FOLD  3:   0.8286
[07:23:43]  FOLD  4:   0.8533

[07:24:00]  AVERAGE:   [33m0.8384[0m ± 0.0131



1    58478
0      443
dtype: int64

# Regression

## Task

In [13]:
# Task settings for the regression example.
cv = 5           # passed as the `cv` argument; the fold logs report folds 0-4
scoring = 'r2'   # metric name passed as `scoring`

## Model

In [14]:
from lightgbm import LGBMRegressor

# LightGBM regressor built with all constructor defaults; rebinding `model`
# replaces the classifier used in the classification section.
model = LGBMRegressor()

In [15]:
# Cross-validated regression predictions for X_test, scored with `scoring`.
# NOTE(review): in the cells shown, y_train is still the binary 'ACTION'
# target from the classification data - confirm this reuse is intentional.
_, y_pred = crossval_predict(
    model, cv, X_train, y_train,
    X_new=X_test,
    scoring=scoring,
    verbose=2,
    n_jobs=-1,
)

y_pred

[07:24:00]  LGBMRegressor

[07:24:03]  FOLD  0:   0.1796
[07:24:03]  FOLD  1:   0.1956
[07:24:03]  FOLD  2:   0.1625
[07:24:03]  FOLD  3:   0.1684
[07:24:03]  FOLD  4:   0.1985

[07:24:03]  AVERAGE:   [33m0.1809[0m ± 0.0143



id
1        0.878580
2        0.968145
3        0.963126
4        0.969735
5        0.979322
           ...   
58917    0.962907
58918    0.917970
58919    0.941375
58920    0.955532
58921    0.985731
Name: ACTION, Length: 58921, dtype: float64