In [128]:
import pandas as pd
import numpy as np
import luigi
import warnings
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from datetime import datetime, date, time, timedelta
from sklearn import set_config

from functions import reduce_mem_usage

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix
from sklearn.metrics import precision_recall_curve, roc_curve, auc

from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn #, Pipeline
from collections import Counter


import time

In [2]:
%matplotlib inline

warnings.simplefilter("ignore")
pd.set_option("display.max_columns", 999)

In [3]:
RANDOM_STATE = 42

In [4]:
TRAIN_PATH = "data/data_train.csv"
TEST_PATH = "data/data_test.csv"
FEATURES_PATH = "data/features.csv"

data_train = pd.read_csv(TRAIN_PATH)
data_test = pd.read_csv(TEST_PATH)

In [5]:
data_train = reduce_mem_usage(data_train)
data_test = reduce_mem_usage(data_test)

Memory usage of the dataframe is 31.73 MB
Memory usage after optimization is: 15.86 MB
Decreased by 50.0%
Memory usage of the dataframe is 2.17 MB
Memory usage after optimization is: 1.09 MB
Decreased by 50.0%


In [6]:
def process_featues(data_features, train, test):
    ids = np.unique(train['id'].append(test['id']))
    # Возьмем id только тех пользователей, что встречаются в трейн и тест выборках.
    data_features = data_features[data_features['id'].isin(ids)]
    # Дропнем Unnamed
    if 'Unnamed: 0' in data_features:
        data_features = data_features.drop(columns='Unnamed: 0')

    data_features = data_features.compute()
    # удалим признаки с единственным значением
    df_nunique = data_features.apply(lambda x: x.nunique(dropna=False))
    const = df_nunique[df_nunique ==1].index.tolist()
    data_features = data_features.drop(columns = const)
    # функция сжатия данных
    data_features = reduce_mem_usage(data_features)
    return data_features

In [7]:
import dask.dataframe as dd

data_features = dd.read_csv(FEATURES_PATH, sep ='\t')

data_features =process_featues(data_features, data_train, data_test)

Memory usage of the dataframe is 1728.06 MB
Memory usage after optimization is: 867.47 MB
Decreased by 49.8%


In [129]:
train = data_train.merge(data_features, on='id', how = 'left')
valid = data_test.merge(data_features, on='id', how = 'left')
del train['Unnamed: 0']
del valid['Unnamed: 0']

## data pipe

In [130]:
# TRAIN_PATH = "data/train_merge.csv"
# TEST_PATH = "data/test_merge.csv"

In [131]:
# train = reduce_mem_usage(pd.read_csv(TRAIN_PATH))
# test = reduce_mem_usage(pd.read_csv(TEST_PATH))

In [132]:
features = [f for f in data_features.columns if f not in ['buy_time','id']]

Разделим данные на X и y

In [133]:
X_train = train.drop('target', axis='columns')
y_train = train.target
X_valid = valid

In [134]:
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Сохраним необходимые для ответа признаки в отдельную переменную, чтобы позже добавить target.

In [135]:
answers_test = X_valid[['id', 'vas_id']]
answers_test['buy_time'] = X_valid['buy_time_x']
answers_test.shape

(73666, 3)

Разделим признаки на бинарные, категориальные и вещественные.

In [136]:
boolean_features = []
categorical_features = []
numeric_features = []
for col in X_train[features].fillna(0):
    val_count = len(X_train[col].unique())
    if val_count == 2:
        boolean_features.append(col)
    elif val_count <= 10:
        categorical_features.append(col)
    else:
        numeric_features.append(col)


Селектор колонок

In [137]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

In [138]:
new_features_list = ['interval']
class FeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)

        try:
            if 'interval' in self.features_list:
                X['interval'] = X['buy_time_y'] - X['buy_time_x']

            return X
        except KeyError:            
            raise KeyError("DataFrame не содердит колонки buy_time")

Обработчик вещественных признаков
Поскольку вещественных признаков много больше чем предполагается оставить в конце, и чем остальных признаков, откинем менее значимые из них до объединения с другими признаками.

In [139]:
num_pipe = Pipeline([
    ('ncs', ColumnSelector(columns=numeric_features)),
    ('nsi', SimpleImputer(strategy="mean")),
    ('nss', StandardScaler()),
    ('nskb', SelectKBest(k=128, score_func=f_classif)),
])

Обработчик категориальных признаков

In [140]:
cat_pipe = Pipeline([
    ('ccs', ColumnSelector(columns=categorical_features)),
    ('csi', SimpleImputer(strategy="most_frequent")),
    ('coe', OneHotEncoder(handle_unknown='ignore')),
])

Обработчик булевых признаков

In [141]:
bool_pipe = Pipeline([
    ('bcs', ColumnSelector(columns=boolean_features)),
    ('bsi', SimpleImputer(strategy="most_frequent")),
])

Собираем в общий пайплайн

In [142]:
transformer_list = [('num_pipe', num_pipe), ('cat_pipe', cat_pipe), ('bool_pipe', bool_pipe)]

In [143]:
transform_pipe = Pipeline([
    ('cs', ColumnSelector(columns=features)),
    ('fg', FeaturesGenerator(features_list=['interval'])),
    ('fu', FeatureUnion(transformer_list=transformer_list)),
])


Отберем признаки с помощью SelectKBest и логистической регрессии с регуляризацией L1 (было 298 признаков)


In [144]:
fs_pipe = make_pipeline(
    transform_pipe,
    SelectKBest(k=64, score_func=f_classif),
    SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
)

In [145]:
set_config(display='diagram')

fs_pipe

In [146]:
fs_pipe.fit(X_train, y_train)


In [147]:
# fs_pipe.transform(X_test).shape[1]

In [148]:
# joblib.dump(fs_pipe, 'models/data_pipeline.pkl', compress=9)

In [149]:
# pipe_clone = joblib.load('models/data_pipeline.pkl')

In [150]:
# type(pipe_clone)

In [151]:
# if X_test:
#     X_test = fs_pipe.transform(X_test)

In [152]:
X_train = fs_pipe.transform(X_train)

In [153]:
X_valid = fs_pipe.transform(X_valid)

In [154]:
# type(X_train_transform)

In [155]:
# X_train_transform.shape

In [156]:
# joblib.dump(X_train_transform, 'data/changes/X_train_transform.pkl', compress=9)
# joblib.dump(X_test_transform, 'data/changes/X_test_transform.pkl', compress=9)
# joblib.dump(X_valid_transform, 'data/changes/X_valid_transform.pkl', compress=9)

# joblib.dump(y_train, 'data/changes/y_train.pkl', compress=9)
# joblib.dump(y_test, 'data/changes/y_test.pkl', compress=9)

## Model

Соберем финальный пайплайн.


In [157]:
# model_pipe = Pipeline([
#     ('fs_pipe', fs_pipe),
#     ('ros', RandomOverSampler(random_state=RANDOM_STATE)),
#     ('cbc', CatBoostClassifier(random_state=RANDOM_STATE)),
# ])

In [159]:
# model_pipe = make_pipeline_imblearn(
#     transform_pipe,
#     SelectKBest(k=64, score_func=f_classif),
#     SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
#     RandomOverSampler(random_state=42),
#     CatBoostClassifier(random_state=RANDOM_STATE),
# )

In [None]:
# model_pipe.fit(X_train, y_train)

In [None]:
# y_valid = model_pipe.predict(X_valid)
# y_valid.shape

In [None]:
# def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred, model, X_test, digits=3):
#     print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred, digits=digits))
#     print('TEST\n\n' + classification_report(y_test_true, y_test_pred, digits=digits))
#     print('CONFUSION MATRIX\n')
#     print(pd.crosstab(y_test_true, y_test_pred))
#     plot_confusion_matrix(model, X_test, y_test_true, cmap=plt.cm.Blues, normalize='all')

In [None]:
# def test_model(X_train, y_train, model):
#     start = time.time()
#     model.fit(X_train, y_train)
#     y_train_pred = model.predict(X_train)
#     y_test_pred = model.predict(X_test)
#     y_test_pred_proba = model.predict_proba(X_test)
#     get_classification_report(y_train, y_train_pred, y_test, y_test_pred, model, X_test)
#     train_time = time.time() - start
#     print(f"train time = {train_time}")
#     return model, y_test, y_test_pred_proba, f1_score(y_test, y_test_pred, average='macro')

In [160]:
ros = RandomOverSampler(random_state=42)

X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [161]:
model = CatBoostClassifier(random_state=RANDOM_STATE)


In [164]:
model.fit(X_ros, y_ros)


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.240211
0:	learn: 0.5115008	total: 1.02s	remaining: 17m 1s
1:	learn: 0.4230841	total: 2.02s	remaining: 16m 46s
2:	learn: 0.3710476	total: 3.01s	remaining: 16m 41s
3:	learn: 0.3485202	total: 3.9s	remaining: 16m 10s
4:	learn: 0.3364511	total: 4.91s	remaining: 16m 16s
5:	learn: 0.3307360	total: 5.99s	remaining: 16m 31s
6:	learn: 0.3269573	total: 7.02s	remaining: 16m 36s
7:	learn: 0.3240631	total: 8.08s	remaining: 16m 42s
8:	learn: 0.3210473	total: 8.97s	remaining: 16m 28s
9:	learn: 0.3196100	total: 9.88s	remaining: 16m 18s
10:	learn: 0.3182116	total: 10.7s	remaining: 15m 58s
11:	learn: 0.3175304	total: 11.5s	remaining: 15m 50s
12:	learn: 0.3169771	total: 12.2s	remaining: 15m 28s
13:	learn: 0.3156702	total: 13.2s	remaining: 15m 27s
14:	learn: 0.3154386	total: 14.1s	remaining: 15m 24s
15:	learn: 0.3150561	total: 14.9s	remaining: 15m 15s
16:	learn: 0.3149295	total: 15.6s	remaining: 15m 2s
17:	learn: 0.3146132	total: 16.4s	remaining: 14m 56s
18:	learn: 0.3145004	total: 1

153:	learn: 0.3008157	total: 1m 51s	remaining: 10m 13s
154:	learn: 0.3007126	total: 1m 52s	remaining: 10m 13s
155:	learn: 0.3006147	total: 1m 53s	remaining: 10m 13s
156:	learn: 0.3005321	total: 1m 54s	remaining: 10m 12s
157:	learn: 0.3004260	total: 1m 55s	remaining: 10m 12s
158:	learn: 0.3003212	total: 1m 55s	remaining: 10m 12s
159:	learn: 0.3002463	total: 1m 56s	remaining: 10m 11s
160:	learn: 0.3001725	total: 1m 57s	remaining: 10m 10s
161:	learn: 0.3000837	total: 1m 57s	remaining: 10m 10s
162:	learn: 0.3000229	total: 1m 58s	remaining: 10m 9s
163:	learn: 0.2999461	total: 1m 59s	remaining: 10m 8s
164:	learn: 0.2998461	total: 2m	remaining: 10m 7s
165:	learn: 0.2997396	total: 2m	remaining: 10m 6s
166:	learn: 0.2996613	total: 2m 1s	remaining: 10m 5s
167:	learn: 0.2995911	total: 2m 2s	remaining: 10m 4s
168:	learn: 0.2995058	total: 2m 2s	remaining: 10m 3s
169:	learn: 0.2994371	total: 2m 3s	remaining: 10m 2s
170:	learn: 0.2993447	total: 2m 4s	remaining: 10m 1s
171:	learn: 0.2992599	total: 2m 

306:	learn: 0.2881151	total: 3m 45s	remaining: 8m 28s
307:	learn: 0.2880220	total: 3m 45s	remaining: 8m 27s
308:	learn: 0.2879540	total: 3m 46s	remaining: 8m 26s
309:	learn: 0.2878798	total: 3m 47s	remaining: 8m 26s
310:	learn: 0.2878070	total: 3m 48s	remaining: 8m 26s
311:	learn: 0.2877361	total: 3m 49s	remaining: 8m 25s
312:	learn: 0.2876290	total: 3m 50s	remaining: 8m 25s
313:	learn: 0.2875614	total: 3m 50s	remaining: 8m 24s
314:	learn: 0.2874693	total: 3m 51s	remaining: 8m 23s
315:	learn: 0.2874043	total: 3m 52s	remaining: 8m 22s
316:	learn: 0.2873231	total: 3m 52s	remaining: 8m 21s
317:	learn: 0.2872499	total: 3m 53s	remaining: 8m 20s
318:	learn: 0.2871562	total: 3m 54s	remaining: 8m 19s
319:	learn: 0.2870578	total: 3m 54s	remaining: 8m 18s
320:	learn: 0.2869664	total: 3m 55s	remaining: 8m 18s
321:	learn: 0.2868758	total: 3m 56s	remaining: 8m 17s
322:	learn: 0.2867950	total: 3m 57s	remaining: 8m 16s
323:	learn: 0.2867373	total: 3m 57s	remaining: 8m 16s
324:	learn: 0.2866817	total:

459:	learn: 0.2769297	total: 5m 29s	remaining: 6m 26s
460:	learn: 0.2768920	total: 5m 29s	remaining: 6m 25s
461:	learn: 0.2768239	total: 5m 30s	remaining: 6m 24s
462:	learn: 0.2767356	total: 5m 31s	remaining: 6m 24s
463:	learn: 0.2766566	total: 5m 32s	remaining: 6m 23s
464:	learn: 0.2765813	total: 5m 32s	remaining: 6m 22s
465:	learn: 0.2764832	total: 5m 33s	remaining: 6m 22s
466:	learn: 0.2764229	total: 5m 34s	remaining: 6m 21s
467:	learn: 0.2763482	total: 5m 34s	remaining: 6m 20s
468:	learn: 0.2762632	total: 5m 35s	remaining: 6m 20s
469:	learn: 0.2762065	total: 5m 36s	remaining: 6m 19s
470:	learn: 0.2761204	total: 5m 37s	remaining: 6m 18s
471:	learn: 0.2760493	total: 5m 37s	remaining: 6m 17s
472:	learn: 0.2760044	total: 5m 38s	remaining: 6m 16s
473:	learn: 0.2759561	total: 5m 39s	remaining: 6m 16s
474:	learn: 0.2758998	total: 5m 39s	remaining: 6m 15s
475:	learn: 0.2758500	total: 5m 40s	remaining: 6m 14s
476:	learn: 0.2757759	total: 5m 40s	remaining: 6m 13s
477:	learn: 0.2757040	total:

613:	learn: 0.2666899	total: 7m 15s	remaining: 4m 34s
614:	learn: 0.2666370	total: 7m 16s	remaining: 4m 33s
615:	learn: 0.2665801	total: 7m 17s	remaining: 4m 32s
616:	learn: 0.2665029	total: 7m 18s	remaining: 4m 31s
617:	learn: 0.2664651	total: 7m 18s	remaining: 4m 31s
618:	learn: 0.2664161	total: 7m 19s	remaining: 4m 30s
619:	learn: 0.2663644	total: 7m 20s	remaining: 4m 29s
620:	learn: 0.2663009	total: 7m 21s	remaining: 4m 29s
621:	learn: 0.2662210	total: 7m 21s	remaining: 4m 28s
622:	learn: 0.2661701	total: 7m 22s	remaining: 4m 27s
623:	learn: 0.2660753	total: 7m 23s	remaining: 4m 26s
624:	learn: 0.2660185	total: 7m 23s	remaining: 4m 26s
625:	learn: 0.2659555	total: 7m 24s	remaining: 4m 25s
626:	learn: 0.2658905	total: 7m 24s	remaining: 4m 24s
627:	learn: 0.2658410	total: 7m 25s	remaining: 4m 23s
628:	learn: 0.2657399	total: 7m 26s	remaining: 4m 23s
629:	learn: 0.2656811	total: 7m 26s	remaining: 4m 22s
630:	learn: 0.2655874	total: 7m 27s	remaining: 4m 21s
631:	learn: 0.2655068	total:

766:	learn: 0.2573779	total: 9m 2s	remaining: 2m 44s
767:	learn: 0.2573170	total: 9m 2s	remaining: 2m 44s
768:	learn: 0.2572591	total: 9m 3s	remaining: 2m 43s
769:	learn: 0.2572068	total: 9m 4s	remaining: 2m 42s
770:	learn: 0.2571600	total: 9m 5s	remaining: 2m 41s
771:	learn: 0.2571097	total: 9m 5s	remaining: 2m 41s
772:	learn: 0.2570530	total: 9m 6s	remaining: 2m 40s
773:	learn: 0.2569941	total: 9m 7s	remaining: 2m 39s
774:	learn: 0.2569028	total: 9m 7s	remaining: 2m 39s
775:	learn: 0.2568298	total: 9m 8s	remaining: 2m 38s
776:	learn: 0.2567800	total: 9m 9s	remaining: 2m 37s
777:	learn: 0.2567370	total: 9m 10s	remaining: 2m 36s
778:	learn: 0.2566712	total: 9m 10s	remaining: 2m 36s
779:	learn: 0.2566292	total: 9m 11s	remaining: 2m 35s
780:	learn: 0.2565714	total: 9m 11s	remaining: 2m 34s
781:	learn: 0.2565115	total: 9m 12s	remaining: 2m 34s
782:	learn: 0.2564777	total: 9m 13s	remaining: 2m 33s
783:	learn: 0.2564178	total: 9m 13s	remaining: 2m 32s
784:	learn: 0.2563737	total: 9m 14s	rem

918:	learn: 0.2487597	total: 11m 17s	remaining: 59.7s
919:	learn: 0.2487088	total: 11m 18s	remaining: 59s
920:	learn: 0.2486634	total: 11m 18s	remaining: 58.2s
921:	learn: 0.2486010	total: 11m 19s	remaining: 57.5s
922:	learn: 0.2485611	total: 11m 20s	remaining: 56.8s
923:	learn: 0.2485083	total: 11m 21s	remaining: 56s
924:	learn: 0.2484406	total: 11m 22s	remaining: 55.3s
925:	learn: 0.2483969	total: 11m 23s	remaining: 54.6s
926:	learn: 0.2483395	total: 11m 23s	remaining: 53.8s
927:	learn: 0.2482591	total: 11m 24s	remaining: 53.1s
928:	learn: 0.2481866	total: 11m 25s	remaining: 52.4s
929:	learn: 0.2481357	total: 11m 25s	remaining: 51.6s
930:	learn: 0.2480910	total: 11m 26s	remaining: 50.9s
931:	learn: 0.2480248	total: 11m 27s	remaining: 50.2s
932:	learn: 0.2479702	total: 11m 28s	remaining: 49.4s
933:	learn: 0.2478965	total: 11m 28s	remaining: 48.7s
934:	learn: 0.2478453	total: 11m 29s	remaining: 47.9s
935:	learn: 0.2478099	total: 11m 30s	remaining: 47.2s
936:	learn: 0.2477754	total: 11m

<catboost.core.CatBoostClassifier at 0x7f097eb8cb10>

In [None]:
# y_test_pred = model.predict(X_test)
# f1_score(y_test, y_test_pred, average='macro')

In [166]:
y_valid = model.predict(X_valid)

## Сохраняем answers_test.csv

In [167]:
answers_test['target'] = y_valid

In [170]:
answers_test.to_csv('answers_test.csv', index=False,)

In [171]:
answers_test_loaded = pd.read_csv('answers_test.csv')

In [173]:
answers_test_loaded.head()

Unnamed: 0,id,vas_id,buy_time,target
0,3130519,2.0,1548018000,0.0
1,2000860,4.0,1548018000,1.0
2,1099444,2.0,1546808400,0.0
3,1343255,5.0,1547413200,0.0
4,1277040,2.0,1546808400,0.0


## Сохраняем модель.

In [174]:
import pickle

# save
with open('model.pkl','wb') as f:
    pickle.dump(model,f)


In [175]:
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [176]:
loaded_model.predict(X_valid)

array([0., 1., 0., ..., 0., 0., 0.])