In [1]:
import pandas as pd
import numpy as np
import luigi
import warnings
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from datetime import datetime, date, time, timedelta
from sklearn import set_config

from functions import reduce_mem_usage

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix
from sklearn.metrics import precision_recall_curve, roc_curve, auc

from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn #, Pipeline
from collections import Counter


import time

In [2]:
%matplotlib inline

warnings.simplefilter("ignore")
pd.set_option("display.max_columns", 999)

In [3]:
RANDOM_STATE = 42

In [4]:
TRAIN_PATH = "data/data_train.csv"
TEST_PATH = "data/data_test.csv"
FEATURES_PATH = "data/features.csv"

data_train = pd.read_csv(TRAIN_PATH)
data_test = pd.read_csv(TEST_PATH)

In [5]:
data_train = reduce_mem_usage(data_train)
data_test = reduce_mem_usage(data_test)

Memory usage of the dataframe is 31.73 MB
Memory usage after optimization is: 15.86 MB
Decreased by 50.0%
Memory usage of the dataframe is 2.17 MB
Memory usage after optimization is: 1.09 MB
Decreased by 50.0%


In [47]:
answers_test = data_test

answers_test.shape

(71231, 3)

In [6]:
def process_featues(data_features, train, test):
    ids = np.unique(train['id'].append(test['id']))
    # Возьмем id только тех пользователей, что встречаются в трейн и тест выборках.
    data_features = data_features[data_features['id'].isin(ids)]
    # Дропнем Unnamed
    if 'Unnamed: 0' in data_features:
        data_features = data_features.drop(columns='Unnamed: 0')

    data_features = data_features.compute()
    # удалим признаки с единственным значением
    df_nunique = data_features.apply(lambda x: x.nunique(dropna=False))
    const = df_nunique[df_nunique ==1].index.tolist()
    data_features = data_features.drop(columns = const)
    # функция сжатия данных
    data_features = reduce_mem_usage(data_features)
    return data_features

In [7]:
import dask.dataframe as dd

data_features = dd.read_csv(FEATURES_PATH, sep ='\t')

In [8]:
data_features = process_featues(data_features, data_train, data_test)

Memory usage of the dataframe is 1728.06 MB
Memory usage after optimization is: 867.47 MB
Decreased by 49.8%


Мерджим данные по ближайшей дате. Возникает вопрос, как указать direction

Логично сделать direction='forward' чтобы время в data_features было не больше чем в выборках трейн и тест. Ведь мы не можем знать профиль пользователя "из будущего" при работе модели.

С другой стороны, direction='nearest' выбирает ближайшие даты. При обучении это будет более "свежий" профиль и даже если он "из будущего" - лучше характеризует пользователя в момент получения предложения подключить услугу. При реальном же использовании модели случаи получения фич "из будущего" будут исключены по-определению, ближайший доступный профиль априори будет иметь более раннюю дату, и подобные ситуации не будут возникать. Поэтому выбран данный способ объединения.

При этом я не знаю, какие именно признаки находятся в data_features, если точно известно что там отражено подключил ли юзер искомую услугу, то значимость данных признаков неоправданно вырастет, т.к. по сути они будут готовым ответом. Но реальной предсказательной силы они иметь не будут. В таком случае допустимо использовать только forward объединение. 

In [9]:
data_train = data_train.sort_values(by="buy_time")
data_test = data_test.sort_values(by="buy_time")
data_features = data_features.sort_values(by="buy_time")

In [36]:
train = pd.merge_asof(data_train, data_features, on='buy_time', by='id', direction='nearest')
valid = pd.merge_asof(data_test, data_features, on='buy_time', by='id', direction ='nearest')

In [37]:
del train['Unnamed: 0']
del valid['Unnamed: 0']

## data pipe

In [44]:
features = [f for f in valid.columns if f not in ['id']]

Разделим данные на X и y

In [45]:
X_train = train.drop('target', axis='columns')
y_train = train.target
X_valid = valid

Сохраним необходимые для ответа признаки в отдельную переменную, чтобы позже добавить target.

Разделим признаки на бинарные, категориальные и вещественные.

In [48]:
boolean_features = []
categorical_features = []
numeric_features = []
for col in X_train[features].fillna(0):
    val_count = len(X_train[col].unique())
    if val_count == 2:
        boolean_features.append(col)
    elif val_count <= 10:
        categorical_features.append(col)
    else:
        numeric_features.append(col)


Селектор колонок

In [49]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

In [50]:
# new_features_list = ['interval']
class FeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)

        try:
#             if 'interval' in self.features_list:
#                 X['interval'] = X['buy_time_y'] - X['buy_time_x']
            

            return X
        except KeyError:            
            raise KeyError("DataFrame не содердит колонки buy_time")

Обработчик вещественных признаков
Поскольку вещественных признаков много больше чем предполагается оставить в конце, и чем остальных признаков, откинем менее значимые из них до объединения с другими признаками.

In [51]:
num_pipe = Pipeline([
    ('ncs', ColumnSelector(columns=numeric_features)),
    ('nsi', SimpleImputer(strategy="mean")),
    ('nss', StandardScaler()),
    ('nskb', SelectKBest(k=128, score_func=f_classif)),
])

Обработчик категориальных признаков

In [52]:
cat_pipe = Pipeline([
    ('ccs', ColumnSelector(columns=categorical_features)),
    ('csi', SimpleImputer(strategy="most_frequent")),
    ('coe', OneHotEncoder(handle_unknown='ignore')),
])

Обработчик булевых признаков

In [53]:
bool_pipe = Pipeline([
    ('bcs', ColumnSelector(columns=boolean_features)),
    ('bsi', SimpleImputer(strategy="most_frequent")),
])

Собираем в общий пайплайн

In [54]:
transformer_list = [('num_pipe', num_pipe), ('cat_pipe', cat_pipe), ('bool_pipe', bool_pipe)]

In [55]:
transform_pipe = Pipeline([
    ('cs', ColumnSelector(columns=features)),
    ('fg', FeaturesGenerator(features_list=['interval'])),
    ('fu', FeatureUnion(transformer_list=transformer_list)),
])


Отберем признаки с помощью SelectKBest и логистической регрессии с регуляризацией L1 (было 298 признаков)


In [56]:
fs_pipe = make_pipeline(
    transform_pipe,
    SelectKBest(k=64, score_func=f_classif),
    SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
)

In [57]:
set_config(display='diagram')

fs_pipe

In [58]:
fs_pipe.fit(X_train, y_train)


In [64]:
X_train = fs_pipe.transform(X_train)

In [65]:
X_valid = fs_pipe.transform(X_valid)

## Model

Соберем финальный пайплайн.


In [75]:
ros = RandomOverSampler(random_state=42)

X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [76]:
model = CatBoostClassifier(random_state=RANDOM_STATE)


In [77]:
model.fit(X_ros, y_ros)


Learning rate set to 0.236787
0:	learn: 0.4857543	total: 522ms	remaining: 8m 41s
1:	learn: 0.4077319	total: 839ms	remaining: 6m 58s
2:	learn: 0.3685533	total: 1.14s	remaining: 6m 19s
3:	learn: 0.3471900	total: 1.45s	remaining: 6m 2s
4:	learn: 0.3361187	total: 1.77s	remaining: 5m 51s
5:	learn: 0.3281733	total: 2.08s	remaining: 5m 44s
6:	learn: 0.3246187	total: 2.39s	remaining: 5m 39s
7:	learn: 0.3225191	total: 2.66s	remaining: 5m 29s
8:	learn: 0.3211967	total: 2.93s	remaining: 5m 22s
9:	learn: 0.3204051	total: 3.19s	remaining: 5m 15s
10:	learn: 0.3198662	total: 3.45s	remaining: 5m 10s
11:	learn: 0.3193046	total: 3.76s	remaining: 5m 9s
12:	learn: 0.3183402	total: 4.04s	remaining: 5m 6s
13:	learn: 0.3180816	total: 4.32s	remaining: 5m 4s
14:	learn: 0.3177915	total: 4.61s	remaining: 5m 2s
15:	learn: 0.3174058	total: 4.92s	remaining: 5m 2s
16:	learn: 0.3172682	total: 5.21s	remaining: 5m 1s
17:	learn: 0.3171261	total: 5.52s	remaining: 5m 1s
18:	learn: 0.3169894	total: 5.77s	remaining: 4m 58s


158:	learn: 0.3035446	total: 44.2s	remaining: 3m 53s
159:	learn: 0.3034608	total: 44.5s	remaining: 3m 53s
160:	learn: 0.3033690	total: 44.7s	remaining: 3m 53s
161:	learn: 0.3032312	total: 45.1s	remaining: 3m 53s
162:	learn: 0.3031582	total: 45.3s	remaining: 3m 52s
163:	learn: 0.3030821	total: 45.6s	remaining: 3m 52s
164:	learn: 0.3030262	total: 45.9s	remaining: 3m 52s
165:	learn: 0.3029626	total: 46.1s	remaining: 3m 51s
166:	learn: 0.3028947	total: 46.4s	remaining: 3m 51s
167:	learn: 0.3028172	total: 46.7s	remaining: 3m 51s
168:	learn: 0.3027148	total: 47.1s	remaining: 3m 51s
169:	learn: 0.3026282	total: 47.4s	remaining: 3m 51s
170:	learn: 0.3025607	total: 47.6s	remaining: 3m 50s
171:	learn: 0.3024672	total: 47.9s	remaining: 3m 50s
172:	learn: 0.3023676	total: 48.2s	remaining: 3m 50s
173:	learn: 0.3022995	total: 48.4s	remaining: 3m 49s
174:	learn: 0.3022193	total: 48.7s	remaining: 3m 49s
175:	learn: 0.3021549	total: 49s	remaining: 3m 49s
176:	learn: 0.3020456	total: 49.2s	remaining: 3m

312:	learn: 0.2919976	total: 1m 27s	remaining: 3m 12s
313:	learn: 0.2919242	total: 1m 27s	remaining: 3m 12s
314:	learn: 0.2918458	total: 1m 28s	remaining: 3m 11s
315:	learn: 0.2917627	total: 1m 28s	remaining: 3m 11s
316:	learn: 0.2916935	total: 1m 28s	remaining: 3m 11s
317:	learn: 0.2916425	total: 1m 29s	remaining: 3m 11s
318:	learn: 0.2915902	total: 1m 29s	remaining: 3m 10s
319:	learn: 0.2915317	total: 1m 29s	remaining: 3m 10s
320:	learn: 0.2914401	total: 1m 29s	remaining: 3m 10s
321:	learn: 0.2913559	total: 1m 30s	remaining: 3m 10s
322:	learn: 0.2912802	total: 1m 30s	remaining: 3m 9s
323:	learn: 0.2912009	total: 1m 30s	remaining: 3m 9s
324:	learn: 0.2911219	total: 1m 31s	remaining: 3m 9s
325:	learn: 0.2910403	total: 1m 31s	remaining: 3m 8s
326:	learn: 0.2909727	total: 1m 31s	remaining: 3m 8s
327:	learn: 0.2909280	total: 1m 31s	remaining: 3m 8s
328:	learn: 0.2908723	total: 1m 32s	remaining: 3m 8s
329:	learn: 0.2908205	total: 1m 32s	remaining: 3m 7s
330:	learn: 0.2907365	total: 1m 32s	

466:	learn: 0.2820859	total: 2m 12s	remaining: 2m 30s
467:	learn: 0.2820377	total: 2m 12s	remaining: 2m 30s
468:	learn: 0.2819963	total: 2m 12s	remaining: 2m 30s
469:	learn: 0.2819510	total: 2m 12s	remaining: 2m 29s
470:	learn: 0.2818586	total: 2m 13s	remaining: 2m 29s
471:	learn: 0.2817765	total: 2m 13s	remaining: 2m 29s
472:	learn: 0.2817087	total: 2m 13s	remaining: 2m 28s
473:	learn: 0.2816359	total: 2m 13s	remaining: 2m 28s
474:	learn: 0.2815860	total: 2m 14s	remaining: 2m 28s
475:	learn: 0.2815155	total: 2m 14s	remaining: 2m 28s
476:	learn: 0.2814643	total: 2m 14s	remaining: 2m 27s
477:	learn: 0.2814223	total: 2m 15s	remaining: 2m 27s
478:	learn: 0.2813491	total: 2m 15s	remaining: 2m 27s
479:	learn: 0.2812899	total: 2m 15s	remaining: 2m 27s
480:	learn: 0.2812628	total: 2m 15s	remaining: 2m 26s
481:	learn: 0.2812176	total: 2m 16s	remaining: 2m 26s
482:	learn: 0.2811833	total: 2m 16s	remaining: 2m 26s
483:	learn: 0.2811109	total: 2m 16s	remaining: 2m 25s
484:	learn: 0.2810347	total:

619:	learn: 0.2730414	total: 3m	remaining: 1m 50s
620:	learn: 0.2729780	total: 3m	remaining: 1m 50s
621:	learn: 0.2729331	total: 3m	remaining: 1m 49s
622:	learn: 0.2728706	total: 3m 1s	remaining: 1m 49s
623:	learn: 0.2728257	total: 3m 1s	remaining: 1m 49s
624:	learn: 0.2727623	total: 3m 1s	remaining: 1m 49s
625:	learn: 0.2727086	total: 3m 2s	remaining: 1m 48s
626:	learn: 0.2726511	total: 3m 2s	remaining: 1m 48s
627:	learn: 0.2725690	total: 3m 2s	remaining: 1m 48s
628:	learn: 0.2725302	total: 3m 3s	remaining: 1m 48s
629:	learn: 0.2724844	total: 3m 3s	remaining: 1m 47s
630:	learn: 0.2724417	total: 3m 3s	remaining: 1m 47s
631:	learn: 0.2723808	total: 3m 3s	remaining: 1m 47s
632:	learn: 0.2723364	total: 3m 4s	remaining: 1m 46s
633:	learn: 0.2722949	total: 3m 4s	remaining: 1m 46s
634:	learn: 0.2722314	total: 3m 4s	remaining: 1m 46s
635:	learn: 0.2721695	total: 3m 5s	remaining: 1m 45s
636:	learn: 0.2721409	total: 3m 5s	remaining: 1m 45s
637:	learn: 0.2720887	total: 3m 5s	remaining: 1m 45s
63

772:	learn: 0.2650717	total: 3m 46s	remaining: 1m 6s
773:	learn: 0.2650057	total: 3m 47s	remaining: 1m 6s
774:	learn: 0.2649627	total: 3m 47s	remaining: 1m 6s
775:	learn: 0.2649071	total: 3m 47s	remaining: 1m 5s
776:	learn: 0.2648696	total: 3m 47s	remaining: 1m 5s
777:	learn: 0.2647944	total: 3m 48s	remaining: 1m 5s
778:	learn: 0.2647631	total: 3m 48s	remaining: 1m 4s
779:	learn: 0.2647012	total: 3m 48s	remaining: 1m 4s
780:	learn: 0.2646470	total: 3m 49s	remaining: 1m 4s
781:	learn: 0.2646049	total: 3m 49s	remaining: 1m 3s
782:	learn: 0.2645466	total: 3m 49s	remaining: 1m 3s
783:	learn: 0.2644845	total: 3m 49s	remaining: 1m 3s
784:	learn: 0.2644362	total: 3m 50s	remaining: 1m 3s
785:	learn: 0.2643614	total: 3m 50s	remaining: 1m 2s
786:	learn: 0.2643281	total: 3m 50s	remaining: 1m 2s
787:	learn: 0.2642864	total: 3m 50s	remaining: 1m 2s
788:	learn: 0.2642288	total: 3m 51s	remaining: 1m 1s
789:	learn: 0.2641778	total: 3m 51s	remaining: 1m 1s
790:	learn: 0.2641259	total: 3m 51s	remaining:

929:	learn: 0.2572231	total: 4m 32s	remaining: 20.5s
930:	learn: 0.2571616	total: 4m 33s	remaining: 20.2s
931:	learn: 0.2570899	total: 4m 33s	remaining: 20s
932:	learn: 0.2570505	total: 4m 33s	remaining: 19.7s
933:	learn: 0.2570006	total: 4m 34s	remaining: 19.4s
934:	learn: 0.2569585	total: 4m 34s	remaining: 19.1s
935:	learn: 0.2569109	total: 4m 34s	remaining: 18.8s
936:	learn: 0.2568927	total: 4m 34s	remaining: 18.5s
937:	learn: 0.2568590	total: 4m 35s	remaining: 18.2s
938:	learn: 0.2568234	total: 4m 35s	remaining: 17.9s
939:	learn: 0.2567599	total: 4m 35s	remaining: 17.6s
940:	learn: 0.2567164	total: 4m 36s	remaining: 17.3s
941:	learn: 0.2566831	total: 4m 36s	remaining: 17s
942:	learn: 0.2566229	total: 4m 36s	remaining: 16.7s
943:	learn: 0.2565644	total: 4m 36s	remaining: 16.4s
944:	learn: 0.2565253	total: 4m 37s	remaining: 16.1s
945:	learn: 0.2564858	total: 4m 37s	remaining: 15.8s
946:	learn: 0.2564407	total: 4m 37s	remaining: 15.5s
947:	learn: 0.2563823	total: 4m 38s	remaining: 15.

<catboost.core.CatBoostClassifier at 0x7fdc81f10c90>

In [79]:
y_valid = model.predict(X_valid)

## Сохраняем answers_test.csv

In [80]:
answers_test['target'] = y_valid

In [81]:
answers_test.to_csv('answers_test_1.csv', index=False,)

In [82]:
answers_test_loaded = pd.read_csv('answers_test_1.csv')

In [83]:
answers_test_loaded.head()

Unnamed: 0,id,vas_id,buy_time,target
0,2905850,5.0,1546808400,0.0
1,31619,2.0,1546808400,0.0
2,1427271,6.0,1546808400,1.0
3,2162521,6.0,1546808400,1.0
4,1529304,6.0,1546808400,1.0


## Сохраняем модель.

In [84]:
import pickle


# save
with open('fs_pipe.pkl','wb') as f:
    pickle.dump(fs_pipe,f)

with open('model.pkl','wb') as f:
    pickle.dump(model,f)


In [85]:
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [86]:
loaded_model.predict(X_valid)

array([0., 0., 1., ..., 0., 1., 0.])

## Сравниваем результат отработки модели из ноутбука с luigi пайплайном

In [87]:
answers_test_luigi = pd.read_csv('answers_test.csv')

In [95]:
answers_test.where(answers_test.values==answers_test_luigi.values).notna()['target'].unique()


array([ True])