In [58]:
import pandas as pd
import numpy as np
import luigi
import warnings
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from datetime import datetime, date, time, timedelta
from sklearn import set_config

from functions import reduce_mem_usage

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix
from sklearn.metrics import precision_recall_curve, roc_curve, auc

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler
from collections import Counter


import time

In [59]:
%matplotlib inline

warnings.simplefilter("ignore")
pd.set_option("display.max_columns", 999)

In [60]:
RANDOM_STATE = 42

In [61]:
TRAIN_PATH = "data/data_train.csv"
TEST_PATH = "data/data_test.csv"
FEATURES_PATH = "data/features.csv"

data_train = pd.read_csv(TRAIN_PATH)
data_test = pd.read_csv(TEST_PATH)

In [62]:
data_train = reduce_mem_usage(data_train)
data_test = reduce_mem_usage(data_test)

Memory usage of the dataframe is 31.73 MB
Memory usage after optimization is: 15.86 MB
Decreased by 50.0%
Memory usage of the dataframe is 2.17 MB
Memory usage after optimization is: 1.09 MB
Decreased by 50.0%


In [63]:
def process_featues(data_features, train, test):
    ids = np.unique(train['id'].append(test['id']))
    # Возьмем id только тех пользователей, что встречаются в трейн и тест выборках.
    data_features = data_features[data_features['id'].isin(ids)]
    # Дропнем Unnamed
    if 'Unnamed: 0' in data_features:
        data_features = data_features.drop(columns='Unnamed: 0')

    data_features = data_features.compute()
    # удалим признаки с единственным значением
    df_nunique = data_features.apply(lambda x: x.nunique(dropna=False))
    const = df_nunique[df_nunique ==1].index.tolist()
    data_features = data_features.drop(columns = const)
    # функция сжатия данных
    data_features = reduce_mem_usage(data_features)
    return data_features

In [None]:
import dask.dataframe as dd

data_features = dd.read_csv(FEATURES_PATH, sep ='\t')

data_features =process_featues(data_features, data_train, data_test)

In [None]:
train = data_train.merge(data_features, on='id', how = 'left')
valid = data_test.merge(data_features, on='id', how = 'left')
del train['Unnamed: 0']
del valid['Unnamed: 0']

## data pipe

In [None]:
# TRAIN_PATH = "data/train_merge.csv"
# TEST_PATH = "data/test_merge.csv"

In [None]:
# train = reduce_mem_usage(pd.read_csv(TRAIN_PATH))
# test = reduce_mem_usage(pd.read_csv(TEST_PATH))

In [None]:
features = [f for f in data_features.columns if f not in ['buy_time','id']]

Разделим данные на X и y

In [None]:
X_train = train.drop('target', axis='columns')
y_train = train.target
X_valid = valid

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Разделим признаки на бинарные, категориальные и вещественные.

In [None]:
boolean_features = []
categorical_features = []
numeric_features = []
for col in X_train[features].fillna(0):
    val_count = len(X_train[col].unique())
    if val_count == 2:
        boolean_features.append(col)
    elif val_count <= 10:
        categorical_features.append(col)
    else:
        numeric_features.append(col)


Селектор колонок

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

In [None]:
# new_features_list = ['interval']
class FeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # assert isinstance(X, pd.DataFrame)

        try:
#             if 'interval' in self.features_list:
#                 X['interval'] = X['buy_time_y'] - X['buy_time_x']

            return X
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error)

Обработчик вещественных признаков
Поскольку вещественных признаков много больше чем предполагается оставить в конце, и чем остальных признаков, откинем менее значимые из них до объединения с другими признаками.

In [None]:
num_pipe = Pipeline([
    ('ncs', ColumnSelector(columns=numeric_features)),
    ('nsi', SimpleImputer(strategy="mean")),
    ('nss', StandardScaler()),
    ('nskb', SelectKBest(k=128, score_func=f_classif)),
])

Обработчик категориальных признаков

In [None]:
cat_pipe = Pipeline([
    ('ccs', ColumnSelector(columns=categorical_features)),
    ('csi', SimpleImputer(strategy="most_frequent")),
    ('coe', OneHotEncoder(handle_unknown='ignore')),
])

Обработчик булевых признаков

In [None]:
bool_pipe = Pipeline([
    ('bcs', ColumnSelector(columns=boolean_features)),
    ('bsi', SimpleImputer(strategy="most_frequent")),
])

Собираем в общий пайплайн

In [None]:
transformer_list = [('num_pipe', num_pipe), ('cat_pipe', cat_pipe), ('bool_pipe', bool_pipe)]

In [None]:
transform_pipe = Pipeline([
    ('cs', ColumnSelector(columns=features)),
    ('fg', FeaturesGenerator(features_list=['interval'])),
    ('fu', FeatureUnion(transformer_list=transformer_list)),
])


Отберем признаки с помощью SelectKBest и логистической регрессии с регуляризацией L1 (было 298 признаков)


In [None]:
fs_pipe = make_pipeline(
    transform_pipe,
    SelectKBest(k=64, score_func=f_classif),
    SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
)

In [None]:
set_config(display='diagram')

fs_pipe

In [None]:
fs_pipe.fit(X_train, y_train)


In [None]:
# fs_pipe.transform(X_test).shape[1]

In [None]:
# joblib.dump(fs_pipe, 'models/data_pipeline.pkl', compress=9)

In [None]:
# pipe_clone = joblib.load('models/data_pipeline.pkl')

In [None]:
# type(pipe_clone)

In [None]:
# if X_test:
#     X_test = fs_pipe.transform(X_test)

In [None]:
X_train = fs_pipe.transform(X_train)

In [None]:
answers_test = X_valid['id', 'vas_id']
answers_test['buy_time'] = X_valid['buy_time_x']

In [None]:
answers_test

In [46]:
X_valid = fs_pipe.transform(X_valid)

In [None]:
# type(X_train_transform)

In [None]:
# X_train_transform.shape

In [None]:
# joblib.dump(X_train_transform, 'data/changes/X_train_transform.pkl', compress=9)
# joblib.dump(X_test_transform, 'data/changes/X_test_transform.pkl', compress=9)
# joblib.dump(X_valid_transform, 'data/changes/X_valid_transform.pkl', compress=9)

# joblib.dump(y_train, 'data/changes/y_train.pkl', compress=9)
# joblib.dump(y_test, 'data/changes/y_test.pkl', compress=9)

## Model

In [47]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred, model, X_test, digits=3):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred, digits=digits))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred, digits=digits))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))
    plot_confusion_matrix(model, X_test, y_test_true, cmap=plt.cm.Blues, normalize='all')

In [48]:
def test_model(X_train, y_train, model):
    start = time.time()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)
    get_classification_report(y_train, y_train_pred, y_test, y_test_pred, model, X_test)
    train_time = time.time() - start
    print(f"train time = {train_time}")
    return model, y_test, y_test_pred_proba, f1_score(y_test, y_test_pred, average='macro')

In [49]:
ros = RandomOverSampler(random_state=42)

X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [50]:
model = CatBoostClassifier(random_state=RANDOM_STATE)


In [51]:
test_model(X_ros, y_ros, model)
y_test_pred = model.predict(X_test)
f1_score(y_test, y_test_pred, average='macro')

Learning rate set to 0.240211
0:	learn: 0.6925347	total: 596ms	remaining: 9m 55s
1:	learn: 0.6923098	total: 884ms	remaining: 7m 21s
2:	learn: 0.6921480	total: 1.19s	remaining: 6m 36s
3:	learn: 0.6919847	total: 1.47s	remaining: 6m 6s
4:	learn: 0.6918258	total: 1.74s	remaining: 5m 47s
5:	learn: 0.6916496	total: 2.06s	remaining: 5m 41s
6:	learn: 0.6915250	total: 2.33s	remaining: 5m 31s
7:	learn: 0.6913974	total: 2.87s	remaining: 5m 55s
8:	learn: 0.6912731	total: 3.3s	remaining: 6m 3s
9:	learn: 0.6911514	total: 3.75s	remaining: 6m 11s
10:	learn: 0.6910342	total: 4.15s	remaining: 6m 13s
11:	learn: 0.6909135	total: 4.54s	remaining: 6m 14s
12:	learn: 0.6907964	total: 5s	remaining: 6m 19s
13:	learn: 0.6906616	total: 5.57s	remaining: 6m 31s
14:	learn: 0.6905594	total: 6.03s	remaining: 6m 36s
15:	learn: 0.6904617	total: 6.38s	remaining: 6m 32s
16:	learn: 0.6903337	total: 6.98s	remaining: 6m 43s
17:	learn: 0.6902408	total: 7.69s	remaining: 6m 59s
18:	learn: 0.6901680	total: 8.26s	remaining: 7m 6s

157:	learn: 0.6756252	total: 1m 24s	remaining: 7m 27s
158:	learn: 0.6755147	total: 1m 24s	remaining: 7m 26s
159:	learn: 0.6754064	total: 1m 24s	remaining: 7m 25s
160:	learn: 0.6753081	total: 1m 25s	remaining: 7m 23s
161:	learn: 0.6752195	total: 1m 25s	remaining: 7m 22s
162:	learn: 0.6751035	total: 1m 25s	remaining: 7m 20s
163:	learn: 0.6749868	total: 1m 26s	remaining: 7m 19s
164:	learn: 0.6748851	total: 1m 26s	remaining: 7m 18s
165:	learn: 0.6747958	total: 1m 27s	remaining: 7m 18s
166:	learn: 0.6746808	total: 1m 27s	remaining: 7m 18s
167:	learn: 0.6746070	total: 1m 28s	remaining: 7m 17s
168:	learn: 0.6745138	total: 1m 29s	remaining: 7m 17s
169:	learn: 0.6744075	total: 1m 29s	remaining: 7m 17s
170:	learn: 0.6743241	total: 1m 30s	remaining: 7m 17s
171:	learn: 0.6742313	total: 1m 30s	remaining: 7m 17s
172:	learn: 0.6741479	total: 1m 31s	remaining: 7m 15s
173:	learn: 0.6740465	total: 1m 31s	remaining: 7m 13s
174:	learn: 0.6739640	total: 1m 31s	remaining: 7m 12s
175:	learn: 0.6738659	total:

311:	learn: 0.6610150	total: 2m 14s	remaining: 4m 55s
312:	learn: 0.6609398	total: 2m 14s	remaining: 4m 54s
313:	learn: 0.6608353	total: 2m 14s	remaining: 4m 54s
314:	learn: 0.6607657	total: 2m 14s	remaining: 4m 53s
315:	learn: 0.6606736	total: 2m 15s	remaining: 4m 52s
316:	learn: 0.6605529	total: 2m 15s	remaining: 4m 52s
317:	learn: 0.6604898	total: 2m 15s	remaining: 4m 51s
318:	learn: 0.6603941	total: 2m 16s	remaining: 4m 50s
319:	learn: 0.6603222	total: 2m 16s	remaining: 4m 50s
320:	learn: 0.6602110	total: 2m 16s	remaining: 4m 49s
321:	learn: 0.6601214	total: 2m 17s	remaining: 4m 48s
322:	learn: 0.6600454	total: 2m 17s	remaining: 4m 48s
323:	learn: 0.6599520	total: 2m 17s	remaining: 4m 47s
324:	learn: 0.6598624	total: 2m 18s	remaining: 4m 47s
325:	learn: 0.6597745	total: 2m 18s	remaining: 4m 46s
326:	learn: 0.6596686	total: 2m 18s	remaining: 4m 46s
327:	learn: 0.6596217	total: 2m 19s	remaining: 4m 45s
328:	learn: 0.6595160	total: 2m 19s	remaining: 4m 44s
329:	learn: 0.6594600	total:

464:	learn: 0.6481247	total: 2m 59s	remaining: 3m 26s
465:	learn: 0.6480415	total: 2m 59s	remaining: 3m 26s
466:	learn: 0.6479060	total: 3m	remaining: 3m 25s
467:	learn: 0.6478224	total: 3m	remaining: 3m 25s
468:	learn: 0.6477243	total: 3m	remaining: 3m 24s
469:	learn: 0.6476559	total: 3m 1s	remaining: 3m 24s
470:	learn: 0.6475738	total: 3m 1s	remaining: 3m 23s
471:	learn: 0.6475348	total: 3m 1s	remaining: 3m 23s
472:	learn: 0.6474534	total: 3m 1s	remaining: 3m 22s
473:	learn: 0.6473694	total: 3m 2s	remaining: 3m 22s
474:	learn: 0.6473043	total: 3m 2s	remaining: 3m 21s
475:	learn: 0.6471968	total: 3m 2s	remaining: 3m 21s
476:	learn: 0.6471238	total: 3m 3s	remaining: 3m 20s
477:	learn: 0.6470469	total: 3m 3s	remaining: 3m 20s
478:	learn: 0.6469721	total: 3m 3s	remaining: 3m 19s
479:	learn: 0.6469040	total: 3m 3s	remaining: 3m 19s
480:	learn: 0.6468135	total: 3m 4s	remaining: 3m 18s
481:	learn: 0.6467274	total: 3m 4s	remaining: 3m 18s
482:	learn: 0.6466171	total: 3m 4s	remaining: 3m 17s


618:	learn: 0.6357900	total: 3m 45s	remaining: 2m 18s
619:	learn: 0.6357332	total: 3m 45s	remaining: 2m 18s
620:	learn: 0.6356628	total: 3m 45s	remaining: 2m 17s
621:	learn: 0.6355834	total: 3m 46s	remaining: 2m 17s
622:	learn: 0.6355139	total: 3m 46s	remaining: 2m 17s
623:	learn: 0.6354567	total: 3m 46s	remaining: 2m 16s
624:	learn: 0.6353778	total: 3m 47s	remaining: 2m 16s
625:	learn: 0.6353034	total: 3m 47s	remaining: 2m 15s
626:	learn: 0.6352225	total: 3m 47s	remaining: 2m 15s
627:	learn: 0.6351491	total: 3m 47s	remaining: 2m 15s
628:	learn: 0.6350684	total: 3m 48s	remaining: 2m 14s
629:	learn: 0.6349872	total: 3m 48s	remaining: 2m 14s
630:	learn: 0.6349220	total: 3m 48s	remaining: 2m 13s
631:	learn: 0.6348367	total: 3m 49s	remaining: 2m 13s
632:	learn: 0.6347554	total: 3m 49s	remaining: 2m 12s
633:	learn: 0.6346559	total: 3m 49s	remaining: 2m 12s
634:	learn: 0.6345834	total: 3m 50s	remaining: 2m 12s
635:	learn: 0.6345076	total: 3m 50s	remaining: 2m 11s
636:	learn: 0.6344330	total:

772:	learn: 0.6246622	total: 4m 31s	remaining: 1m 19s
773:	learn: 0.6245868	total: 4m 32s	remaining: 1m 19s
774:	learn: 0.6245044	total: 4m 32s	remaining: 1m 19s
775:	learn: 0.6244181	total: 4m 32s	remaining: 1m 18s
776:	learn: 0.6243333	total: 4m 33s	remaining: 1m 18s
777:	learn: 0.6242612	total: 4m 33s	remaining: 1m 18s
778:	learn: 0.6241854	total: 4m 33s	remaining: 1m 17s
779:	learn: 0.6241167	total: 4m 34s	remaining: 1m 17s
780:	learn: 0.6240757	total: 4m 34s	remaining: 1m 16s
781:	learn: 0.6239977	total: 4m 34s	remaining: 1m 16s
782:	learn: 0.6239221	total: 4m 35s	remaining: 1m 16s
783:	learn: 0.6238433	total: 4m 35s	remaining: 1m 15s
784:	learn: 0.6237750	total: 4m 35s	remaining: 1m 15s
785:	learn: 0.6237274	total: 4m 35s	remaining: 1m 15s
786:	learn: 0.6236566	total: 4m 36s	remaining: 1m 14s
787:	learn: 0.6235920	total: 4m 36s	remaining: 1m 14s
788:	learn: 0.6235221	total: 4m 36s	remaining: 1m 14s
789:	learn: 0.6234533	total: 4m 37s	remaining: 1m 13s
790:	learn: 0.6233624	total:

928:	learn: 0.6138142	total: 5m 20s	remaining: 24.5s
929:	learn: 0.6137407	total: 5m 20s	remaining: 24.1s
930:	learn: 0.6136605	total: 5m 20s	remaining: 23.8s
931:	learn: 0.6135777	total: 5m 21s	remaining: 23.4s
932:	learn: 0.6135170	total: 5m 21s	remaining: 23.1s
933:	learn: 0.6134641	total: 5m 21s	remaining: 22.7s
934:	learn: 0.6133808	total: 5m 22s	remaining: 22.4s
935:	learn: 0.6133121	total: 5m 22s	remaining: 22s
936:	learn: 0.6132374	total: 5m 22s	remaining: 21.7s
937:	learn: 0.6131681	total: 5m 23s	remaining: 21.4s
938:	learn: 0.6130924	total: 5m 23s	remaining: 21s
939:	learn: 0.6130486	total: 5m 23s	remaining: 20.7s
940:	learn: 0.6129878	total: 5m 23s	remaining: 20.3s
941:	learn: 0.6129374	total: 5m 24s	remaining: 20s
942:	learn: 0.6128863	total: 5m 24s	remaining: 19.6s
943:	learn: 0.6128419	total: 5m 24s	remaining: 19.3s
944:	learn: 0.6127739	total: 5m 25s	remaining: 18.9s
945:	learn: 0.6126827	total: 5m 25s	remaining: 18.6s
946:	learn: 0.6126170	total: 5m 25s	remaining: 18.2s

NameError: name 'X_test' is not defined

In [None]:
y_valid = model.predict(X_valid)

## answers_test.csv

In [56]:
X_valid['buy_time', 'id', 'vas_id']

<73666x59 sparse matrix of type '<class 'numpy.float64'>'
	with 4272629 stored elements in Compressed Sparse Row format>

In [None]:
answers_test['target'] = y_valid

In [52]:
answers_test.head()

NameError: name 'answers_test' is not defined