In [1]:

import pandas as pd
import numpy as np
import luigi
import warnings
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from datetime import datetime, date, time, timedelta
from functions import reduce_mem_usage

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix
from sklearn.metrics import precision_recall_curve, roc_curve, auc

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# from lightgbm import LGBMClassifier
#
# from imblearn.over_sampling import RandomOverSampler
# from collections import Counter
#
# from boruta import BorutaPy

import time

In [2]:
%matplotlib inline

warnings.simplefilter("ignore")
pd.set_option("display.max_columns", 999)

In [3]:
# Fixed random seed, kept in one place so that runs are reproducible.
RANDOM_STATE = 42

In [4]:
# Input CSV files; the paths are relative, so they resolve against the
# directory the notebook kernel is started from.
TRAIN_PATH = "data/train_merge.csv"
TEST_PATH = "data/test_merge.csv"

In [None]:
# Read the train CSV, then the test CSV, passing each frame through
# reduce_mem_usage (helper imported from the local `functions` module).
train, test = (
    reduce_mem_usage(pd.read_csv(csv_path))
    for csv_path in (TRAIN_PATH, TEST_PATH)
)

In [None]:
# Preview the first 20 rows of the training frame.
train.head(20)

In [None]:
# train['interval'] = train['buy_time_y'] - train['buy_time_x']
# test['interval'] = test['buy_time_y'] - test['buy_time_x']

In [None]:
# Candidate features: every training column except 'target' and 'id'.
features = [name for name in train.columns if name not in ('target', 'id')]
len(features)

Разделим данные на X и y

In [None]:
# Predictors are all training columns but 'target'; 'target' itself is the label.
X, y = train.drop(columns='target'), train['target']
# The test frame is kept under the name X_valid.
X_valid = test

In [None]:
# Hold out 20% of the rows for evaluation. The seed comes from RANDOM_STATE
# (same value as the previous literal) so the whole notebook shares one seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

Разделим признаки на бинарные, категориальные и вещественные.

In [None]:
# Bucket every feature by its number of distinct values:
#   exactly 2 -> boolean, at most 10 -> categorical, anything else -> numeric.
boolean_features = []
categorical_features = []
numeric_features = []

# Count distinct values on the NaN-filled frame. Previously the filled copy was
# only used to iterate column names while the count ran on the unfilled column,
# so NaN was counted as an extra value (e.g. a 0/1 column with gaps became
# "categorical") and the fillna work was thrown away.
unique_counts = X[features].fillna(0).nunique()

for col, val_count in unique_counts.items():
    if val_count == 2:
        boolean_features.append(col)
    elif val_count <= 10:
        categorical_features.append(col)
    else:
        numeric_features.append(col)


Селектор колонок

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the requested DataFrame columns.

    Parameters
    ----------
    columns : list
        Column labels to select from the incoming DataFrame.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step fits into a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        AssertionError
            If ``X`` is not a ``pandas.DataFrame``.
        KeyError
            If a requested column is absent; the message lists the missing
            columns in the order they were requested.
        """
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError as err:
            # Build the list in request order (and without duplicates) so the
            # message is deterministic; a plain set difference is not.
            present = set(X.columns)
            cols_error = list(
                dict.fromkeys(col for col in self.columns if col not in present)
            )
            raise KeyError(
                "DataFrame не содержит следующие колонки: %s" % cols_error
            ) from err

Генератор новых фич
Поскольку данные обезличены и их много, генератор приведён скорее для примера.
Датасеты содержат две колонки с датами: дату подключения услуги и дату некоего другого действия (покупки сим-карты?). Вероятно, имеет смысл сгенерировать новый признак как разницу между этими значениями — условно, время от подключения сим-карты до подключения услуги.

In [None]:
# Names of the derived features to generate.
new_features_list = ['interval']

In [None]:
class FeaturesGenerator(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived columns to a DataFrame.

    Parameters
    ----------
    features_list : list
        Names of the derived features to build. Only ``'interval'``
        (``buy_time_y - buy_time_x``) is implemented; other names are ignored.
    """

    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step fits into a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the requested derived columns added.

        When ``'interval'`` is requested, a new frame is returned and the input
        is left unmodified; otherwise ``X`` is returned as is.

        Raises
        ------
        KeyError
            If ``buy_time_y`` or ``buy_time_x`` is missing from ``X``.
        """
        if 'interval' not in self.features_list:
            return X

        try:
            interval = X['buy_time_y'] - X['buy_time_x']
        except KeyError as err:
            # The old handler read the non-existent ``self.columns`` and so
            # raised AttributeError instead of this KeyError.
            required = ('buy_time_y', 'buy_time_x')
            cols_error = [col for col in required if col not in X.columns]
            raise KeyError(
                "DataFrame не содержит следующие колонки: %s" % cols_error
            ) from err

        # assign() builds a new frame, so the caller's DataFrame is not mutated.
        return X.assign(interval=interval)

Обработчик вещественных признаков

In [None]:
# Numeric branch: select columns -> mean-impute -> standardize.
# The features built by FeaturesGenerator (new_features_list) are added to the
# selection here: they are not part of `features`, so none of the three branch
# selectors picked them up and FeatureUnion silently dropped them.
# NOTE(review): this treats every generated feature as numeric, which holds for
# 'interval' (a difference of two columns) - revisit if other kinds are added.
num_pipe = Pipeline([
    ('ncs', ColumnSelector(columns=numeric_features + [
        name for name in new_features_list if name not in numeric_features
    ])),
    ('nsi', SimpleImputer(strategy="mean")),
    ('nss', StandardScaler()),
])

Обработчик категориальных признаков

In [None]:
# Categorical branch of the preprocessing.
cat_pipe = Pipeline(steps=[
    # keep only the columns bucketed as categorical
    ('ccs', ColumnSelector(columns=categorical_features)),
    # fill gaps with the most frequent value of each column
    ('csi', SimpleImputer(strategy="most_frequent")),
    # one-hot encode; categories unseen during fit are ignored at transform time
    ('coe', OneHotEncoder(handle_unknown='ignore')),
])

Обработчик булевых признаков

In [None]:
# Boolean branch of the preprocessing.
bool_pipe = Pipeline(steps=[
    # keep only the columns bucketed as boolean
    ('bcs', ColumnSelector(columns=boolean_features)),
    # fill gaps with the most frequent value of each column
    ('bsi', SimpleImputer(strategy="most_frequent")),
])

Собираем в общий пайплайн

In [None]:
# (name, pipeline) pairs for the three per-type branches; passed to FeatureUnion below.
transformer_list = [('num_pipe', num_pipe), ('cat_pipe', cat_pipe), ('bool_pipe', bool_pipe)]

In [None]:
# Full preprocessing: select the raw features -> add derived features ->
# run the per-type branches and join their outputs.
transform_pipe = Pipeline([
    ('cs', ColumnSelector(columns=features)),
    # Use the shared new_features_list instead of repeating the literal, so the
    # list of derived features is defined in exactly one place.
    ('fg', FeaturesGenerator(features_list=new_features_list)),
    ('fu', FeatureUnion(transformer_list=transformer_list)),
])


Отберем признаки с помощью SelectKBest и логистической регрессии с регуляризацией L1 (было 298 признаков)

upd: логистическая регрессия даже с большим порогом (threshold) занулила всего 4 признака, однако сильно увеличила время работы пайплайна, поэтому было принято решение её исключить.

In [None]:
# Preprocessing followed by univariate feature selection:
# keep the 50 columns with the best ANOVA F-score (f_classif).
fs_pipe = make_pipeline(
    transform_pipe,
    SelectKBest(score_func=f_classif, k=50),
    # Disabled L1-based selector, kept for reference:
    # SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', random_state=RANDOM_STATE), threshold=1e-3),
)

In [None]:
from sklearn import set_config

# Switch scikit-learn's estimator display to the 'diagram' mode.
set_config(display='diagram')

# Bare expression: the notebook displays the pipeline as the cell output.
fs_pipe

In [None]:
# fs_pipe.fit(X_train, y_train)


In [None]:
# fs_pipe.transform(X_test).shape[1]

In [None]:
from pathlib import Path

# Make sure the output directory exists; on a fresh checkout without `models/`
# the dump would otherwise fail with FileNotFoundError.
Path('models').mkdir(parents=True, exist_ok=True)

# NOTE(review): pickle stores classes by reference, so ColumnSelector and
# FeaturesGenerator (defined in this notebook) must be defined/importable under
# the same module name wherever this file is loaded - confirm for the consumer.
joblib.dump(fs_pipe, 'models/data_pipeline.pkl', compress=9)

In [None]:
# pipe_clone = joblib.load('models/data_pipeline.pkl')

In [None]:
# type(pipe_clone)