In [1]:
!pip install category_encoders



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_validate
from sklearn import metrics

In [3]:
data = pd.read_csv('train-data.csv')

# Удалим пустые колонки и индексы.  
data = data.dropna(axis=1, how='all').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
0,,,,,,714.0,7.0,,,,...,ROeipLp,zCkv,K2SqEo9,jySVZNlOJy,,,WqMG,6fzt,am14IcfM7tWLrUmRT52KtA,
1,,,,,,3059.0,7.0,,,,...,ZV0mFX7,oslk,1E9D3Yd,jySVZNlOJy,,ELof,5Acm,RAYp,F2FyR07IdsN7I,am7c
2,,,,,,4956.0,7.0,,,,...,CBA87dl,oslk,TX2AGfT,jySVZNlOJy,,,kwS7,RAYp,F2FyR07IdsN7I,
3,,,,,,7630.0,7.0,,,,...,4UxGlow,oslk,catzS2D,LM8l689qOp,,ELof,WqMG,ZI9m,ib5G6X1eUxUn6,am7c
4,,,,,,1022.0,7.0,,,,...,cXsjB1v,oslk,qWjjxQb,M_8D,,ELof,szEZ,RAYp,F2FyR07IdsN7I,am7c


In [4]:
num_features = data.iloc[: , :data.columns.get_loc("Var191")].columns
categorical_features = data.iloc[: , data.columns.get_loc("Var191"):].columns

In [5]:
labels = pd.read_csv('train-labels.csv')
labels = labels.drop(columns=['Unnamed: 0'])
labels.head()

Unnamed: 0,Churn
0,-1
1,-1
2,-1
3,-1
4,-1


# Base pipeline
Создадим pipline для подготовки данных. Используем простые методы: запишем нули в пропуски, OneHotEncoder для категорий.
Определим алгоритмы, параметры кроссвалидации, метрики и вспомогательные функции.

In [14]:
# pipeline
num_transformer = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0),
    StandardScaler())

categorical_transformer = make_pipeline(
      SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='empty'),
      OneHotEncoder(handle_unknown = 'ignore')) 

preprocessor_all_features=make_column_transformer(
    (num_transformer, num_features),
    (categorical_transformer, categorical_features),
)


# Метрики
# У нас есть своя метрика, поэтому просто списком передать нельзя.
metrics_={
    'f2': metrics.make_scorer(metrics.fbeta_score, beta=2),
    'f1': metrics.make_scorer(metrics.f1_score),
    'recall': metrics.make_scorer(metrics.recall_score),
    'roc_auc': metrics.make_scorer(metrics.roc_auc_score),
}


# Классификаторы
classifiers=[
    RidgeClassifier(),
    LogisticRegression(),
    SGDClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]


# Параметры
k_fold = 5


# Хелперы
def print_report(clf, scores):
    print(clf)
    for score, values in scores.items():
        print(f'{score} mean: {round(np.mean(values), 5)}')
    print()

Запустим базовый pipeline на всей выборке - получим значения метрик относительно которых будем делать дальнейшие изменения

In [15]:
for clf in classifiers:
    pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor_all_features),
        ('classifier', clf)
    ])
    
    scores = cross_validate(pipeline, data, labels, cv=k_fold, scoring=metrics_, n_jobs=-1)
    
    print_report(clf, scores)

RidgeClassifier()
fit_time mean: 15.57956
score_time mean: 0.3493
test_f2 mean: 0.04859
test_f1 mean: 0.06341
test_recall mean: 0.04204
test_roc_auc mean: 0.5098

LogisticRegression()
fit_time mean: 2.72234
score_time mean: 0.35832
test_f2 mean: 0.04866
test_f1 mean: 0.06862
test_recall mean: 0.04076
test_roc_auc mean: 0.51451

SGDClassifier()
fit_time mean: 1.31141
score_time mean: 0.34647
test_f2 mean: 0.06335
test_f1 mean: 0.07764
test_recall mean: 0.0569
test_roc_auc mean: 0.51637

RandomForestClassifier()
fit_time mean: 27.14385
score_time mean: 1.13081
test_f2 mean: 0.00053
test_f1 mean: 0.00085
test_recall mean: 0.00042
test_roc_auc mean: 0.50021

GradientBoostingClassifier()
fit_time mean: 56.97191
score_time mean: 0.3649
test_f2 mean: 0.00738
test_f1 mean: 0.01157
test_recall mean: 0.00594
test_roc_auc mean: 0.50226



Значение метрик около нуля. Сначала попробуем выкинуть признаки с наибольшим кол-вом пропусков. Предположим, что если пропусков больше 95%, то признак можно исключить.

In [16]:
def get_features_na_perc(df, percent):
    na_in_perc = df.isna().sum() / len(df) * 100
    return na_in_perc[na_in_perc < percent].index

def get_cross_featues(a_featues, b_featues):
    return [a for a in a_featues if a in b_featues]

In [18]:
features_95 = get_features_na_perc(data, 95)

preprocessor_95 = make_column_transformer(
    (num_transformer, get_cross_featues(num_features, features_95)),
    (categorical_transformer, get_cross_featues(categorical_features, features_95)),
)

In [19]:
for clf in classifiers:
    pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor_95),
        ('classifier', clf)
    ])
    
    scores = cross_validate(pipeline, data, np.ravel(labels), cv=k_fold, scoring=metrics_, n_jobs=-1)
    
    print_report(clf, scores)

RidgeClassifier()
fit_time mean: 7.56057
score_time mean: 0.18152
test_f2 mean: 0.04246
test_f1 mean: 0.05617
test_recall mean: 0.03652
test_roc_auc mean: 0.50789

LogisticRegression()
fit_time mean: 2.28008
score_time mean: 0.18304
test_f2 mean: 0.04227
test_f1 mean: 0.06029
test_recall mean: 0.03524
test_roc_auc mean: 0.51236

SGDClassifier()
fit_time mean: 1.27543
score_time mean: 0.20761
test_f2 mean: 0.03135
test_f1 mean: 0.04587
test_recall mean: 0.0259
test_roc_auc mean: 0.50931

RandomForestClassifier()
fit_time mean: 32.62009
score_time mean: 0.64823
test_f2 mean: 0.0
test_f1 mean: 0.0
test_recall mean: 0.0
test_roc_auc mean: 0.5

GradientBoostingClassifier()
fit_time mean: 28.64177
score_time mean: 0.19596
test_f2 mean: 0.01002
test_f1 mean: 0.01571
test_recall mean: 0.00807
test_roc_auc mean: 0.50334



Значимых изменений нет. Мы знаем, что у нас не сбалансированные классы -1/1. Попробуем сделать простой оверсемплинг, основанный на случайном выборе.

In [20]:
df = pd.concat([data, labels], axis=1)

origin_not_churn = df[df.Churn == -1]
origin_churn = df[df.Churn == 1]

churn_samples = origin_churn.sample(len(origin_not_churn), replace=True, random_state=0)
oversampled = pd.concat([origin_not_churn, churn_samples], axis=0)

len(oversampled.Churn == -1), len(oversampled.Churn == 1)

(59290, 59290)

In [21]:
for clf in classifiers:
    pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor_all_features),
        ('classifier', clf)
    ])
    
    scores = cross_validate(pipeline, oversampled.drop(columns=['Churn']), np.ravel(oversampled.Churn),
                            cv=k_fold, scoring=metrics_, n_jobs=-1)
    
    print_report(clf, scores)

RidgeClassifier()
fit_time mean: 81.90059
score_time mean: 0.44981
test_f2 mean: 0.97522
test_f1 mean: 0.94269
test_recall mean: 0.99818
test_roc_auc mean: 0.93932

LogisticRegression()
fit_time mean: 7.77205
score_time mean: 0.46597
test_f2 mean: 0.91071
test_f1 mean: 0.88677
test_recall mean: 0.92741
test_roc_auc mean: 0.88158

SGDClassifier()
fit_time mean: 4.85593
score_time mean: 0.44526
test_f2 mean: 0.95056
test_f1 mean: 0.91736
test_recall mean: 0.97413
test_roc_auc mean: 0.91213

RandomForestClassifier()
fit_time mean: 112.22056
score_time mean: 1.33056
test_f2 mean: 0.99985
test_f1 mean: 0.99973
test_recall mean: 0.99993
test_roc_auc mean: 0.99973

GradientBoostingClassifier()
fit_time mean: 92.22338
score_time mean: 0.46674
test_f2 mean: 0.72777
test_f1 mean: 0.71556
test_recall mean: 0.73614
test_roc_auc mean: 0.70737



Оверсемплинг сереьезно увеличил качество модели. Буем рассматривать RandomForestClassifier, SGDClassifier. Нужно доработать препроцессинг данных (пропуски, энкодинг), отобрать фичи, и рассмотреть другие алгоритмы оверсемплинга.