In [183]:
import pandas as pd
import numpy as np

import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb

In [194]:
df = pd.read_csv('orange_small_churn_train_data.csv', index_col=['ID'])
df['labels'].fillna(0, inplace=True)
labels, train = df['labels'], df.drop(columns=['labels'])
labels.replace({-1: 0}, inplace=True)
labels = labels.values

In [195]:
percent_missing = train.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing}).sort_values('percent_missing', ascending=False)

In [196]:
train = train[list(missing_value_df[missing_value_df.percent_missing < 50.].index)]

In [197]:
train.head()

Unnamed: 0_level_0,Var72,Var94,Var126,Var109,Var149,Var24,Var144,Var81,Var206,Var6,...,Var73,Var113,Var212,Var193,Var210,Var207,Var204,Var196,Var195,Var198
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,4.0,144.0,389396.0,20.0,9.0,14599.92,IYzP,3052.0,...,34,-1209960.0,JBfYVit4g8,AERks4l,uKAI,GjJ35utlTa_GNSvxxpb9ju,k13i,1K8T,taul,UaKK0yW
1,3.0,32289.0,40.0,80.0,735.0,2.0,18.0,67529.09,haYg,1813.0,...,128,417932.0,XfqtO3UdzaXh_,2Knk1KF,uKAI,me75fM6ugJ,FbIm,1K8T,taul,Bnunsla
2,3.0,53388.0,36.0,40.0,0.0,0.0,27.0,85266.0,hAFG,1953.0,...,166,-124655.2,4kVnq_T26xq1p,LrdZy8QqgUfkVShG,uKAI,7M47J5GA0pTYIFxg5uy,mTeA,1K8T,taul,fhk21Ss
3,,,,32.0,0.0,0.0,0.0,74107.2,IYzP,1533.0,...,30,378473.6,NhsEn4L,RO12,uKAI,me75fM6ugJ,vzJD,1K8T,taul,uoZk2Zj
4,3.0,106455.0,-28.0,32.0,554414.0,2.0,9.0,171072.9,zm5i,686.0,...,32,142602.4,NhsEn4L,RO12,uKAI,me75fM6ugJ,m_h1,1K8T,taul,kugYdIL


In [198]:
def fillna_imputer(df):
    '''
    Заменяет Nan значения на строковый маркер 'Nan' и добавляет boolean столбец для каждого признака
    '''
    column_names = df.columns.to_list()
    result = pd.DataFrame()
    for name in column_names:
        result[name] = df[name].fillna(value = 'missing')
        result[name + '_bool'] = df[name].isna()
    return result.reset_index(drop = True)

bool_fillna_imputer = FunctionTransformer(fillna_imputer, validate=False)

In [200]:
numeric_features = list(set([f'Var{i}' for i in range(1, 191)]).intersection(set(train.columns)))
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('variancethreshold', VarianceThreshold(0.1)),
    ('scaler', StandardScaler())])

categorical_features = list(set([f'Var{i}' for i in range(191, 230)]).intersection(set(train.columns)))
categorical_transformer = Pipeline(steps=[
    ('imputer', bool_fillna_imputer),
    ('onehot', ce.CatBoostEncoder(random_state=0)),
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [201]:
preprocessor.fit_transform(train, labels.ravel())

array([[-2.94149535e-01, -2.19317379e-01,  5.79649539e-01, ...,
        -7.39261669e-03, -1.27983732e-01,  0.00000000e+00],
       [ 1.16495601e+00,  5.85755583e-01,  1.49970343e-01, ...,
        -7.39261669e-03, -1.27983732e-01,  0.00000000e+00],
       [ 1.21977637e+00,  1.39082855e+00, -1.18579154e-01, ...,
        -7.39261669e-03, -1.27983732e-01,  0.00000000e+00],
       ...,
       [ 1.51397538e+00,  3.80604743e+00, -1.18579154e-01, ...,
        -7.39261669e-03, -1.40522086e+00,  0.00000000e+00],
       [ 1.53238953e+00, -1.02439034e+00, -3.33418752e-01, ...,
        -7.39261669e-03,  3.34409092e-01,  0.00000000e+00],
       [-1.81538844e-02,  2.19590151e+00, -3.87128651e-01, ...,
         1.35270100e+02, -5.76883982e-01,  0.00000000e+00]])

## RidgeRegressor

In [202]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RidgeClassifier(random_state=0, class_weight='balanced'))])

In [203]:
cross_val_score(clf, train, labels, scoring='roc_auc', cv=5, n_jobs=2)

array([0.65897032, 0.66966456, 0.70612773, 0.7006983 , 0.66199549])

In [204]:
cross_val_score(clf, train, labels, scoring='balanced_accuracy', cv=5, n_jobs=2)

array([0.60965758, 0.61971264, 0.6469447 , 0.64893617, 0.60931926])

## RandomForestClassifier

In [205]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=0, class_weight='balanced'))])

In [206]:
cross_val_score(clf, train, labels, scoring='roc_auc', cv=5, n_jobs=2)

array([0.65476997, 0.67880328, 0.6887618 , 0.67133698, 0.65804911])

In [207]:
cross_val_score(clf, train, labels, scoring='balanced_accuracy', cv=5, n_jobs=2)

array([0.5, 0.5, 0.5, 0.5, 0.5])

## GradientBoostingClassifier

In [208]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GradientBoostingClassifier(random_state=0))])

In [209]:
cross_val_score(clf, df, labels, scoring='roc_auc', cv=5, n_jobs=2)

array([0.67453793, 0.71527595, 0.74538697, 0.72359805, 0.71971416])

In [210]:
cross_val_score(clf, df, labels, scoring='balanced_accuracy', cv=5, n_jobs=2)

array([0.50992077, 0.50334094, 0.5083192 , 0.51179463, 0.50515904])

## Submit

In [211]:
test = pd.read_csv('orange_small_churn_test_data.csv')

In [212]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', CatBoostClassifier(iterations=1000,loss_function='Logloss',od_wait=200,verbose=False))])

In [214]:
cross_val_score(clf, train, labels, scoring='balanced_accuracy', cv=5, n_jobs=2)

array([0.50820465, 0.50334094, 0.51209014, 0.51209014, 0.50530679])

In [80]:
clf.fit(df, labels.ravel())

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('variancethreshold',
                                                                   VarianceThreshold(threshold=0.1)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Var57', 'Var144', 'Var109',
                                                   'Var78', 'Var6', 'Var13',
                                                   'Var126', 'Var112', 'Var35',
                                                   'Var25', 'Var94', 'Var76',
                                                   'Var133',

In [81]:
predict = clf.predict(test)

In [82]:
result = pd.DataFrame(predict, columns=['result'])
result.index.name = 'Id'
result.to_csv('submit.csv')