In [None]:
# !unzip data.zip

In [None]:
# !pip3 install --upgrade pandas

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Входные данные

In [17]:
import pandas as pd
import os
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector, ColumnTransformer

PATH_TO_DATA = 'data'

In [3]:
df = pd.read_csv(os.path.join(PATH_TO_DATA, 'orange_small_churn_data.train'))
y = pd.read_csv(os.path.join(PATH_TO_DATA, 'orange_small_churn_labels.train'), header=None, squeeze=True)

In [4]:
df.shape, y.shape

((40000, 230), (40000,))

In [5]:
class NanColumnsDropper(BaseEstimator, TransformerMixin):
    def __init(self):
        super().__init__()

    def fit(self, X):
        self.nan_cols = X.loc[:, X.isna().all()].columns
        return self

    def transform(self, X):
        X_transformed = X.drop(self.nan_cols, axis=1)
        self.cols = X_transformed.columns
        return X_transformed

In [6]:
# import sklearn

# sklearn.__version__

In [18]:
numeric_transformer = SimpleImputer(strategy='most_frequent')

categorical_transformer = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="object")),
    ('cat', categorical_transformer, selector(dtype_include="object"))
])

preprocessor_pipeline = Pipeline([
    ('nan_columns_dropper', NanColumnsDropper()),
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

In [19]:
# Отделим hold-out dataset
df_train, df_ho, y_train, y_valid = train_test_split(df, y, test_size=5000)
# df_train_ho.to_csv(os.path.join(PATH_TO_DATA, 'hold_out_train_dataset.csv'))

In [20]:
%%time
X_train = preprocessor_pipeline.fit_transform(df_train)

CPU times: user 938 ms, sys: 127 ms, total: 1.07 s
Wall time: 1.08 s


In [21]:
X_train = pd.DataFrame(X_train, columns=preprocessor_pipeline['nan_columns_dropper'].cols).astype('int')
X_train

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
0,0,0,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,-1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,-1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,-1,0,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34995,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
34996,0,0,0,0,0,0,0,0,0,0,...,0,-3,0,0,0,0,1,-2,2,0
34997,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,0,0,0,2,0,0,0
34998,0,0,0,0,0,0,0,0,0,0,...,-1,1,0,2,0,0,0,1,2,0


In [22]:
%%time
X_valid = preprocessor_pipeline.fit_transform(df_ho)

CPU times: user 199 ms, sys: 11.5 ms, total: 211 ms
Wall time: 208 ms


In [23]:
X_valid = pd.DataFrame(X_valid, columns=preprocessor_pipeline['nan_columns_dropper'].cols).astype('int')
X_valid

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
0,0,0,0,0,0,0,-1,0,0,0,...,-1,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,-1,0,0,0,...,0,0,1,2,0,0,0,2,0,0
2,0,0,0,0,0,0,0,0,0,0,...,-1,0,-1,0,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,-1,1,0,0,0,1,0,1,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,0,0,0,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,...,1,0,0,2,0,1,0,0,-1,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4997,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Модель

In [24]:
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [25]:
cv = StratifiedShuffleSplit(n_splits=8, random_state=2179)

## LogisticRegression

In [28]:
%%time
scores = cross_val_score(estimator=LogisticRegression(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='roc_auc')
print(scores, '\n', scores.mean())

[0.64909876 0.64772108 0.66336143 0.64783951 0.63708148 0.64949757
 0.63334965 0.64895858] 
 0.6471135079609656
CPU times: user 80.9 ms, sys: 32.3 ms, total: 113 ms
Wall time: 3.53 s


In [29]:
scores = cross_val_score(estimator=LogisticRegression(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='f1')
print(scores, '\n', scores.mean())

[0.         0.00763359 0.00763359 0.         0.0078125  0.
 0.00769231 0.        ] 
 0.0038464979081033472


In [30]:
scores = cross_val_score(estimator=LogisticRegression(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='precision')
print(scores, '\n', scores.mean())

[0.         0.14285714 0.14285714 0.         1.         0.
 0.2        0.        ] 
 0.18571428571428572


In [31]:
scores = cross_val_score(estimator=LogisticRegression(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='recall')
print(scores, '\n', scores.mean())

[0.         0.00392157 0.00392157 0.         0.00392157 0.
 0.00392157 0.        ] 
 0.00196078431372549


In [32]:
logistic_regression = LogisticRegression(random_state=2179)
logistic_regression.fit(X_train, y_train)
print(classification_report(y_valid, logistic_regression.predict(X_valid)))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96      4578
           1       0.00      0.00      0.00       422

    accuracy                           0.92      5000
   macro avg       0.46      0.50      0.48      5000
weighted avg       0.84      0.92      0.87      5000



In [33]:
roc_auc_score(y_valid, logistic_regression.predict_proba(X_valid)[:, 1])

0.6573386213479261

## RidgeClassifier

In [34]:
%%time
scores = cross_val_score(estimator=RidgeClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='roc_auc')
print(scores, '\n', scores.mean())

[0.6457609  0.64693193 0.66345569 0.65056588 0.63642164 0.65032297
 0.63278045 0.65240037] 
 0.6473299797576967
CPU times: user 105 ms, sys: 61.4 ms, total: 166 ms
Wall time: 1.18 s


In [35]:
scores = cross_val_score(estimator=RidgeClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='f1')
print(scores, '\n', scores.mean())

[0. 0. 0. 0. 0. 0. 0. 0.] 
 0.0


In [36]:
scores = cross_val_score(estimator=RidgeClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='precision')
print(scores, '\n', scores.mean())

[0. 0. 0. 0. 0. 0. 0. 0.] 
 0.0


In [37]:
scores = cross_val_score(estimator=RidgeClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='recall')
print(scores, '\n', scores.mean())

[0. 0. 0. 0. 0. 0. 0. 0.] 
 0.0


In [38]:
ridge_classifier = RidgeClassifier(random_state=2179)
ridge_classifier.fit(X_train, y_train)
print(classification_report(y_valid, ridge_classifier.predict(X_valid)))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96      4578
           1       0.00      0.00      0.00       422

    accuracy                           0.92      5000
   macro avg       0.46      0.50      0.48      5000
weighted avg       0.84      0.92      0.88      5000



In [39]:
roc_auc_score(y_valid, ridge_classifier.predict(X_valid))

0.4998907820008737

## SGDClassifier

In [65]:
%%time
scores = cross_val_score(estimator=SGDClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
print(scores, '\n', scores.mean())

[0.57062509 0.57363667 0.59801565 0.62362186 0.56584368 0.51571407
 0.59333998 0.49476721] 
 0.5669455270551981
CPU times: user 142 ms, sys: 314 ms, total: 456 ms
Wall time: 3.78 s


In [66]:
scores = cross_val_score(estimator=SGDClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='f1')
print(scores, '\n', scores.mean())

[0.0077821  0.         0.         0.0077821  0.00740741 0.01503759
 0.         0.        ] 
 0.00475115046587502


In [67]:
scores = cross_val_score(estimator=SGDClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='precision')
print(scores, '\n', scores.mean())

[0.5        0.         0.         0.5        0.06666667 0.18181818
 0.         0.        ] 
 0.15606060606060607


In [68]:
scores = cross_val_score(estimator=SGDClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='recall')
print(scores, '\n', scores.mean())

[0.00392157 0.         0.         0.00392157 0.00392157 0.00784314
 0.         0.        ] 
 0.0024509803921568627


In [69]:
sgd_classifier = SGDClassifier(random_state=2179)
sgd_classifier.fit(X_train, y_train)
print(classification_report(y_valid, sgd_classifier.predict(X_valid)))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96      4578
           1       0.00      0.00      0.00       422

    accuracy                           0.92      5000
   macro avg       0.46      0.50      0.48      5000
weighted avg       0.84      0.92      0.87      5000



In [70]:
roc_auc_score(y_valid, sgd_classifier.predict(X_valid))

0.4996723460026212

### SGDClassifier log loss

In [71]:
%%time
scores = cross_val_score(estimator=SGDClassifier(random_state=2179, loss='log'), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
print(scores, '\n', scores.mean())

[0.63493399 0.61954984 0.63868999 0.63484939 0.60523037 0.63878667
 0.6165999  0.62798514] 
 0.6270781594610109
CPU times: user 70.5 ms, sys: 39.6 ms, total: 110 ms
Wall time: 2.29 s


In [72]:
scores = cross_val_score(estimator=SGDClassifier(random_state=2179, loss='log'), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='f1')
print(scores, '\n', scores.mean())

[0.00763359 0.00763359 0.         0.01470588 0.00772201 0.
 0.02205882 0.02962963] 
 0.011172939850813673


In [73]:
scores = cross_val_score(estimator=SGDClassifier(random_state=2179, loss='log'), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='precision')
print(scores, '\n', scores.mean())

[0.14285714 0.14285714 0.         0.11764706 0.25       0.
 0.17647059 0.26666667] 
 0.137062324929972


In [74]:
scores = cross_val_score(estimator=SGDClassifier(random_state=2179, loss='log'), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='recall')
print(scores, '\n', scores.mean())

[0.00392157 0.00392157 0.         0.00784314 0.00392157 0.
 0.01176471 0.01568627] 
 0.0058823529411764705


In [75]:
sgd_classifier_log = SGDClassifier(random_state=2179, loss='log')
sgd_classifier_log.fit(X_train, y_train)
print(classification_report(y_valid, sgd_classifier_log.predict(X_valid)))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96      4578
           1       0.17      0.00      0.00       422

    accuracy                           0.91      5000
   macro avg       0.54      0.50      0.48      5000
weighted avg       0.85      0.91      0.88      5000



In [76]:
roc_auc_score(y_valid, sgd_classifier_log.predict_proba(X_valid)[:, 1])

0.6263559078137972

## RandomForestClassifier

In [77]:
%%time
random_forest_classifier = RandomForestClassifier(oob_score=True, random_state=2179)
random_forest_classifier.fit(X_train, y_train)
random_forest_classifier.oob_score_

CPU times: user 7.55 s, sys: 35.9 ms, total: 7.59 s
Wall time: 7.58 s


0.9269714285714286

In [78]:
%%time
scores = cross_val_score(estimator=RandomForestClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
print(scores, '\n', scores.mean())

[0.66553733 0.64807638 0.6583921  0.64130276 0.66309617 0.66911387
 0.64850962 0.65472673] 
 0.6560943684099216
CPU times: user 57.3 ms, sys: 50.7 ms, total: 108 ms
Wall time: 10.7 s


In [79]:
scores = cross_val_score(estimator=RandomForestClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='f1')
print(scores, '\n', scores.mean())

[0. 0. 0. 0. 0. 0. 0. 0.] 
 0.0


In [80]:
scores = cross_val_score(estimator=RandomForestClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='precision')
print(scores, '\n', scores.mean())

[0. 0. 0. 0. 0. 0. 0. 0.] 
 0.0


In [82]:
scores = cross_val_score(estimator=RandomForestClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='recall')
print(scores, '\n', scores.mean())

[0. 0. 0. 0. 0. 0. 0. 0.] 
 0.0


In [83]:
print(classification_report(y_valid, random_forest_classifier.predict(X_valid)))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96      4578
           1       0.00      0.00      0.00       422

    accuracy                           0.92      5000
   macro avg       0.46      0.50      0.48      5000
weighted avg       0.84      0.92      0.88      5000



In [84]:
roc_auc_score(y_valid, random_forest_classifier.predict_proba(X_valid)[:, 1])

0.6673372444764679

## GradientBoostingClassifier

In [85]:
%%time
scores = cross_val_score(estimator=GradientBoostingClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, scoring='roc_auc')
print(scores, scores.mean())

[0.70677906 0.71401432 0.71589414 0.687425   0.70362186 0.72363455
 0.71975649 0.70141636] 0.7090677210791867
CPU times: user 68.5 ms, sys: 40.2 ms, total: 109 ms
Wall time: 14 s


In [86]:
scores = cross_val_score(estimator=GradientBoostingClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='f1')
print(scores, '\n', scores.mean())

[0.0078125  0.0078125  0.0077821  0.         0.         0.
 0.0078125  0.00775194] 
 0.004871442393976412


In [87]:
scores = cross_val_score(estimator=GradientBoostingClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='precision')
print(scores, '\n', scores.mean())

[1.         1.         0.5        0.         0.         0.
 1.         0.33333333] 
 0.47916666666666663


In [88]:
scores = cross_val_score(estimator=GradientBoostingClassifier(random_state=2179), X=X_train, y=y_train, cv=cv, n_jobs=-1, 
                         scoring='recall')
print(scores, '\n', scores.mean())

[0.00392157 0.00392157 0.00392157 0.         0.         0.
 0.00392157 0.00392157] 
 0.0024509803921568627


In [89]:
gb_classifier = GradientBoostingClassifier(random_state=2179)
gb_classifier.fit(X_train, y_train)
print(classification_report(y_valid, gb_classifier.predict(X_valid)))

              precision    recall  f1-score   support

          -1       0.92      1.00      0.96      4578
           1       0.00      0.00      0.00       422

    accuracy                           0.92      5000
   macro avg       0.46      0.50      0.48      5000
weighted avg       0.84      0.92      0.88      5000



In [90]:
roc_auc_score(y_valid, gb_classifier.predict_proba(X_valid)[:, 1])

0.7127077471277219

# Тест

In [91]:
def write_to_submission_file(predicted_labels, out_file,
                             target='result', index_label="id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(0, predicted_labels.shape[0]),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [92]:
df_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'orange_small_churn_test_data.csv'), index_col=0)
df_test

Unnamed: 0_level_0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,1225.0,7.0,,,,...,zCkv,APgdzOv,jySVZNlOJy,,ELof,xb3V,6fzt,Zy3gnGM,,
1,,,,,,896.0,14.0,,,,...,oslk,IIvC99a,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,
2,,,,,,791.0,7.0,,,,...,oslk,6YSocsg,LM8l689qOp,,kG3k,rgKb,RAYp,F2FyR07IdsN7I,mj86,
3,,,,,,2296.0,7.0,,,,...,oslk,5nQ7A2G,jySVZNlOJy,,kG3k,rgKb,RAYp,F2FyR07IdsN7I,am7c,
4,8.0,,,,,,,,28.0,,...,oslk,MI8s5nE,LM8l689qOp,,,7P5s,RAYp,F2FyR07IdsN7I,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,,,,,714.0,7.0,,,,...,zCkv,P6pu4Vl,LM8l689qOp,,ELof,xb3V,ZI9m,R4y5gQQWY8OodqDV,am7c,
9996,,,,,,812.0,7.0,,,,...,oslk,sXbT3Cb,LM8l689qOp,,ELof,WqMG,RAYp,55YFVY9,mj86,
9997,,,,,,819.0,7.0,,,,...,oslk,05jAV0N,M_8D,,ELof,TNEC,RAYp,55YFVY9,am7c,
9998,,,,,,,7.0,,,,...,oslk,RDY7kpB,LM8l689qOp,,,uWr3,RAYp,55YFVY9,,


In [93]:
%%time
X_test = preprocessor_pipeline.fit_transform(df_test)

CPU times: user 318 ms, sys: 10.4 ms, total: 328 ms
Wall time: 327 ms


In [94]:
X_test = pd.DataFrame(X_test, columns=preprocessor_pipeline['nan_columns_dropper'].cols).astype('int')
X_test

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var9,Var10,Var11,...,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229
0,0,0,0,0,0,0,0,0,0,0,...,0,1,-1,2,0,0,2,-1,1,0
1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,2,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,-1,0,-1,0,0,1,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,-1,2,0,1,1,0,0,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,2,1,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,-1,0,1,0,0,0,0,0,-1,1
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,0,0,0,0,0,-1,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,-1,0


In [95]:
prediction = gb_classifier.predict_proba(X_test)

In [96]:
write_to_submission_file(prediction[:, 1], 'result.csv')