In [46]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [28]:
#считываем данные 
df = pd.read_csv('ML_final_preprocessing.csv')
df.head()

Unnamed: 0,id,pre_since_opened_0,pre_since_opened_1,pre_since_opened_2,pre_since_opened_3,pre_since_opened_4,pre_since_opened_5,pre_since_opened_6,pre_since_opened_7,pre_since_opened_8,...,enc_loans_credit_type_6,enc_loans_credit_type_7,pre_loans_max_overdue_sum_0,pre_loans3060_0,pre_loans3060_3,pre_loans5_10,pre_loans530_17,pre_loans3060_4,pre_loans6090_0,flag
0,0,0,1,1,1,1,2,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,0,0,1,0,0,0,0,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3,0,3,1,0,2,1,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [29]:
df.shape

(3000000, 416)

In [30]:
# разделим на обучающую и отложенную выборку
df_train, df_test = train_test_split(df, stratify=df['flag'], test_size = 0.3, random_state = 1)

In [31]:
# работаем с положительными и отрицательными значениями

df_min = df_train[df_train['flag'] == 1]
df_maj = df_train[df_train['flag'] == 0]

df_maj_dwn = resample(df_maj, replace = False, n_samples = len(df_min), random_state = 1)

In [32]:
print(df_maj_dwn.shape)
print(df_min.shape)

(74509, 416)
(74509, 416)


In [33]:
# склеиваем обе части и смотрим размерность сета
df_train_ump = pd.concat([df_maj_dwn, df_min], ignore_index=True).sample(frac = 1)

In [34]:
df_train_ump.shape

(149018, 416)

In [47]:
def fit():
    X = df_train_ump.drop('flag', axis = 1)
    y = df_train_ump['flag']
    
    num_transf = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    column_trans = Pipeline(steps=[
        ('numerical', num_transf)
    ])
    
    preprocessing = Pipeline(steps=[
        ('column_transforms', column_trans)    
    ])
    
    
    models = [
        LogisticRegression(solver = 'liblinear', penalty = 'l2'),
        RandomForestClassifier(max_depth = 200),
        KNeighborsClassifier(n_neighbors=15),
        MLPRegressor(),
        XGBRegressor()
    ]
    
    best_score = .0
    best_pipe = None
    for model in models:
        pipe = Pipeline(steps = [
            ('preprocessing', preprocessing),
            ('models', model)
        ])
        
        score = cross_val_score(pipe, X, y, cv = 4, scoring = 'roc_auc')
        print(f'model: {type(model).__name__}, roc_auc: {score.mean():.4f}')

In [48]:
#выведем результат работы
fit()

model: LogisticRegression, roc_auc: 0.7347
model: RandomForestClassifier, roc_auc: 0.7380
model: KNeighborsClassifier, roc_auc: 0.6942
model: MLPRegressor, roc_auc: 0.6954
model: XGBRegressor, roc_auc: 0.7496


In [None]:
# видим, что лучший резудьтат у XBG, попробуем еще улучшить модель 

In [71]:
params = {'max_depth': [3, 5, 7],
          'learning_rate': [0.1, 0.01, 0.001]}

In [72]:
gsc_xgb = GridSearchCV(XGBRegressor(), params, cv = 4)
X = df_train_ump.drop('flag', axis = 1)
y = df_train_ump['flag']
gsc_xgb.fit(X, y)

In [73]:
gsc_xgb.best_params_

{'learning_rate': 0.1, 'max_depth': 7}

In [74]:
xgb_best = XGBRegressor(learning_rate = 0.1, max_depth = 7)
xgb_best.fit(X,y)

In [75]:
# Пробуем на тестовой выборке 
Xtest = df_test.drop('flag', axis = 1)
ytest = df_test['flag']

test_predict = xgb_best.predict(Xtest)

In [76]:
roc_auc_score(ytest, test_predict)

0.7561335048299193

Цель достигнута! 