Решить задачу просрочки кредита используя для прогназирования нейронные сети. Посмотреть работу с несбалансированными данными. 


Набор данных представляет собой исторические данные по 251503 заемщикам (https://www.kaggle.com/c/GiveMeSomeCredit/data ).
Задача является бинарной классификацией. Цель - предсказать будет ли тот или иной заемщик испытывать финансовые трудности в ближайшие 2 года, т.е. будет ли просрочка по займу. 
Выборка разделена на тренировочную и тестовую ( 150000 в тренировочной части, 101503 в тестовой).

In [68]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, accuracy_score

import xgboost as xgb

import catboost as cb

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE






In [39]:
df_train = pd.read_csv('cs-training.csv', index_col=0)

Variable Name	Description	Type
SeriousDlqin2yrs	Person experienced 90 days past due delinquency or worse 	Y/N
RevolvingUtilizationOfUnsecuredLines	Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits	percentage
age	Age of borrower in years	integer
NumberOfTime30-59DaysPastDueNotWorse	Number of times borrower has been 30-59 days past due but no worse in the last 2 years.	integer
DebtRatio	Monthly debt payments, alimony,living costs divided by monthy gross income	percentage
MonthlyIncome	Monthly income	real
NumberOfOpenCreditLinesAndLoans	Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards)	integer
NumberOfTimes90DaysLate	Number of times borrower has been 90 days or more past due.	integer
NumberRealEstateLoansOrLines	Number of mortgage and real estate loans including home equity lines of credit	integer
NumberOfTime60-89DaysPastDueNotWorse	Number of times borrower has been 60-89 days past due but no worse in the last 2 years.	integer
NumberOfDependents	Number of dependents in family excluding themselves (spouse, children etc.)	integer

In [5]:
df_train.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [6]:
df_train[df_train['RevolvingUtilizationOfUnsecuredLines'] > 1]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
163,1,1.046279,47,1,1.104301,5416.0,6,0,2,0,1.0
192,0,1.095083,53,5,0.536704,3500.0,7,1,1,2,0.0
227,1,1.953488,38,0,0.153500,3556.0,5,0,0,0,2.0
252,1,1.048211,58,7,0.151957,3500.0,12,0,0,0,0.0
294,0,2340.000000,45,0,0.339333,8333.0,7,0,2,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
149940,0,1.049900,26,0,0.073220,6500.0,6,0,0,1,0.0
149956,1,1.135552,41,2,0.845887,7500.0,12,0,4,1,0.0
149963,0,1.005733,48,0,0.248896,2944.0,4,3,0,0,1.0
149965,0,1.010934,63,0,0.608211,6015.0,8,0,1,0,0.0


In [7]:
df_train[df_train['DebtRatio'] > 1]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
7,0,0.305682,57,0,5710.000000,,8,0,3,0,0.0
9,0,0.116951,27,0,46.000000,,2,0,0,0,
15,0,0.019657,76,0,477.000000,0.0,6,0,1,0,0.0
17,0,0.061086,78,0,2058.000000,,10,0,2,0,0.0
26,1,0.392248,50,0,1.595253,4676.0,14,0,3,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
149977,0,0.000627,76,0,60.000000,,5,0,0,0,0.0
149978,0,0.236450,29,0,349.000000,,3,0,0,0,0.0
149985,0,0.037548,84,0,25.000000,,5,0,0,0,0.0
149993,0,0.871976,50,0,4132.000000,,11,0,1,0,3.0


In [8]:

df_train = df_train.rename(columns=lambda x: x.replace('-', '_'))
df_train.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30_59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60_89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30_59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60_89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtype

Анализ данных показывает, что все данные числовые, но есть два признака с пропущенными значениями

In [25]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeriousDlqin2yrs,150000.0,0.06684,0.249746,0.0,0.0,0.0,0.0,1.0
RevolvingUtilizationOfUnsecuredLines,150000.0,6.048438,249.755371,0.0,0.029867,0.154181,0.559046,50708.0
age,150000.0,52.295207,14.771866,0.0,41.0,52.0,63.0,109.0
NumberOfTime30-59DaysPastDueNotWorse,150000.0,0.421033,4.192781,0.0,0.0,0.0,0.0,98.0
DebtRatio,150000.0,353.005076,2037.818523,0.0,0.175074,0.366508,0.868254,329664.0
MonthlyIncome,120269.0,6670.221237,14384.674215,0.0,3400.0,5400.0,8249.0,3008750.0
NumberOfOpenCreditLinesAndLoans,150000.0,8.45276,5.145951,0.0,5.0,8.0,11.0,58.0
NumberOfTimes90DaysLate,150000.0,0.265973,4.169304,0.0,0.0,0.0,0.0,98.0
NumberRealEstateLoansOrLines,150000.0,1.01824,1.129771,0.0,0.0,1.0,2.0,54.0
NumberOfTime60-89DaysPastDueNotWorse,150000.0,0.240387,4.155179,0.0,0.0,0.0,0.0,98.0


In [14]:
df_train["NumberOfTime30_59DaysPastDueNotWorse"].value_counts()

0     126018
1      16033
2       4598
3       1754
4        747
5        342
98       264
6        140
7         54
8         25
9         12
96         5
10         4
12         2
13         1
11         1
Name: NumberOfTime30_59DaysPastDueNotWorse, dtype: int64

In [15]:
df_train["SeriousDlqin2yrs"].value_counts()

0    139974
1     10026
Name: SeriousDlqin2yrs, dtype: int64

In [16]:

df_train["NumberOfTimes90DaysLate"].value_counts()

0     141662
1       5243
2       1555
3        667
4        291
98       264
5        131
6         80
7         38
8         21
9         19
10         8
96         5
11         5
13         4
15         2
14         2
12         2
17         1
Name: NumberOfTimes90DaysLate, dtype: int64

In [17]:
df_train["NumberOfTime60_89DaysPastDueNotWorse"].value_counts()

0     142396
1       5731
2       1118
3        318
98       264
4        105
5         34
6         16
7          9
96         5
8          2
11         1
9          1
Name: NumberOfTime60_89DaysPastDueNotWorse, dtype: int64

In [34]:
st_scaler = StandardScaler()
st_scaler.fit(df_train)


In [40]:
def data_preparation(data):
    
    data = data.rename(columns=lambda x: x.replace('-', '_'))
    data = data.drop(data[data["NumberOfTime30_59DaysPastDueNotWorse"]>=96].index)
    data = data.drop(data[data["NumberOfTimes90DaysLate"]>=96].index)
    data = data.drop(data[data["NumberOfTime60_89DaysPastDueNotWorse"]>=96].index)
    data = data.drop(data[data["age"]==0].index)
    data = data.drop(data[data['DebtRatio'] > 1].index)
    data = data.drop(data[data['RevolvingUtilizationOfUnsecuredLines'] > 1].index)
    data = data.dropna()

    return data
    

In [44]:
X = data_preparation(df_train)

In [42]:
X

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30_59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60_89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149995,0,0.385742,50,0,0.404293,3400.0,7,0,0,0,0.0
149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [45]:
Y = X["SeriousDlqin2yrs"]
X = X.drop(["SeriousDlqin2yrs"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)
st_scaler = StandardScaler()
st_scaler.fit(X_train)
st_scaler.fit(X_test)

In [46]:
X.shape

(110289, 10)

In [43]:
# ! pip install imbalanced-learn




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
ada = ADASYN(
    sampling_strategy='auto',  # samples only the minority class
    random_state=0,  # for reproducibility
    n_neighbors=5,
)

X_res_train, y_res_train = ada.fit_resample(X_train, y_train)

In [69]:
sm = SMOTE(random_state=42, k_neighbors=5)
X_res_train, y_res_train = sm.fit_resample(X_train, y_train)

In [70]:
y_res_train.value_counts()

0    77821
1    77821
Name: SeriousDlqin2yrs, dtype: int64

In [49]:
X_res_train.shape

(155993, 10)

In [71]:
# Определяем модель RandomForestClassifier
rf_model = RandomForestClassifier()

# Определяем сетку гиперпараметров для GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],          # Количество деревьев в ансамбле
    'max_depth': [None, 10, 20, 30],         # Максимальная глубина деревьев
    'min_samples_split': [2, 5, 10],         # Минимальное количество примеров, необходимое для разделения узла
    'min_samples_leaf': [1, 2, 4]            # Минимальное количество примеров в листовых узлах
}

# Инициализируем GridSearchCV с моделью и сеткой гиперпараметров
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Обучаем модель на обучающих данных с помощью GridSearchCV
grid_search.fit(X_res_train, y_res_train)

# Получаем наилучшие параметры и модель
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Оцениваем модель на тестовых данных
y_pred_rf = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
classification_rep = classification_report(y_test, y_pred_rf)
# print(roc_auc_score(y_test, rf_model.predict_proba(X_valid_qt)[:,1]))
print(roc_auc_score(y_test, y_pred_rf))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
0.6417645776348109


In [72]:
print(classification_rep)

              precision    recall  f1-score   support

           0       0.96      0.93      0.94     25882
           1       0.25      0.35      0.29      1691

    accuracy                           0.90     27573
   macro avg       0.61      0.64      0.62     27573
weighted avg       0.91      0.90      0.90     27573



In [63]:
# формирование Pipeline
XGB_clf = xgb.XGBClassifier()



# Создание параметров модели
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
        
}

# Создание сетки параметров для перебора
param_grid = {
    'classifier__max_depth': [3, 5, 7, 9],
    'classifier__learning_rate': [0.1, 0.01],
    'classifier__n_estimators': [100, 500, 1000]
}

# Создание объекта GridSearchCV
grid_search = GridSearchCV(estimator=XGB_clf,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1)

# Поиск наилучших параметров
grid_search.fit(X_res_train, y_res_train)

# Вывод наилучших параметров
print("Наилучшие параметры:", grid_search.best_params_)

# Получение наилучшей модели
best_model = grid_search.best_estimator_

# Предсказание на тестовой выборке
pred_XGB_clf = best_model.predict(X_test)
print(classification_report(y_test, pred_XGB_clf))

Parameters: { "classifier__learning_rate", "classifier__max_depth", "classifier__n_estimators" } are not used.

Наилучшие параметры: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
              precision    recall  f1-score   support

           0       0.96      0.92      0.94     25882
           1       0.23      0.36      0.28      1691

    accuracy                           0.89     27573
   macro avg       0.59      0.64      0.61     27573
weighted avg       0.91      0.89      0.90     27573



In [64]:
print(roc_auc_score(y_test, pred_XGB_clf))

0.6408797448603453
