In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

import sys
# Extend the module search path so that the project-local ml_helper module can be imported.
# NOTE(review): this is a hard-coded absolute path; the import below will fail on a
# machine where the project lives elsewhere — consider a path relative to the notebook.
sys.path.append('/Python_ML_lib_classification/Course_project/dev/')
import ml_helper as mlhp

# Render matplotlib figures inline in the notebook, using the SVG figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# Show floats with a thousands separator and two decimal places in pandas output.
pd.options.display.float_format = '{:,.2f}'.format

## Обзор данных

### Описание датасета

    Home Ownership - домовладение
    Annual Income - годовой доход
    Years in current job - количество лет на текущем месте работы
    Tax Liens - налоговые обременения
    Number of Open Accounts - количество открытых счетов
    Years of Credit History - количество лет кредитной истории
    Maximum Open Credit - наибольший открытый кредит
    Number of Credit Problems - количество проблем с кредитом
    Months since last delinquent - количество месяцев с последней просрочки платежа
    Bankruptcies - банкротства
    Purpose - цель кредита
    Term - срок кредита
    Current Loan Amount - текущая сумма кредита
    Current Credit Balance - текущий кредитный баланс
    Monthly Debt - ежемесячный долг
    Credit Score - кредитный рейтинг
    Credit Default - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)



### Настраиваем константы и загружаем данные


In [2]:
# Dataset locations, given relative to the notebook's working directory.
TRAIN_DATASET_PATH = 'course_project_train.csv'
TEST_DATASET_PATH = 'course_project_test.csv'
# Name of the label column; it appears in the training data only.
TARGET = 'Credit Default'

# Read both CSV files with pandas' default parsing options.
df_train = pd.read_csv(TRAIN_DATASET_PATH)
df_test = pd.read_csv(TEST_DATASET_PATH)

### Анализируем обучающие данные

In [3]:
df_train.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [4]:
df_train[TARGET].value_counts()

0    5387
1    2113
Name: Credit Default, dtype: int64

In [5]:
# Pairwise correlations of the numeric columns, rendered as a heat map with two decimals.
# select_dtypes restricts the computation to numeric columns explicitly, because the frame
# still holds object columns here and recent pandas no longer drops them silently in corr().
# Styler.format is used instead of Styler.set_precision, which recent pandas has removed.
df_train.select_dtypes(include='number').corr().style.background_gradient(cmap='coolwarm').format('{:.2f}')

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
Annual Income,1.0,0.07,0.19,0.19,0.06,-0.02,-0.1,-0.07,0.03,0.39,0.58,-0.04,-0.1
Tax Liens,0.07,1.0,-0.01,0.01,-0.0,0.6,0.05,0.06,-0.01,-0.01,0.03,0.01,0.02
Number of Open Accounts,0.19,-0.01,1.0,0.14,0.02,-0.02,-0.03,-0.02,0.0,0.27,0.41,0.02,0.03
Years of Credit History,0.19,0.01,0.14,1.0,0.03,0.07,-0.02,0.09,0.01,0.22,0.19,-0.02,-0.02
Maximum Open Credit,0.06,-0.0,0.02,0.03,1.0,-0.0,-0.0,-0.01,-0.0,0.09,0.01,-0.01,-0.01
Number of Credit Problems,-0.02,0.6,-0.02,0.07,-0.0,1.0,0.14,0.73,-0.01,-0.12,-0.04,-0.02,0.02
Months since last delinquent,-0.1,0.05,-0.03,-0.02,-0.0,0.14,1.0,0.16,0.0,-0.03,-0.06,-0.0,0.0
Bankruptcies,-0.07,0.06,-0.02,0.09,-0.01,0.73,0.16,1.0,-0.01,-0.14,-0.08,-0.03,0.0
Current Loan Amount,0.03,-0.01,0.0,0.01,-0.0,-0.01,0.0,-0.01,1.0,0.02,-0.01,-0.11,-0.23
Current Credit Balance,0.39,-0.01,0.27,0.22,0.09,-0.12,-0.03,-0.14,0.02,1.0,0.5,-0.01,-0.02


In [6]:
df_train.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366391.72,0.03,11.13,18.32,945153.73,0.17,34.69,0.12,11873177.45,289833.24,18314.45,1151.09,0.28
std,845339.2,0.27,4.91,7.04,16026216.67,0.5,21.69,0.35,31926122.97,317871.38,11926.76,1604.45,0.45
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.8,793501.5,0.0,50.0,0.0,519882.0,360406.25,23818.0,743.0,1.0
max,10149344.0,7.0,43.0,57.7,1304726170.0,7.0,118.0,4.0,99999999.0,6506797.0,136679.0,7510.0,1.0


In [7]:
mlhp.get_object_value_counts(df_train)

Home Ownership

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64
****************************************************************************************************

Years in current job

10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64
****************************************************************************************************

Purpose

debt consolidation      5944
other                    665
home improvements        412
business loan            129
buy a car                 96
medical bills             71
major purchase            40
take a trip               37
buy house                 34
small business            26
wedding                   15
moving                    11
educational expenses      10
vacation  

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [9]:
mlhp.get_nan_percent(df_train)

Home Ownership                  0.00
Annual Income                  20.76
Years in current job            4.95
Tax Liens                       0.00
Number of Open Accounts         0.00
Years of Credit History         0.00
Maximum Open Credit             0.00
Number of Credit Problems       0.00
Months since last delinquent   54.41
Bankruptcies                    0.19
Purpose                         0.00
Term                            0.00
Current Loan Amount             0.00
Current Credit Balance          0.00
Monthly Debt                    0.00
Credit Score                   20.76
Credit Default                  0.00
dtype: float64


### После анализа видим:
По таблице корреляции видим зависимости целевой переменной с:
- Credit Score 0.44
- Current Loan Amount -0.23
- Annual Income -0.10

Присутствуют пропущенные данные по:
- Annual Income
- Years in current job
- Months since last delinquent 
- Bankruptcies
- Credit Score

## Проанализируем тестовый датасет

In [10]:
mlhp.get_nan_percent(df_test)

Home Ownership                  0.00
Annual Income                  20.52
Years in current job            3.44
Tax Liens                       0.00
Number of Open Accounts         0.00
Years of Credit History         0.00
Maximum Open Credit             0.00
Number of Credit Problems       0.00
Months since last delinquent   54.32
Bankruptcies                    0.12
Purpose                         0.00
Term                            0.00
Current Loan Amount             0.00
Current Credit Balance          0.00
Monthly Debt                    0.00
Credit Score                   20.52
dtype: float64


### Построение новых признаков

In [11]:
# mlhp.add_id(df_train)
# mlhp.add_id(df_test)

In [12]:
def home_ownership_convert(df: pd.DataFrame):
    """Replace the 'Home Ownership' labels with integer codes, in place.

    Codes: 'Home Mortgage' -> 1, 'Rent' -> 2, 'Own Home' -> 3, 'Have Mortgage' -> 4.

    Args:
        df: frame with a 'Home Ownership' column holding the string labels above.
            The column is overwritten; nothing is returned.

    Raises:
        ValueError: if the column contains a missing value or a label that has no
            code. The offending values are listed and the frame is left unchanged.
    """
    column = 'Home Ownership'
    codes = {'Home Mortgage': 1, 'Rent': 2, 'Own Home': 3, 'Have Mortgage': 4}

    encoded = df[column].map(codes)
    # Every code is an integer, so a NaN after mapping can only mean "no code for this value".
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = df.loc[unmapped, column].unique().tolist()
        raise ValueError(f"{column!r} contains values with no integer code: {unknown}")

    df[column] = encoded.astype(int)


In [13]:
def years_in_current_job_convert(df: pd.DataFrame):
    """Replace the 'Years in current job' labels with integer codes, in place.

    NOTE(review): the codes are not ordered by tenure ('< 1 year' is 4, '1 year'
    is 6, '10+ years' is 1), so the encoded column is nominal rather than ordinal
    — confirm this is intended for the downstream model.

    Args:
        df: frame with a 'Years in current job' column holding the string labels
            listed in the mapping below. The column is overwritten; nothing is
            returned.

    Raises:
        ValueError: if the column contains a missing value or a label that has no
            code. The offending values are listed and the frame is left unchanged.
    """
    column = 'Years in current job'
    codes = {
        '10+ years': 1,
        '2 years': 2,
        '3 years': 3,
        '< 1 year': 4,
        '5 years': 5,
        '1 year': 6,
        '4 years': 7,
        '6 years': 8,
        '7 years': 9,
        '8 years': 10,
        '9 years': 11,
    }

    encoded = df[column].map(codes)
    # Every code is an integer, so a NaN after mapping can only mean "no code for this value".
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = df.loc[unmapped, column].unique().tolist()
        raise ValueError(f"{column!r} contains values with no integer code: {unknown}")

    df[column] = encoded.astype(int)


In [14]:
def purpose_convert(df: pd.DataFrame):
    """Replace the 'Purpose' labels with integer codes, in place.

    Args:
        df: frame with a 'Purpose' column holding the string labels listed in the
            mapping below. The column is overwritten; nothing is returned.

    Raises:
        ValueError: if the column contains a missing value or a label that has no
            code. The offending values are listed and the frame is left unchanged.
    """
    column = 'Purpose'
    codes = {
        'debt consolidation': 1,
        'other': 2,
        'home improvements': 3,
        'business loan': 4,
        'buy a car': 5,
        'medical bills': 6,
        'major purchase': 7,
        'take a trip': 8,
        'buy house': 9,
        'small business': 10,
        'wedding': 11,
        'moving': 12,
        'educational expenses': 13,
        'vacation': 14,
        'renewable energy': 15,
    }

    encoded = df[column].map(codes)
    # Every code is an integer, so a NaN after mapping can only mean "no code for this value".
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = df.loc[unmapped, column].unique().tolist()
        raise ValueError(f"{column!r} contains values with no integer code: {unknown}")

    df[column] = encoded.astype(int)

In [15]:
def term_convert(df: pd.DataFrame):
    """Replace the 'Term' labels with integer codes, in place.

    Codes: 'Short Term' -> 1, 'Long Term' -> 2.

    Args:
        df: frame with a 'Term' column holding the two string labels above.
            The column is overwritten; nothing is returned.

    Raises:
        ValueError: if the column contains a missing value or a label that has no
            code. The offending values are listed and the frame is left unchanged.
    """
    column = 'Term'
    codes = {'Short Term': 1, 'Long Term': 2}

    encoded = df[column].map(codes)
    # Every code is an integer, so a NaN after mapping can only mean "no code for this value".
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = df.loc[unmapped, column].unique().tolist()
        raise ValueError(f"{column!r} contains values with no integer code: {unknown}")

    df[column] = encoded.astype(int)

In [16]:
# Imputation step applied to both frames ahead of the categorical encoders below.
# NOTE(review): fillna_dummines_mode is defined in ml_helper (not visible here);
# presumably it fills missing values in object columns with the column mode,
# in place — confirm against the helper.
mlhp.fillna_dummines_mode(df_train) 
mlhp.fillna_dummines_mode(df_test)

In [17]:
# Encode 'Home Ownership' as integer codes, in place, on both frames.
home_ownership_convert(df_train)
home_ownership_convert(df_test)


In [18]:
# Encode 'Years in current job' as integer codes, in place, on both frames.
years_in_current_job_convert(df_train)
years_in_current_job_convert(df_test)

In [19]:
# Encode 'Purpose' as integer codes, in place, on both frames.
purpose_convert(df_train)
purpose_convert(df_test)


In [20]:
# Encode 'Term' as integer codes, in place, on both frames.
term_convert(df_train)
term_convert(df_test)

In [21]:
mlhp.get_nan_count(df_train)

Home Ownership                     0
Annual Income                   1557
Years in current job               0
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64


In [22]:
# Imputation step for the remaining gaps, applied to both frames.
# NOTE(review): fillna_float64_median is defined in ml_helper (not visible here);
# presumably it fills missing values in float64 columns with the column median,
# in place, using each frame's own statistics — confirm against the helper.
mlhp.fillna_float64_median(df_train)
mlhp.fillna_float64_median(df_test)

In [23]:
df_train['Years in current job'].value_counts()

1     2703
2      705
3      620
4      563
5      516
6      504
7      469
8      426
9      396
10     339
11     259
Name: Years in current job, dtype: int64

In [24]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   int32  
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   int32  
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  7500 non-null   float64
 9   Bankruptcies                  7500 non-null   float64
 10  Purpose                       7500 non-null   int32  
 11  Term                          7500 non-null   int32  
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [25]:
# Correlation heat map of the encoded frame, shown with two decimals.
# Styler.format is used instead of Styler.set_precision, which recent pandas has removed.
df_train.corr().style.background_gradient(cmap='coolwarm').format('{:.2f}')

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
Home Ownership,1.0,-0.18,0.03,0.01,-0.1,-0.14,-0.01,0.01,0.03,0.0,0.05,-0.08,-0.01,-0.13,-0.18,0.03,0.05
Annual Income,-0.18,1.0,-0.02,0.06,0.17,0.16,0.02,-0.01,-0.06,-0.06,0.04,0.09,0.05,0.35,0.52,-0.02,-0.1
Years in current job,0.03,-0.02,1.0,-0.02,0.01,-0.15,-0.01,-0.05,0.01,-0.05,0.01,-0.02,0.01,-0.04,-0.01,0.0,-0.01
Tax Liens,0.01,0.06,-0.02,1.0,-0.01,0.01,-0.0,0.6,0.05,0.06,0.01,0.0,-0.01,-0.01,0.03,0.01,0.02
Number of Open Accounts,-0.1,0.17,0.01,-0.01,1.0,0.14,0.02,-0.02,-0.02,-0.02,-0.05,0.07,0.0,0.27,0.41,0.01,0.03
Years of Credit History,-0.14,0.16,-0.15,0.01,0.14,1.0,0.03,0.07,0.0,0.09,-0.0,0.04,0.01,0.22,0.19,-0.01,-0.02
Maximum Open Credit,-0.01,0.02,-0.01,-0.0,0.02,0.03,1.0,-0.0,-0.0,-0.01,0.01,-0.01,-0.0,0.09,0.01,-0.01,-0.01
Number of Credit Problems,0.01,-0.01,-0.05,0.6,-0.02,0.07,-0.0,1.0,0.11,0.73,0.01,-0.02,-0.01,-0.12,-0.04,-0.02,0.02
Months since last delinquent,0.03,-0.06,0.01,0.05,-0.02,0.0,-0.0,0.11,1.0,0.11,-0.01,0.01,-0.0,-0.03,-0.04,-0.0,0.0
Bankruptcies,0.0,-0.06,-0.05,0.06,-0.02,0.09,-0.01,0.73,0.11,1.0,-0.02,-0.03,-0.01,-0.14,-0.08,-0.02,0.0


После обработки значений object влиять на целевую переменную начал параметр Term. Обработаем еще немного категориальные переменные.

In [26]:
def create_category_columns(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the frame after marking 'Term' and 'Home Ownership' as categorical.

    Side effect: both columns of the passed frame are converted to the
    'category' dtype in place before encoding.

    Args:
        df: frame containing 'Term' and 'Home Ownership' columns.

    Returns:
        The result of pd.get_dummies applied to the (modified) frame.
    """
    for column in ('Term', 'Home Ownership'):
        df[column] = df[column].astype('category')
    dummies = pd.get_dummies(df)
    return dummies
    

In [27]:
# df_train = create_category_columns(df_train)
# df_test = create_category_columns(df_test)

In [28]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   int32  
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   int32  
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  7500 non-null   float64
 9   Bankruptcies                  7500 non-null   float64
 10  Purpose                       7500 non-null   int32  
 11  Term                          7500 non-null   int32  
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

## Классификация

In [29]:
import pickle
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#import xgboost as xgb, lightgbm as lgbm, catboost as catb

In [30]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification reports for both splits and the hold-out confusion matrix.

    Args:
        y_train_true: true labels of the training split.
        y_train_pred: predicted labels for the training split.
        y_test_true: true labels of the hold-out split.
        y_test_pred: predicted labels for the hold-out split.

    Nothing is returned; all output goes to stdout.
    """
    sections = (
        ('TRAIN\n\n', y_train_true, y_train_pred),
        ('TEST\n\n', y_test_true, y_test_pred),
    )
    # Each report is built and printed before the next one is computed.
    for header, y_true, y_pred in sections:
        print(header + classification_report(y_true, y_pred))

    print('CONFUSION MATRIX\n')
    # Rows are the true labels, columns the predicted labels (hold-out split only).
    print(pd.crosstab(y_test_true, y_test_pred))

Разбиение на train и test

In [31]:
# Reuse the TARGET constant from the configuration cell instead of repeating the
# column name; `target` is kept as an alias for any code that refers to it.
target = TARGET

# Features are every column except the target; the target column is the label vector.
X = df_train.drop(target, axis=1)
y = df_train[target]

# Shuffled 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.30, random_state=21)

In [32]:
# Baseline model: logistic regression constructed with scikit-learn's default arguments.
model_lr = LogisticRegression()
# NOTE(review): X_train is passed as-is; no scaler is applied in the visible cells
# even though StandardScaler/MinMaxScaler are imported — confirm this is intended.
model_lr.fit(X_train, y_train)

# Predict on both splits so the training and hold-out reports can be compared.
y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.76      0.98      0.86      3771
           1       0.81      0.21      0.33      1479

    accuracy                           0.76      5250
   macro avg       0.78      0.59      0.59      5250
weighted avg       0.77      0.76      0.71      5250

TEST

              precision    recall  f1-score   support

           0       0.76      0.98      0.85      1616
           1       0.77      0.21      0.33       634

    accuracy                           0.76      2250
   macro avg       0.77      0.59      0.59      2250
weighted avg       0.76      0.76      0.71      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1577   39
1                501  133
