Importing libraries and functions

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

Importing dataset

In [2]:
data_df = pd.read_csv('application_record.csv')
target_df = pd.read_csv('credit_record.csv')

Data preparation

In [3]:
data_df.head(5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [4]:
data_df.shape

(438557, 18)

In [5]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [6]:
data_df.describe()

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS
count,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0
mean,6022176.0,0.42739,187524.3,-15997.904649,60563.675328,1.0,0.206133,0.287771,0.108207,2.194465
std,571637.0,0.724882,110086.9,4185.030007,138767.799647,0.0,0.404527,0.452724,0.310642,0.897207
min,5008804.0,0.0,26100.0,-25201.0,-17531.0,1.0,0.0,0.0,0.0,1.0
25%,5609375.0,0.0,121500.0,-19483.0,-3103.0,1.0,0.0,0.0,0.0,2.0
50%,6047745.0,0.0,160780.5,-15630.0,-1467.0,1.0,0.0,0.0,0.0,2.0
75%,6456971.0,1.0,225000.0,-12514.0,-371.0,1.0,0.0,1.0,0.0,3.0
max,7999952.0,19.0,6750000.0,-7489.0,365243.0,1.0,1.0,1.0,1.0,20.0


In [7]:
data_df.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64

In [8]:
# categorical feature encoding
data_df['CODE_GENDER'] = data_df['CODE_GENDER'].map({'M' : 1 , 'F' : 0})
data_df['FLAG_OWN_CAR'] = data_df['FLAG_OWN_CAR'].map({'Y' : 1 , 'N' : 0})
data_df['FLAG_OWN_REALTY'] = data_df['FLAG_OWN_REALTY'].map({'Y' : 1 , 'N' : 0})

In [9]:
data_df.head(5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,1,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,1,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,1,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [10]:
# Imputation
data_df['OCCUPATION_TYPE'] = data_df['OCCUPATION_TYPE'].fillna(data_df['OCCUPATION_TYPE'].mode()[0])
# One-Hot Encoding
data_df = pd.get_dummies(data_df, columns = ['NAME_INCOME_TYPE'], drop_first=True, dtype = int)
data_df = pd.get_dummies(data_df, columns = ['NAME_EDUCATION_TYPE'], drop_first=True, dtype = int)
data_df = pd.get_dummies(data_df, columns = ['NAME_FAMILY_STATUS'], drop_first=True, dtype = int)
data_df = pd.get_dummies(data_df, columns = ['NAME_HOUSING_TYPE'], drop_first=True, dtype = int)
data_df = pd.get_dummies(data_df, columns = ['OCCUPATION_TYPE'], drop_first=True, dtype = int)

In [11]:
# Feature Creation and Cleaning
data_df['UNEMPLOYED'] = (data_df['DAYS_EMPLOYED'] > 0).astype(int)
data_df['YEARS_EMPLOYED'] = - data_df['DAYS_EMPLOYED'] / 365.25
data_df.loc[data_df['UNEMPLOYED'] > 0, 'YEARS_EMPLOYED'] = 0
data_df = data_df.drop('DAYS_EMPLOYED', axis = 1)
data_df['YEARS_BIRTH'] = - data_df['DAYS_BIRTH'] / 365.25
data_df = data_df.drop('DAYS_BIRTH', axis = 1)
data_df = data_df.drop('UNEMPLOYED', axis = 1)
data_df = data_df.drop('FLAG_MOBIL', axis = 1)
data_df = data_df.drop('FLAG_WORK_PHONE', axis = 1)
data_df = data_df.drop('FLAG_PHONE', axis = 1)
data_df = data_df.drop('FLAG_EMAIL', axis = 1)

# Data Handling
data_df['CNT_FAM_MEMBERS'] = data_df['CNT_FAM_MEMBERS'].replace(0, 1)
# New Feature Generation
data_df['INCOME_PER_FAMILY'] = data_df['AMT_INCOME_TOTAL'] / data_df['CNT_FAM_MEMBERS']
data_df['EMPLOYMENT_TO_AGE_RATIO'] = data_df['YEARS_EMPLOYED'] / data_df['YEARS_BIRTH']

In [12]:
data_df.head(5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,...,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,YEARS_EMPLOYED,YEARS_BIRTH,INCOME_PER_FAMILY,EMPLOYMENT_TO_AGE_RATIO
0,5008804,1,1,1,0,427500.0,2.0,0,0,0,...,0,0,0,0,0,0,12.435318,32.867899,213750.0,0.378342
1,5008805,1,1,1,0,427500.0,2.0,0,0,0,...,0,0,0,0,0,0,12.435318,32.867899,213750.0,0.378342
2,5008806,1,1,1,0,112500.0,2.0,0,0,0,...,0,0,0,0,1,0,3.104723,58.792608,56250.0,0.052808
3,5008808,0,0,1,0,270000.0,1.0,0,0,0,...,0,0,1,0,0,0,8.353183,52.320329,270000.0,0.159655
4,5008809,0,0,1,0,270000.0,1.0,0,0,0,...,0,0,1,0,0,0,8.353183,52.320329,270000.0,0.159655


In [13]:
target_df.head(5)

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [14]:
# encoding credit status into a binary target variable
target_df['TARGET'] = target_df['STATUS'].apply(lambda x: 0 if x in ['X','C'] else 1)

In [15]:
target_df.head(5)

Unnamed: 0,ID,MONTHS_BALANCE,STATUS,TARGET
0,5001711,0,X,0
1,5001711,-1,0,1
2,5001711,-2,0,1
3,5001711,-3,0,1
4,5001712,0,C,0


In [16]:
# aggregation client data to determine their overall credit status
target_df = target_df.groupby('ID')['TARGET'].max().reset_index()
target_df.head(5)

Unnamed: 0,ID,TARGET
0,5001711,1
1,5001712,1
2,5001713,0
3,5001714,0
4,5001715,0


In [17]:
# merging two DataFrames 
data = data_df.merge(target_df,on = 'ID')

In [18]:
data.head(5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,...,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,YEARS_EMPLOYED,YEARS_BIRTH,INCOME_PER_FAMILY,EMPLOYMENT_TO_AGE_RATIO,TARGET
0,5008804,1,1,1,0,427500.0,2.0,0,0,0,...,0,0,0,0,0,12.435318,32.867899,213750.0,0.378342,1
1,5008805,1,1,1,0,427500.0,2.0,0,0,0,...,0,0,0,0,0,12.435318,32.867899,213750.0,0.378342,1
2,5008806,1,1,1,0,112500.0,2.0,0,0,0,...,0,0,0,1,0,3.104723,58.792608,56250.0,0.052808,1
3,5008808,0,0,1,0,270000.0,1.0,0,0,0,...,0,1,0,0,0,8.353183,52.320329,270000.0,0.159655,1
4,5008809,0,0,1,0,270000.0,1.0,0,0,0,...,0,1,0,0,0,8.353183,52.320329,270000.0,0.159655,0


In [19]:
data.isnull().sum()

ID                                                   0
CODE_GENDER                                          0
FLAG_OWN_CAR                                         0
FLAG_OWN_REALTY                                      0
CNT_CHILDREN                                         0
AMT_INCOME_TOTAL                                     0
CNT_FAM_MEMBERS                                      0
NAME_INCOME_TYPE_Pensioner                           0
NAME_INCOME_TYPE_State servant                       0
NAME_INCOME_TYPE_Student                             0
NAME_INCOME_TYPE_Working                             0
NAME_EDUCATION_TYPE_Higher education                 0
NAME_EDUCATION_TYPE_Incomplete higher                0
NAME_EDUCATION_TYPE_Lower secondary                  0
NAME_EDUCATION_TYPE_Secondary / secondary special    0
NAME_FAMILY_STATUS_Married                           0
NAME_FAMILY_STATUS_Separated                         0
NAME_FAMILY_STATUS_Single / not married              0
NAME_FAMIL

In [20]:
data = data.drop('ID', axis = 1)

In [21]:
data.head(4)

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Working,...,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,YEARS_EMPLOYED,YEARS_BIRTH,INCOME_PER_FAMILY,EMPLOYMENT_TO_AGE_RATIO,TARGET
0,1,1,1,0,427500.0,2.0,0,0,0,1,...,0,0,0,0,0,12.435318,32.867899,213750.0,0.378342,1
1,1,1,1,0,427500.0,2.0,0,0,0,1,...,0,0,0,0,0,12.435318,32.867899,213750.0,0.378342,1
2,1,1,1,0,112500.0,2.0,0,0,0,1,...,0,0,0,1,0,3.104723,58.792608,56250.0,0.052808,1
3,0,0,1,0,270000.0,1.0,0,0,0,0,...,0,1,0,0,0,8.353183,52.320329,270000.0,0.159655,1


In [22]:
class_counts = data['TARGET'].value_counts()
print(class_counts)

TARGET
1    32002
0     4455
Name: count, dtype: int64


Risk Model building

In [23]:
# Splitting the data into training and testing sets and scaling the numerical featuresy 
y = data['TARGET']
X = data.drop('TARGET', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
numerical_features = [
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'CNT_FAM_MEMBERS',
    'YEARS_EMPLOYED',
    'YEARS_BIRTH',
    'INCOME_PER_FAMILY',
    'EMPLOYMENT_TO_AGE_RATIO'
]
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
X_train[numerical_features] = scaler.transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [24]:
# training a Logistic Regression model 
model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [25]:
# training a Random Forest model with a focus on addressing class imbalance
model_rf = RandomForestClassifier(class_weight='balanced', random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
y_pred_proba_rf = model_rf.predict_proba(X_test)[:, 1]

In [26]:
# training a Gradient Boosting model
model_gb = GradientBoostingClassifier(random_state = 42)
model_gb.fit(X_train, y_train)
y_gb_pred = model_gb.predict(X_test)
y_gb_pred_proba = model_gb.predict_proba(X_test)[:, 1]

Model performance

In [27]:
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC для Логической регрессии: {roc_auc_score(y_test, y_pred_proba):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.4841
ROC-AUC для Логической регрессии: 0.5017
Confusion Matrix:
[[ 478  413]
 [3349 3052]]


In [28]:
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"ROC-AUC для Случайного леса: {roc_auc_score(y_test, y_pred_proba_rf):.4f}")
print("Confusion Matrix для Случайного леса:")
print(confusion_matrix(y_test, y_pred_rf))

Accuracy: 0.8276
ROC-AUC для Случайного леса: 0.7670
Confusion Matrix для Случайного леса:
[[ 483  408]
 [ 849 5552]]


In [29]:
print(f"Accuracy: {accuracy_score(y_test, y_gb_pred):.4f}")
print(f"ROC-AUC для Градиентного бустинга: {roc_auc_score(y_test, y_gb_pred_proba):.4f}")
print("Confusion Matrix для Градиентного бустинга:")
print(confusion_matrix(y_test, y_gb_pred))

Accuracy: 0.8781
ROC-AUC для Градиентного бустинга: 0.6236
Confusion Matrix для Градиентного бустинга:
[[   2  889]
 [   0 6401]]
