# Import библиотек

In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

# Набор данных

In [78]:
df = pd.read_csv('/content/drive/MyDrive/ML-projects/loan_prediction.csv', delimiter=',')
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


## Анализ признаков

In [79]:
#Преобразуем название столбцов
df.columns = df.columns.str.lower()

In [80]:
#loan_id - не нужно для анализа
df = df.drop('loan_id', axis=1)
#gender - пол человека
#married - в браке или нет
#dependents - количество иждивенцев
#education - образование
#self_employed - самзанятый или нет
#applicantincome - доход
#coapplicantincome - сопутствующий доход
#loanamount - сумма кредита
#loan_amount_term - срок кредита
#credit_history - кредитная история
#property_area - местность проживания
#loan_status - статус кредита

###Разберемся с типами данных

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             601 non-null    object 
 1   married            611 non-null    object 
 2   dependents         599 non-null    object 
 3   education          614 non-null    object 
 4   self_employed      582 non-null    object 
 5   applicantincome    614 non-null    int64  
 6   coapplicantincome  614 non-null    float64
 7   loanamount         592 non-null    float64
 8   loan_amount_term   600 non-null    float64
 9   credit_history     564 non-null    float64
 10  property_area      614 non-null    object 
 11  loan_status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


###Проверим df на наличие дубликатов и пропусков

In [82]:
#Проверим наличие дубликатов
df.duplicated().sum()

0

In [83]:
#Проверим наличие пропусков
df.isnull().sum()

gender               13
married               3
dependents           15
education             0
self_employed        32
applicantincome       0
coapplicantincome     0
loanamount           22
loan_amount_term     14
credit_history       50
property_area         0
loan_status           0
dtype: int64

In [84]:
# Столбец gender имеет пустые значения, что может значить, что человек просто забыл поставить свой пол
df = df.drop(df.loc[df.gender.isnull() == True].index)

# Столбец married имеет пустые значения, что может значить, что человек просто забыл поставить свой статус брака
df = df.drop(df.loc[df.married.isnull() == True].index)

#В столбце dependents есть пустые значения, можно предположить, что у человека нет иждивенцев, поэтому данное значение не прописано.
#Соответственно, можно заменить пустые значения на 0
df.dependents.fillna(0,inplace=True)

# Столбец self_employed имеет пустые значения, что может значить, что человек просто забыл поставить самозанятый он или нет
df = df.drop(df.loc[df.self_employed.isnull() == True].index)

#В столбце loanamount и loan_amount_term есть пустые значения, можно предположить, что человек не брал кредит, поэтому данное значение не прописано.
#Соответственно, можно заменить пустые значения на 0
df.loanamount.fillna(0,inplace=True)
df.loan_amount_term.fillna(0,inplace=True)

# Столбец credit_history имеет пустые значения, что может значить, что человеку не проставили данный признак
df = df.drop(df.loc[df.credit_history.isnull() == True].index)

In [85]:
#Пропусков не осталось
df.isnull().sum()

gender               0
married              0
dependents           0
education            0
self_employed        0
applicantincome      0
coapplicantincome    0
loanamount           0
loan_amount_term     0
credit_history       0
property_area        0
loan_status          0
dtype: int64

In [86]:
#Столбец credit_history содержит значения '0' и '1', предлагаю поменять тип
df['credit_history'] = df['credit_history'].astype(int)
df['credit_history'] = df['credit_history'].astype(str)
df['credit_history'] = df['credit_history'].str.replace('1', 'Y')
df['credit_history'] = df['credit_history'].str.replace('0', 'N')

###Рассмотрим категориальные и числовые столбцы

In [87]:
cat_columns = []
num_columns = []

for i in df.columns:
    if (df[i].dtypes == object):
        cat_columns +=[i]
    else:
        num_columns +=[i]

print('Категориальные данные:\t ',cat_columns, '\n Число столблцов = ',len(cat_columns))

print('Числовые данные:\t ',  num_columns, '\n Число столблцов = ',len(num_columns))

Категориальные данные:	  ['gender', 'married', 'dependents', 'education', 'self_employed', 'credit_history', 'property_area', 'loan_status'] 
 Число столблцов =  8
Числовые данные:	  ['applicantincome', 'coapplicantincome', 'loanamount', 'loan_amount_term'] 
 Число столблцов =  4


## Рассмотрим основную статистику нашего df

In [88]:
df.describe()

Unnamed: 0,applicantincome,coapplicantincome,loanamount,loan_amount_term
count,523.0,523.0,523.0,523.0
mean,5299.533461,1555.581109,138.175908,333.774379
std,5513.955756,2540.108821,82.344522,83.372053
min,150.0,0.0,0.0,0.0
25%,2892.0,0.0,97.0,360.0
50%,3858.0,1086.0,125.0,360.0
75%,5816.5,2253.5,165.0,360.0
max,81000.0,33837.0,600.0,480.0


Небольшие вывод - есть слишком высокие значения applicantincome	и coapplicantincome относительно 75 процентиля (возможно выбросы)

##Корреляция числовых признаков

In [89]:
cm = sns.color_palette("coolwarm", as_cmap=True)
df[num_columns].corr().style.background_gradient(cmap=cm,
                                    vmin = -1,
                                    vmax=1)

Unnamed: 0,applicantincome,coapplicantincome,loanamount,loan_amount_term
applicantincome,1.0,-0.114236,0.46391,0.014496
coapplicantincome,-0.114236,1.0,0.191443,-0.008222
loanamount,0.46391,0.191443,1.0,0.063639
loan_amount_term,0.014496,-0.008222,0.063639,1.0


У нас нет взаимозависимых факторов

# Гистограммы распределений

## Числовые признаки

In [90]:
def plot_hist_plotly_num(df, col_names, rows = 2, cols = 2 ):
    fig = make_subplots(rows=2, cols=2,
                        subplot_titles= (col_names))
    for idx, column_name in enumerate(col_names):
        fig.add_trace(
            go.Histogram(x=df[column_name],
                nbinsx = 20),
        row=idx//2+1, col=idx%2+1)
    fig.update_layout(height=1000, width=1000)
    fig.show()

In [91]:
plot_hist_plotly_num(df, num_columns)

Небольшой вывод - есть выбросы у applicantincome и coapplicantincome

Посмотрим на "ящик с усами" для этих значений и очистим их

In [92]:
fig_income = px.box(df,
                    x='applicantincome',
                    y='loan_status',
                    color="loan_status",
                    title='Loan_Status vs ApplicantIncome')
fig_income.show()

In [93]:
Q1 = df['applicantincome'].quantile(0.25)
Q3 = df['applicantincome'].quantile(0.75)
IQR = Q3 - Q1

#Определим нижную и верхнюю границу и очистим
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['applicantincome'] >= lower_bound) & (df['applicantincome'] <= upper_bound)]

In [94]:
fig_coapplicant_income = px.box(df,
                                x='coapplicantincome',
                                y='loan_status',
                                color="loan_status",
                                title='Loan_Status vs CoapplicantIncome')
fig_coapplicant_income.show()

In [95]:
Q1 = df['coapplicantincome'].quantile(0.25)
Q3 = df['coapplicantincome'].quantile(0.75)
IQR = Q3 - Q1

#Определим нижную и верхнюю границу и очистим
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['coapplicantincome'] >= lower_bound) & (df['coapplicantincome'] <= upper_bound)]

## Категориальные признаки

In [96]:
def plot_hist_plotly_cat(df, col_names, rows = 4, cols = 2 ):
    fig = make_subplots(rows=4, cols=2,
                        subplot_titles= (col_names))
    for idx, column_name in enumerate(col_names):
        fig.add_trace(
            go.Histogram(x=df[column_name],
                nbinsx = 20),
        row=idx//2+1, col=idx%2+1)
    fig.update_layout(height=1000, width=1000)
    fig.show()

In [97]:
plot_hist_plotly_cat(df, cat_columns,rows = 4, cols = 2)

# Обучение модели

## Pipeline

In [98]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        self.target_mean = {}

    def fit(self, X, y):
      if self.cols is None:
          self.cols = X.columns
      y_series = pd.Series(y)
      for col in self.cols:
          self.target_mean[col] = {}
          X_copy = X.copy()
          X_copy['target'] = y_series
          self.target_mean[col] = X_copy.groupby(col)['target'].mean().to_dict()
      return self


    def transform(self, X):
        for col in self.cols:
            X[col] = X[col].map(self.target_mean[col])
            X[col] = X[col].fillna(np.mean(X[col]))
        return X

In [99]:
#Исходя из распределений решил выбрать степенное преобразование и стандартизацию
# Но так как мы будем использовать Случайный лес, то нет смысла преобразовывать данные

num_pipe_applicantincome = Pipeline([
    ('power', None)
])
num_applicantincome = ['applicantincome']

num_pipe_coapplicantincome = Pipeline([
    ('power', None)
])
num_coapplicantincome = ['coapplicantincome']

num_pipe_loanamount = Pipeline([
    ('scaler', None)
])
num_loanamount = ['loanamount']

num_pipe_loan_amount_term = Pipeline([
    ('scaler', None)
])
num_loan_amount_term = ['loan_amount_term']

In [100]:
df[cat_columns].nunique()

gender            2
married           2
dependents        5
education         2
self_employed     2
credit_history    2
property_area     3
loan_status       2
dtype: int64

In [101]:
#Для категориальных решил использовать ordinal encoder и ohe, исходя из количества значений
cat_pipe_gender = Pipeline([
    ('encoder', OrdinalEncoder())
])
cat_gender = ['gender']

cat_pipe_married = Pipeline([
    ('encoder', OrdinalEncoder())
])
cat_married = ['married']

cat_pipe_dependents = Pipeline([
    ('encoder', TargetEncoder())
])
cat_dependents = ['dependents']

cat_pipe_education = Pipeline([
    ('encoder', OrdinalEncoder())
])
cat_education = ['education']

cat_pipe_self_employed = Pipeline([
    ('encoder', OrdinalEncoder())
])
cat_self_employed = ['self_employed']

cat_pipe_credit_history = Pipeline([
    ('encoder', OrdinalEncoder())
])
cat_credit_history = ['credit_history']

cat_pipe_property_area = Pipeline([
    ('encoder', TargetEncoder())
])
cat_property_area = ['property_area']

In [102]:
preprocessors_all = ColumnTransformer(transformers=[
    ('num_applicantincome', num_pipe_applicantincome, num_applicantincome),
    ('num_coapplicantincome', num_pipe_coapplicantincome, num_coapplicantincome),
    ('num_loanamount', num_pipe_loanamount, num_loanamount),
    ('num_loan_amount_term', num_pipe_loan_amount_term, num_loan_amount_term),
    ('cat_gender', cat_pipe_gender, cat_gender),
    ('cat_married', cat_pipe_married, cat_married),
    ('cat_dependents', cat_pipe_dependents, cat_dependents),
    ('cat_education', cat_pipe_education, cat_education),
    ('cat_self_employed', cat_pipe_self_employed, cat_self_employed),
    ('cat_credit_history', cat_pipe_credit_history, cat_credit_history),
    ('cat_property_area', cat_pipe_property_area, cat_property_area)
])

## Случайный Лес для Классификации

In [103]:
X, y = df.drop(columns = ['loan_status']), df['loan_status']

In [106]:
# Преобразуем наши классы
Label = LabelEncoder()
Label.fit(y)
Label.classes_

array(['N', 'Y'], dtype=object)

In [107]:
target = Label.transform(y)

In [108]:
# разбиваем на тренировочную и валидационную
X_train, X_val, y_train, y_val = train_test_split(X, target,
                                                    test_size=0.3,
                                                    random_state=42)

In [109]:
pipe_all = Pipeline([
    ('preprocessors', preprocessors_all),
    ('model', RandomForestClassifier(random_state = 42))
    ])

In [110]:
pipe_all.fit(X_train, y_train)

## Кросс валидация

In [124]:
def cross_validation (X, y, model, scoring, cv_rule):
    scores = cross_validate(model,X, y,
                      scoring=scoring, cv=cv_rule )
    print('Ошибка на кросс-валидации')
    DF_score = pd.DataFrame(scores)
    display(DF_score)
    print('\n')
    print(DF_score.mean()[2:])

In [125]:
scoring_clf = {'ACC': 'accuracy',
           'F1': 'f1',
           'Precision': 'precision',
           'Recall': 'recall'}

In [126]:
cross_validation (X_train, y_train,
                  pipe_all,
                  scoring_clf,
                  StratifiedKFold(n_splits=5, shuffle= True, random_state = 42))

Ошибка на кросс-валидации


Unnamed: 0,fit_time,score_time,test_ACC,test_F1,test_Precision,test_Recall
0,0.31664,0.036314,0.772727,0.842105,0.784314,0.909091
1,0.330431,0.04591,0.815385,0.866667,0.829787,0.906977
2,0.33313,0.039195,0.830769,0.886598,0.796296,1.0
3,0.328312,0.03808,0.815385,0.869565,0.833333,0.909091
4,0.293644,0.032189,0.738462,0.824742,0.754717,0.909091




test_ACC          0.794545
test_F1           0.857935
test_Precision    0.799690
test_Recall       0.926850
dtype: float64


В целом переобучения нет

## Метрики модели

In [112]:
y_pred = pipe_all.predict(X_val)

# Гармоническое среднее между точностью и полнотой
f1 = f1_score(y_val, y_pred, average='weighted')
# Доля правильно классифицированных образцов
accuracy = accuracy_score(y_val, y_pred)
# Доля истинно положительных предсказаний среди всех положительных предсказаний.
precision = precision_score(y_val, y_pred, average='weighted')
# Доля истинно положительных предсказаний, найденных моделью
recall = recall_score(y_val, y_pred, average='weighted')

print('F1 score:', f1)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)

F1 score: 0.7867922726078754
Accuracy: 0.8014184397163121
Precision: 0.7928176664531892
Recall: 0.8014184397163121


In [113]:
conf = confusion_matrix(y_val, y_pred)
conf

array([[19, 21],
       [ 7, 94]])

В целом ошибок не так уж много, но есть проблема с предсказаниями для тех у кого loan_status = N

# Улучшение модели

In [115]:
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [5, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

In [119]:
# Создаем GridSearchCV
grid_search = GridSearchCV(pipe_all, param_grid, cv=5, n_jobs=-1, scoring='f1')

# Обучаем GridSearch на обучающих данных
grid_search.fit(X_train, y_train)

In [120]:
best_params = grid_search.best_params_
print("Лучшие параметры:", best_params)

Лучшие параметры: {'model__max_depth': 5, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2, 'model__n_estimators': 50}


## Кросс валидация

In [128]:
cross_validation (X_train, y_train,
                  best_model,
                  scoring_clf,
                  StratifiedKFold(n_splits=5, shuffle= True, random_state = 42))

Ошибка на кросс-валидации


Unnamed: 0,fit_time,score_time,test_ACC,test_F1,test_Precision,test_Recall
0,0.285654,0.033238,0.818182,0.877551,0.796296,0.977273
1,0.267409,0.034141,0.830769,0.88172,0.82,0.953488
2,0.275683,0.095646,0.8,0.868687,0.767857,1.0
3,0.368377,0.065637,0.846154,0.895833,0.826923,0.977273
4,0.378457,0.078536,0.8,0.871287,0.77193,1.0




test_ACC          0.819021
test_F1           0.879016
test_Precision    0.796601
test_Recall       0.981607
dtype: float64


## Метрики модели

In [121]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_val)

f1 = f1_score(y_val, y_pred, average='weighted')
# Доля правильно классифицированных образцов
accuracy = accuracy_score(y_val, y_pred)
# Доля истинно положительных предсказаний среди всех положительных предсказаний.
precision = precision_score(y_val, y_pred, average='weighted')
# Доля истинно положительных предсказаний, найденных моделью
recall = recall_score(y_val, y_pred, average='weighted')

conf = confusion_matrix(y_val, y_pred)

print('F1 score:', f1)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print(conf)

F1 score: 0.7994892021451785
Accuracy: 0.8226950354609929
Precision: 0.8350956743095967
Recall: 0.8226950354609929
[[17 23]
 [ 2 99]]


Немного улучшили метрики

По матрице ошибок - выросли ошибки первого уровня (предсказываем N, когда по факту Y, возможно есть пролблемы с балансом классов)