In [207]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mannwhitneyu
import lightgbm as lgbm, catboost as catb

import warnings
warnings.filterwarnings("ignore")

In [208]:
#  !код взят с последнего вебинара
def imputer_rfr(data, target_col):

    data = data.copy()
    
    features = data.columns
    
    data = data[features]
    
    train = data[~data[target_col].isna()]
    predict_data = data[data[target_col].isna()]

    X = train.drop(columns=target_col)
    y = train[target_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        random_state=32)
    
    model = RandomForestRegressor(n_estimators=100,
                                  max_depth=10,
                                  random_state=42,
                                  verbose=1)
    model.fit(X_train, y_train)
    
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    
    print(f"r2 на train: {r2(y_train, pred_train)}")
    print(f"r2 на test: {r2(y_test, pred_test)}")

    pred = model.predict(predict_data.drop(columns=target_col))

    data.loc[data[target_col].isna(), target_col] = list(pred)
    return model, data


def evaluate_preds(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    get_classification_report(y_train, y_train_pred, y_test, y_test_pred)


def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))


In [209]:
TEST_PATH = 'kaggle/gb-data/test.csv'
TRAIN_PATH  = 'kaggle/gb-data/train.csv'

In [210]:
test_df = pd.read_csv(TEST_PATH)
df = pd.read_csv(TRAIN_PATH)

df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [211]:
TARGET_NAME = 'Credit Default'
BASE_FEATURE_NAMES = df.columns.drop(TARGET_NAME).tolist()
NUM_FEATURE_NAMES = df.columns.drop(['Home Ownership', 'Years in current job', 'Purpose',  'Term', 'Credit Default']).tolist()
CAT_FEATURE_NAMES = ['Home Ownership', 'Years in current job', 'Purpose',  'Term']
num_feature = 'Credit Score'
#median_income_by_debt = df.groupby('Monthly Debt Size', as_index=False).agg({'Annual Income':'median'})

In [212]:
def data_prepare(df):
    """
        Чистка DF
    """
    df.loc[df['Maximum Open Credit'] >= 99999999, 'Maximum Open Credit'] = df['Maximum Open Credit'].median()
    df.loc[df['Credit Score']>1000, 'Credit Score'] = df['Credit Score']/10
    median_loan_by_purpose = df.groupby('Purpose', as_index=False).agg({'Current Loan Amount':'median'})
    for purpose in median_loan_by_purpose['Purpose']:
        df.loc[(df['Current Loan Amount']==99999999) & (df['Purpose']==str(purpose)), 'Current Loan Amount'] = median_loan_by_purpose.loc[median_loan_by_purpose['Purpose']==purpose, 'Current Loan Amount']
    df.loc[df['Months since last delinquent'].isnull(), 'Months since last delinquent'] = 0
    bins = pd.cut(df['Monthly Debt'], [-1, 10000, 16000, 23000, 140000], labels = ['A','B','C','D'])
    df['Monthly Debt Size'] = bins
    df.loc[(df['Annual Income'].isnull()) & (df['Monthly Debt Size']=='A'), 'Annual Income'] = 835601.0
    df.loc[(df['Annual Income'].isnull()) & (df['Monthly Debt Size']=='B'), 'Annual Income'] = 971565.0
    df.loc[(df['Annual Income'].isnull()) & (df['Monthly Debt Size']=='C'), 'Annual Income'] = 1196848.0
    df.loc[(df['Annual Income'].isnull()) & (df['Monthly Debt Size']=='D'), 'Annual Income'] = 1740476.0
    df.loc[df['Years in current job'].isnull(), 'Years in current job'] = df['Years in current job'].mode()[0]
    df.loc[df['Bankruptcies'].isnull(), 'Bankruptcies'] = 0
    df.loc[df['Current Loan Amount'].isnull(), 'Current Loan Amount'] = df['Current Loan Amount'].median()
    df = pd.concat([df, pd.get_dummies(df['Home Ownership'])], axis=1)
    df['Years in current job'] = df['Years in current job'].replace({'< 1 year':0, '1 year':1, '2 years':2, '3 years':3, '4 years':4, '5 years':5, '6 years':6, '7 years':7,'8 years':8,'9 years':9,'10+ years':10})
    df.loc[df['Tax Liens'] > 0, 'Tax Liens'] = 1
    df.loc[df['Number of Credit Problems'] > 0, 'Number of Credit Problems'] = 1
    df.loc[df['Bankruptcies'] > 0, 'Bankruptcies'] = 1
    df.loc[(df['Purpose'] == 'debt consolidation') | (df['Purpose'] == 'other') | (df['Purpose'] == 'home improvements'), 'Purpose'] = 0
    df.loc[df['Purpose']!=0, 'Purpose'] = 1
    df['Purpose'] = df['Purpose'].astype('int64')
    df['Term'] = df['Term'].replace({'Short Term':0, 'Long Term':1})
   
    
    median_income_by_debt = df.groupby('Monthly Debt Size', as_index=False).agg({'Annual Income':'median'})
    return df

In [213]:
df = data_prepare(df)
df

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,...,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default,Monthly Debt Size,Have Mortgage,Home Mortgage,Own Home,Rent
0,Own Home,482087.0,10,0.0,11.0,26.3,685960.0,1.0,0.0,1.0,...,265826.0,47386.0,7914.0,749.0,0,A,0,0,1,0
1,Own Home,1025487.0,10,0.0,15.0,15.3,1181730.0,0.0,0.0,0.0,...,264968.0,394972.0,18373.0,737.0,1,C,0,0,1,0
2,Home Mortgage,751412.0,8,0.0,11.0,35.0,1182434.0,0.0,0.0,0.0,...,265826.0,308389.0,13651.0,742.0,0,B,0,1,0,0
3,Own Home,805068.0,6,0.0,8.0,22.5,147400.0,1.0,0.0,1.0,...,121396.0,95855.0,11338.0,694.0,0,B,0,0,1,0
4,Rent,776264.0,8,0.0,13.0,13.6,385836.0,1.0,0.0,0.0,...,125840.0,93309.0,7180.0,719.0,0,A,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,Rent,402192.0,0,0.0,3.0,8.5,107866.0,0.0,0.0,0.0,...,129360.0,73492.0,1900.0,697.0,0,A,0,0,0,1
7496,Home Mortgage,1533984.0,1,0.0,10.0,26.5,686312.0,0.0,43.0,0.0,...,444048.0,456399.0,12783.0,741.0,1,B,0,1,0,0
7497,Rent,1878910.0,6,0.0,12.0,32.1,1778920.0,0.0,0.0,0.0,...,265826.0,477812.0,12479.0,748.0,0,B,0,0,0,1
7498,Home Mortgage,1740476.0,10,0.0,21.0,26.5,1141250.0,0.0,0.0,0.0,...,615274.0,476064.0,37118.0,,0,D,0,1,0,0


In [214]:
test_df = data_prepare(test_df)

In [215]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Home Ownership                2500 non-null   object  
 1   Annual Income                 2500 non-null   float64 
 2   Years in current job          2500 non-null   int64   
 3   Tax Liens                     2500 non-null   float64 
 4   Number of Open Accounts       2500 non-null   float64 
 5   Years of Credit History       2500 non-null   float64 
 6   Maximum Open Credit           2500 non-null   float64 
 7   Number of Credit Problems     2500 non-null   float64 
 8   Months since last delinquent  2500 non-null   float64 
 9   Bankruptcies                  2500 non-null   float64 
 10  Purpose                       2500 non-null   int64   
 11  Term                          2500 non-null   int64   
 12  Current Loan Amount           2500 non-null   fl

In [216]:
# группировка и посчет медиан для групп
# пояснение почему в data_prepare такие значения
median_income_by_debt = df.groupby('Monthly Debt Size', as_index=False).agg({'Annual Income':'median'})
median_income_by_debt

Unnamed: 0,Monthly Debt Size,Annual Income
0,A,835601.0
1,B,971565.0
2,C,1196848.0
3,D,1740476.0


In [217]:
df_copy = df.copy()
df_test_copy = test_df.copy()

In [218]:
feature_name = 'Credit Score'
df_copy[feature_name].isna().sum()

1557

In [219]:
df_copy = df_copy.drop(columns=['Home Ownership','Monthly Debt Size', 'Credit Default'])
df_test_copy = df_test_copy.drop(columns=['Home Ownership','Monthly Debt Size'])

In [220]:
score_predictor, df_copy = imputer_rfr(df_copy, feature_name)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 на train: 0.6541456187456303
r2 на test: 0.2921506449032347


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [221]:
predict_data = df_test_copy[df_test_copy[feature_name].isna()]

pred = score_predictor.predict(predict_data.drop(columns=feature_name))

df_test_copy.loc[df_test_copy[feature_name].isna(), feature_name] = list(pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [222]:
X = df_copy
y = df['Credit Default']

In [223]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42, stratify=y)
display(y_train.value_counts(normalize=True), y_valid.value_counts(normalize=True))

0    0.718209
1    0.281791
Name: Credit Default, dtype: float64

0    0.718384
1    0.281616
Name: Credit Default, dtype: float64

In [224]:
disbalance = y_train.value_counts()[0] / y_train.value_counts()[1]
disbalance

2.5487288135593222

In [225]:
X_train = df_copy
y_train = df['Credit Default']
X_test = df_test_copy

model_catb = catb.CatBoostClassifier(silent=True, max_depth=5,
                                      iterations=20, random_state=28,
                                     class_weights=[1, disbalance],
                                     eval_metric='F1',
                                     early_stopping_rounds=20,
                                     )

In [226]:
model_catb.fit(X_train, y_train)

evaluate_preds(model_catb, X_train, X_valid, y_train, y_valid)

TRAIN

              precision    recall  f1-score   support

           0       0.86      0.63      0.72      5387
           1       0.44      0.74      0.55      2113

    accuracy                           0.66      7500
   macro avg       0.65      0.68      0.64      7500
weighted avg       0.74      0.66      0.68      7500

TEST

              precision    recall  f1-score   support

           0       0.86      0.63      0.73      1778
           1       0.44      0.74      0.55       697

    accuracy                           0.66      2475
   macro avg       0.65      0.69      0.64      2475
weighted avg       0.74      0.66      0.68      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1128  650
1                183  514


In [227]:
y_test_preds = final_model.predict(X_test)
result = pd.DataFrame()
result['Id'] = test_df.index
result['Credit Default'] = pd.Series(y_test_preds)
result.values

array([[   0,    1],
       [   1,    1],
       [   2,    1],
       ...,
       [2497,    1],
       [2498,    1],
       [2499,    1]], dtype=int64)

In [228]:
result.to_csv('prediction_new.csv', index=False)