<a href="https://colab.research.google.com/github/OlegV12/GoogleColab/blob/Credit-Default/credit_default_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error as mse, r2_score as r2
from xgboost import XGBClassifier
from sklearn import svm, linear_model
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import seaborn as sn
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from scipy.stats import mannwhitneyu
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier, Pool, cv
import hyperopt
import seaborn as sns

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Loading datasets


In [None]:
# Paths of the train and test CSV files, relative to the notebook's working directory.
TRAIN_DATASET_PATH = '../input/gb-credit-default/train.csv'
TEST_DATASET_PATH = '../input/gb-credit-default/test.csv'

In [None]:
# Read both CSV files into DataFrames.
train_df = pd.read_csv(TRAIN_DATASET_PATH)
test_df = pd.read_csv(TEST_DATASET_PATH)

In [None]:
# Sanity check: the train set should have exactly one more column than the test set
# (presumably the 'Credit Default' target — confirm against the data).
train_df.shape[1] - 1 == test_df.shape[1]

# EDA

In [None]:
# Number of rows per value of the target column.
train_df['Credit Default'].value_counts()

In [None]:

# Histogram of the target column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(16, 8))

train_df['Credit Default'].hist(bins=30, ax=ax)
ax.set_ylabel('Count')
ax.set_xlabel('Credit Default')
ax.set_title('Target distribution')

plt.show()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Heatmap of pairwise correlations between the numeric columns of the raw train set.
plt.figure(figsize=(25, 20))

sns.set(font_scale=1.4)

# The raw frame still contains string columns at this point; DataFrame.corr()
# raises on them in pandas >= 2.0, so restrict the input to numeric dtypes.
corr_matrix = train_df.select_dtypes(include='number').corr()
corr_matrix = corr_matrix.round(2)
# Zero out weak correlations (|r| < 0.3) so only the stronger ones stand out.
corr_matrix = corr_matrix.mask(corr_matrix.abs() < 0.3, 0)

sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='GnBu')

plt.title('Correlation matrix')
plt.show()

# Data preprocessing

In [None]:
# Fill NaN values (including outliers that were masked as NaN) with a GBR model
def imputer_rfr(data, target_col):
    """Impute the NaN cells of ``target_col`` with a gradient-boosting regressor.

    Rows where ``target_col`` is present are used to fit a
    ``GradientBoostingRegressor`` on all other columns of ``data``; the fitted
    model then predicts the rows where ``target_col`` is NaN. Despite the
    ``rfr`` suffix in the name, the estimator is gradient boosting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``target_col`` and the feature columns. The input is
        not modified; a copy is filled and returned.
        NOTE(review): every other column is fed to the regressor as-is, so
        they are presumably expected to be numeric and NaN-free — confirm.
    target_col : str
        Name of the column to impute.

    Returns
    -------
    tuple
        ``(model, data)``: the fitted regressor and the copy of ``data`` with
        the NaN cells of ``target_col`` replaced by predictions.
    """
    data = data.copy()

    missing_mask = data[target_col].isna()
    known_rows = data[~missing_mask]

    X = known_rows.drop(columns=target_col)
    y = known_rows[target_col]

    # The model is fitted on the 80% part of this split only. The split is kept
    # so the fitted model (and therefore the imputed values) stays the same;
    # the 20% hold-out part is not used.
    X_train, _, y_train, _ = train_test_split(X, y,
                                              test_size=0.2,
                                              shuffle=True,
                                              random_state=32)

    model = GradientBoostingRegressor(n_estimators=110,
                                      max_depth=8,
                                      random_state=42,
                                      loss='huber',
                                      learning_rate=0.2)
    model.fit(X_train, y_train)

    # predict() rejects an empty feature matrix, so only call it when there is
    # at least one cell to impute.
    if missing_mask.any():
        rows_to_fill = data.loc[missing_mask].drop(columns=target_col)
        data.loc[missing_mask, target_col] = model.predict(rows_to_fill)
    return model, data

In [None]:
class DataPreprocessor:
    """Encode categorical columns, fill gaps and re-impute outliers of the credit data.

    ``fit`` learns every statistic (medians, clipping/outlier thresholds,
    dummy categories) from the frame it is given; ``transform`` only applies
    them, so a train and a test frame are processed with the same values.

    ``transform`` relies on the module-level ``imputer_rfr`` function.
    """

    # Ordinal codes for 'Home Ownership' ('Have Mortgage' is merged into
    # 'Home Mortgage' before encoding).
    HOME_OWNERSHIP_CODES = {'Own Home': 2, 'Home Mortgage': 1, 'Rent': 0}

    # Numeric codes for the 'Years in current job' labels.
    JOB_YEARS_CODES = {'10+ years': 10,
                       '9 years': 9,
                       '8 years': 8,
                       '7 years': 7,
                       '6 years': 6,
                       '5 years': 5,
                       '4 years': 4,
                       '3 years': 3,
                       '2 years': 2,
                       '1 year': 1,
                       '< 1 year': 0}

    # Three buckets for 'Purpose': 0 = debt consolidation, 2 = business, 1 = everything else.
    PURPOSE_CODES = {'debt consolidation': 0,
                     'other': 1,
                     'home improvements': 1,
                     'take a trip': 1,
                     'buy a car': 1,
                     'small business': 2,
                     'business loan': 2,
                     'wedding': 1,
                     'educational expenses': 1,
                     'buy house': 1,
                     'medical bills': 1,
                     'moving': 1,
                     'major purchase': 1,
                     'vacation': 1,
                     'renewable energy': 1}

    TERM_CODES = {'Short Term': 0, 'Long Term': 1}

    def __init__(self):
        self.medians = None
        self.years_max_quantille = None
        self.credit_max_quantille = None
        self.loan_quantille = None
        self.years_in_cur_job_median = None
        self.months_since_deliq_quantille = None
        self.ownership_categories = None

    def fit(self, df):
        """Learn medians, quantile thresholds and dummy categories from the raw frame ``df``."""
        # numeric_only=True: the raw frame still has string columns, on which
        # DataFrame.median() raises in pandas >= 2.0.
        self.medians = df.median(numeric_only=True)

        # Series.quantile skips NaN, whereas np.quantile would return NaN.
        self.years_max_quantille = df['Years of Credit History'].quantile(0.95)
        self.credit_max_quantille = df['Maximum Open Credit'].quantile(0.95)
        self.loan_quantille = df['Current Loan Amount'].quantile(0.88)

        # Median of the encoded job-years labels, used to fill gaps in transform().
        job_years = df['Years in current job'].map(self.JOB_YEARS_CODES)
        self.years_in_cur_job_median = job_years.median()

        # The delinquency threshold is taken after the gaps are filled with the
        # length of the credit history in months, mirroring transform().
        months = df['Months since last delinquent'].fillna(df['Years of Credit History'] * 12)
        self.months_since_deliq_quantille = months.quantile(0.92)

        # Fix the dummy categories so that every transformed frame gets the
        # same 'Ownership_*' columns, whichever values it happens to contain.
        ownership = df['Home Ownership'].replace('Have Mortgage', 'Home Mortgage')
        self.ownership_categories = sorted(ownership.dropna().unique())

    @staticmethod
    def _impute_where(df, column, mask):
        """Set ``column`` to NaN where ``mask`` is True and re-fill those cells via ``imputer_rfr``."""
        df.loc[mask, column] = np.nan
        _, df = imputer_rfr(df, column)
        return df

    def transform(self, df):
        """Return a cleaned copy of ``df``; the input frame is left untouched.

        Labels that are absent from the code tables above become NaN after
        encoding (for 'Purpose' they then fall into bucket 1).
        """
        df = df.copy()

        # 'Home Ownership'
        # Have Mortgage = Home Mortgage
        df['Home Ownership'] = df['Home Ownership'].replace('Have Mortgage', 'Home Mortgage')
        # One-hot copy of the column plus an ordinal encoding of the original.
        df['Ownership'] = pd.Categorical(df['Home Ownership'],
                                         categories=self.ownership_categories)
        df = pd.get_dummies(df, columns=['Ownership'])
        df['Home Ownership'] = df['Home Ownership'].map(self.HOME_OWNERSHIP_CODES)

        # 'Years in current job'
        df['Years in current job'] = (df['Years in current job']
                                      .map(self.JOB_YEARS_CODES)
                                      .fillna(self.years_in_cur_job_median))

        # 'Purpose'
        df['Purpose'] = df['Purpose'].map(self.PURPOSE_CODES).fillna(1)

        # 'Term'
        df['Term'] = df['Term'].map(self.TERM_CODES)

        # 'Months since last delinquent'
        # Fillna with number of credit history months
        df['Months since last delinquent'] = df['Months since last delinquent'].fillna(
            df['Years of Credit History'] * 12)

        # 'Annual Income' and 'Credit Score'
        # Remember which cells are missing, then put a placeholder (1) in them
        # so these columns can serve as NaN-free features while other columns
        # are being imputed.
        income_missing = df['Annual Income'].isna()
        df['Annual Income'] = df['Annual Income'].fillna(1)

        score_missing = df['Credit Score'].isna()
        df['Credit Score'] = df['Credit Score'].fillna(1)
        # Scores above 850 are divided by 10.
        inflated = df['Credit Score'] > 850
        df.loc[inflated, 'Credit Score'] = df.loc[inflated, 'Credit Score'] / 10

        # 'Bankruptcies'
        df['Bankruptcies'] = df['Bankruptcies'].fillna(0)

        # 'Years of Credit History'
        df['Years of Credit History'] = df['Years of Credit History'].clip(
            upper=self.years_max_quantille)

        # Fillna with GBR model: first the originally missing cells, then the
        # values beyond the thresholds learned in fit(). The order matters,
        # because each imputation uses the columns filled before it as features.
        df = self._impute_where(df, 'Annual Income', income_missing)
        df = self._impute_where(df, 'Credit Score', score_missing)
        df = self._impute_where(df, 'Current Loan Amount',
                                df['Current Loan Amount'] >= self.loan_quantille)
        df = self._impute_where(df, 'Maximum Open Credit',
                                df['Maximum Open Credit'] > self.credit_max_quantille)
        df = self._impute_where(df, 'Months since last delinquent',
                                df['Months since last delinquent'] > self.months_since_deliq_quantille)

        # Fillna just in case
        return df.fillna(self.medians)


In [None]:
class FeatureGenerator:
    """Add ratio and group-statistic features to an already preprocessed frame.

    ``fit`` learns the group statistics and fill values from the training
    frame (which must contain 'Credit Default'); ``transform`` applies them.
    """

    def __init__(self):
        self.medians = None
        self.ownership_rating = None
        self.job_years_rating = None
        self.credit_default_score_mode = None
        self.median_loan_by_purpose = None
        self.credit_to_income_median = None
        self.loan_to_income_median = None

    @staticmethod
    def _income_ratio(df, denominator_col):
        """Monthly income divided by ``denominator_col``, rounded to 3 digits; +/-inf become NaN."""
        ratio = np.round((df['Annual Income'] / 12) / df[denominator_col], 3)
        return ratio.replace([np.inf, -np.inf], np.nan)

    def fit(self, df):
        """Learn group medians, the defaulters' score mode and ratio fill values from ``df``."""
        self.medians = df.median(numeric_only=True)
        self.ownership_rating = df.groupby('Home Ownership')['Credit Score'].median().to_dict()
        self.job_years_rating = df.groupby('Years in current job')['Credit Score'].median().to_dict()

        # Most frequent credit score among defaulted rows. mode() can return
        # several values when there is a tie; take the first (smallest) one so
        # the result is always a scalar.
        defaulted_scores = df.loc[df['Credit Default'] == 1, 'Credit Score']
        self.credit_default_score_mode = defaulted_scores.mode().iloc[0]

        self.median_loan_by_purpose = df.groupby('Purpose')['Current Loan Amount'].median().to_dict()

        # Medians of the finite ratio values, used in transform() to fill the
        # cells where the division is undefined (zero denominator).
        self.credit_to_income_median = self._income_ratio(df, 'Monthly Debt').median()
        self.loan_to_income_median = self._income_ratio(df, 'Current Loan Amount').median()

    def transform(self, df):
        """Add the generated feature columns to ``df`` in place and return it."""
        # 'Credit to income ratio'
        credit_ratio = self._income_ratio(df, 'Monthly Debt')
        credit_fill = self.credit_to_income_median
        if credit_fill is None:
            # Not fitted: fall back to the median of the frame being transformed.
            credit_fill = credit_ratio.median()
        df['Credit to income ratio'] = credit_ratio.fillna(credit_fill)

        # 'Loan to income ratio'
        loan_ratio = self._income_ratio(df, 'Current Loan Amount')
        loan_fill = self.loan_to_income_median
        if loan_fill is None:
            loan_fill = loan_ratio.median()
        df['Loan to income ratio'] = loan_ratio.fillna(loan_fill)

        # 'Ownership_credit_rating': median credit score of the row's ownership group
        df['Ownership_credit_rating'] = df['Home Ownership'].map(self.ownership_rating)

        # 'Years in job rating': median credit score of the row's job-years group
        df['Years in job rating'] = df['Years in current job'].map(self.job_years_rating)

        # 'Credit Default Score Delta': distance to the defaulters' most frequent score
        df['Credit Default Score Delta'] = abs(df['Credit Score'] - self.credit_default_score_mode)

        # 'Loan_by_term': median loan amount of the row's 'Purpose' group
        # (the column is keyed by purpose despite its name).
        df['Loan_by_term'] = df['Purpose'].map(self.median_loan_by_purpose)

        return df

# Applying preprocessing and feature generation

In [None]:
# Fit the preprocessor on the training frame only, then apply it to both frames.
# NOTE(review): train_df and test_df are overwritten here, so this cell is not
# meant to be re-run without reloading the data first.
preprocessor = DataPreprocessor()
preprocessor.fit(train_df)
train_df = preprocessor.transform(train_df)
test_df = preprocessor.transform(test_df)


# Same pattern for the generated features: fit on the preprocessed train frame, apply to both.
feature_gen = FeatureGenerator()
feature_gen.fit(train_df)
train_df = feature_gen.transform(train_df)
test_df = feature_gen.transform(test_df)



In [None]:
# Column dtypes and non-null counts after preprocessing and feature generation.
train_df.info()

In [None]:
# Separate the target from the features without mutating train_df, so that
# this cell can be re-run safely (an in-place drop fails on the second run).
y = train_df[['Credit Default']]
features_df = train_df.drop(columns='Credit Default')

# Stratified 75/25 split keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.25, random_state=100, stratify=y)

# Model training and evaluation

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print a classification report for the train predictions, then for the test predictions."""
    sections = (
        ('TRAIN', y_train_true, y_train_pred),
        ('TEST', y_test_true, y_test_pred),
    )
    for title, y_true, y_pred in sections:
        print(f'{title}\n\n{classification_report(y_true, y_pred)}')


def evaluate_preds(model, X_train, X_test, y_train, y_test):
    """Predict on both splits with ``model`` and print the two classification reports."""
    train_pred, test_pred = (model.predict(part) for part in (X_train, X_test))
    get_classification_report(y_train, train_pred, y_test, test_pred)

In [None]:
# Ratio of class-0 to class-1 rows in the training split; it is passed to the
# classifier as the weight of class 1.
# The labels are counted directly instead of via int(y_train.value_counts()[k]),
# whose lookup behaves differently for a Series and a one-column DataFrame.
train_labels = np.ravel(y_train)
disbalance = int((train_labels == 0).sum()) / int((train_labels == 1).sum())
disbalance

In [None]:
# CatBoost classifier with class 1 up-weighted by the class ratio computed above.
# NOTE(review): the hold-out split is also passed as eval_set with
# use_best_model=True and early stopping, so the TEST report printed below was
# produced on data that influenced model selection and is likely optimistic.
model_catb = CatBoostClassifier(silent=True, random_state=21,
                                    #  cat_features=CAT_FEATURE_NAMES,
                                    class_weights=[1, disbalance],
                                    allow_writing_files=False,
                                    learning_rate=0.2,
                                    max_depth=3,
                                    n_estimators=95,
                                    eval_metric='F1',
                                    reg_lambda=2.617721518987342,
                                    early_stopping_rounds=30,
                                    use_best_model=True,
                                    custom_metric=['Precision', 'Recall'],
                                    subsample=0.8,)
                                     
model_catb.fit(X_train, y_train, eval_set=(X_test, y_test))

# Print classification reports for both splits.
evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

In [None]:
# cv = StratifiedKFold(n_splits=4, random_state=21, shuffle=True)
# parameters = { 

#               'max_depth':[2, 3], 

#               'subsample':[0.8,], 

#               'n_estimators':[65,], 
#               'learning_rate':[0.1, 0.2, 0.3, 0.4,0.5, 0.6, 0.7, 0.8, 0.9], 


#               'reg_lambda': [2.617721518987342],
#               }

# gs = GridSearchCV(model_catb, parameters, 
#                   scoring='f1', # metric
#                   cv=cv,

#                   n_jobs=-1
#                   )
# gs.fit(X_train, y_train, eval_set=(X_test, y_test))

# gs.best_params_

In [None]:
# F1 score on the hold-out split (the same split that was used as eval_set during fitting).
f1_score(y_test, model_catb.predict(X_test))

In [None]:
# Table of the model's feature importances, most important feature first.
feature_importances = pd.DataFrame(
    {
        'feature_name': list(X_train.columns),
        'importance': list(model_catb.feature_importances_),
    }
).sort_values(by='importance', ascending=False)
feature_importances

In [None]:
# Load the sample submission file; it is used as the template for the final submission.
SAMPLE_PATH = '../input/gb-credit-default/sample_submission.csv'
submit = pd.read_csv(SAMPLE_PATH)
submit.head()

In [None]:
# Select the test columns in the training feature order before predicting, so
# a missing or reordered column fails loudly instead of being silently misread.
predictions = model_catb.predict(test_df[X_train.columns])
predictions = predictions.astype('int32')

In [None]:
# Display the predicted labels for the test set.
predictions

In [None]:
# Put the predictions into the submission template.
# NOTE(review): assumes the rows of the template are in the same order as test_df — confirm.
submit['Credit Default'] = predictions
submit.head()

In [None]:
# Shape of the submission frame (rows, columns).
submit.shape

In [None]:
# Write the submission file without the index column.
# NOTE(review): the file name says "xgb", but the predictions come from the CatBoost model — consider renaming.
submit.to_csv('xgb_submit.csv', index=False)