# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from IPython.display import display, HTML

warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.base import clone

from catboost import CatBoostClassifier, Pool

# Load Data

In [None]:
# The competition train/test files use their `id` column as the index; the
# original credit-risk dataset is read with a default integer index.
train_df = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv', index_col='id')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv', index_col='id')
orig_df = pd.read_csv('/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv')

In [None]:
# Stack the competition rows on top of the original-dataset rows and give
# the result a fresh 0..n-1 index, discarding both source indexes.
train_df = pd.concat([train_df, orig_df], ignore_index=True)

In [None]:
# Preview the first rows of the combined training frame.
train_df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage.
train_df.info()

## Removing Duplicate Data

In [None]:
# Number of rows that exactly repeat an earlier row.
train_df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully identical row; surviving rows
# keep their existing index labels.
train_df = train_df.drop_duplicates()

## Handling Missing Values

In [None]:
# Missing-value count per column.
train_df.isna().sum()

In [None]:
# Summary statistics of the two columns that are imputed in the next cell.
train_df[['loan_int_rate','person_emp_length']].describe()

In [None]:
# Replace missing values in each of these columns with that column's median
# (computed on the combined training frame).
for col in ('loan_int_rate', 'person_emp_length'):
    train_df[col] = train_df[col].fillna(train_df[col].median())

# EDA

In [None]:
# Name of the label column; used for plot hues, the X/y split and the
# submission column.
target = 'loan_status'

In [None]:
# Every column of the training frame except the label, in frame order.
# Index.drop raises KeyError if the label column is absent, like DataFrame.drop.
features = train_df.columns.drop(target).tolist()

In [None]:
# Columns stored with the `object` dtype are treated as the categorical features.
categorical_features = train_df.select_dtypes(include='object').columns.tolist()

In [None]:
# Features that are not categorical, kept in the same order as `features`.
# A set difference would return them in an arbitrary order that can change
# between interpreter runs (string hashing is randomised), which made the
# order of the per-feature plots non-reproducible.
numerical_features = [col for col in features if col not in categorical_features]

In [None]:
# Summary statistics of the numeric columns, transposed to one row per column.
train_df.describe().T

In [None]:
# Summary of the object-dtype (categorical) columns, transposed to one row
# per column.
train_df[categorical_features].describe(include='O').T

In [None]:
# For every categorical column, show the min / mean / max of the loan amount
# and the interest rate within each category level.
summary_columns = ['loan_amnt', 'loan_int_rate']
summary_stats = ['min', 'mean', 'max']

for col in categorical_features:
    per_level = train_df.groupby(col)[summary_columns].agg(summary_stats)
    display(per_level)

# Feature Distribution

In [None]:
def feature_distribution_plot(df, col, target_col=None):
    """Draw a two-panel distribution figure for one column of ``df``.

    Columns whose dtype is not ``object`` get a histogram (left) and a box
    plot (right). ``object`` columns get a count plot (left) and a
    donut-style pie chart of the value counts (right).

    Args:
        df: DataFrame containing ``col`` (and ``target_col`` when given).
        col: Name of the column to visualise.
        target_col: Optional column name passed as ``hue`` to the histogram,
            count plot and box plot. The pie chart does not use it.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Decide the plot family once; both panels depend on the same check.
    is_numeric = df[col].dtype != 'object'

    # Explicit axes instead of the implicit "current axes" state, so each
    # call below is guaranteed to draw on the intended panel.
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: frequency view.
    if is_numeric:
        sns.histplot(data=df, x=col, hue=target_col, ax=left_ax)
    else:
        sns.countplot(data=df, x=col, hue=target_col, ax=left_ax)
    left_ax.set_ylabel('Count')
    left_ax.set_xlabel(f'{col}')
    left_ax.set_title(f'Histogram of {col}')

    # Right panel: spread (numeric) or share of each level (categorical).
    if is_numeric:
        sns.boxplot(data=df, x=col, hue=target_col, ax=right_ax)
        right_ax.set_title(f'Boxplot of {col}')
        # The box plot is horizontal (values on x), so its y-axis carries no
        # counts; the previous 'Count' label was misleading.
        right_ax.set_ylabel('')
        right_ax.set_xlabel(f'{col}')
    else:
        df[col].value_counts().plot(
            kind='pie',
            autopct='%.0f%%',
            pctdistance=0.85,
            fontsize=12,
            ax=right_ax,
        )
        # A white disc over the centre turns the pie into a donut.
        right_ax.add_artist(plt.Circle((0, 0), radius=0.7, fc='white'))
        right_ax.set_title(f'Pie Chart of {col}')
        right_ax.set_xlabel('')
        right_ax.set_ylabel('')

    fig.tight_layout()
    # y > 1 places the overall title above the panel titles.
    fig.suptitle(f'Distribution of {col}', y=1.05, size=24, weight='bold')
    plt.show()

## Distribution of Numerical Features

In [None]:
# One distribution figure per numerical feature, with the label as hue.
for col in numerical_features:
    feature_distribution_plot(train_df, col, target)

In [None]:
# Violin plot of every numerical feature split by the label, laid out on a
# three-column grid. The row count follows the number of features; a fixed
# 3x3 grid raises as soon as there are more than nine of them.
n_cols = 3
n_rows = max(1, -(-len(numerical_features) // n_cols))  # ceiling division

# 4 inches per row reproduces the original 16x12 figure for three rows.
plt.figure(figsize=(16, 4 * n_rows))

for i, col in enumerate(numerical_features):
    plt.subplot(n_rows, n_cols, i + 1)
    # hue duplicates x only to colour the violins, so the legend is
    # suppressed; False is the documented value for that.
    sns.violinplot(train_df, x=target, y=col, hue=target, legend=False)

plt.tight_layout()
plt.show()

## Distribution of Categorical Features

In [None]:
# One distribution figure per categorical feature, with the label as hue.
for col in categorical_features:
    feature_distribution_plot(train_df, col, target)

## Target Distribution

In [None]:
# Label balance: absolute class counts on the left, percentage share of each
# class as a donut chart on the right.
fig, (count_ax, share_ax) = plt.subplots(1, 2, figsize=(14, 6))

sns.countplot(train_df, x=target, ax=count_ax)

class_counts = train_df[target].value_counts()
class_counts.plot.pie(ax=share_ax, autopct='%.0f%%', pctdistance=0.85, fontsize=12)
# White disc over the centre of the pie produces the donut look.
share_ax.add_artist(plt.Circle((0, 0), radius=0.7, fc='white'))

fig.tight_layout()
plt.show()

## Correlation Matrix

In [None]:
# Work on a copy so integer-encoding the categoricals leaves train_df intact.
train_new = train_df.copy()

# factorize() replaces each category with an integer code. The codes carry
# no natural ordering, so correlations involving these columns are only a
# rough indication.
for col in categorical_features:
    train_new[col], _ = train_new[col].factorize()

cor_mat = train_new.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
# The mask is built from the matrix shape, not its values: np.triu(cor_mat)
# leaves a cell unmasked whenever the coefficient there is exactly 0.
mask = np.triu(np.ones_like(cor_mat, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(cor_mat, fmt='.2f', annot=True, mask=mask, cmap='coolwarm')
plt.show()

# Model Training And Prediction

In [None]:
def convert_to_string(df, columns=None):
    """Return a new frame with NaNs zero-filled and columns cast to ``string``.

    Every missing value in ``df`` is first replaced by ``0``; the selected
    columns are then converted to pandas' ``string`` dtype so they can be
    passed to CatBoost as categorical features. ``df`` itself is not
    modified.

    Args:
        df: Input DataFrame. Must contain every selected column.
        columns: Iterable of column names to convert. When ``None`` (the
            default) the module-level ``features`` list is used, which is
            the behaviour existing callers rely on.

    Returns:
        A new DataFrame with the same columns as ``df``.

    Raises:
        KeyError: If a selected column is not present in ``df``.
    """
    cols = features if columns is None else list(columns)
    # fillna returns a new object, so no explicit copy is needed to protect
    # the caller's frame.
    filled = df.fillna(0)
    return filled.astype({col: 'string' for col in cols})

In [None]:
# 10-fold stratified splitter with shuffling; the fixed seed keeps the fold
# assignment the same across runs.
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=99)

In [None]:
# Feature matrix: every column except the label.
X = train_df.drop(target, axis=1)
# Label vector as a 1-D NumPy array, so the positional fold indices produced
# by the splitter can index it directly. Series.ravel() is deprecated since
# pandas 2.2; to_numpy() returns the same array.
y = train_df[target].to_numpy()

In [None]:
# Cross-validated CatBoost training.
#   oof_preds       - one array of competition-test probabilities per fold
#   oof_aucs        - validation ROC-AUC of each fold
#   oof_train_preds - out-of-fold probability for every training row
oof_preds = []
oof_aucs = []
oof_train_preds = np.zeros(len(y))

cat_params = {
    'task_type': "CPU",
    'loss_function': 'Logloss',
    'eval_metric': "AUC",
    'bagging_temperature': 0.25,
    'colsample_bylevel': 0.40,
    'iterations': 5_000,
    'learning_rate': 0.045,
    'max_depth': 5,
    'l2_leaf_reg': 0.80,
    'min_data_in_leaf': 30,
    'random_strength': 0.25,
    'random_state': 42,
    # Single source of truth for early stopping; it is no longer repeated in
    # fit(), where a second copy could silently diverge from this value.
    'early_stopping_rounds': 20,
    'use_best_model': True,
    'allow_writing_files': False,
}

# Every feature (numeric ones included) is stringified and declared
# categorical for CatBoost.
X_cat = convert_to_string(X)
test_cat = convert_to_string(test_df)

# The test pool does not change between folds, so it is built once.
test_pool = Pool(test_cat, cat_features=features)

for fold, (train_idx, test_idx) in enumerate(skfold.split(X_cat, y)):
    X_train, y_train = X_cat.iloc[train_idx], y[train_idx]
    X_test, y_test = X_cat.iloc[test_idx], y[test_idx]

    X_train_pool = Pool(X_train, y_train, cat_features=features)
    X_test_pool = Pool(X_test, y_test, cat_features=features)

    # NOTE: the validation fold also drives early stopping / best-iteration
    # selection, so the fold AUCs reported below are somewhat optimistic.
    cat_clf = CatBoostClassifier(**cat_params)
    cat_clf = cat_clf.fit(X=X_train_pool,
                          eval_set=X_test_pool,
                          verbose=500)

    # Reuse the validation pool for prediction instead of building a second,
    # identical pool for the same rows in every fold.
    oof_train_preds[test_idx] = cat_clf.predict_proba(X_test_pool)[:, 1]
    test_pred = cat_clf.predict_proba(test_pool)[:, 1]

    oof_preds.append(test_pred)
    auc = roc_auc_score(y_test, oof_train_preds[test_idx])
    oof_aucs.append(auc)
    print(f"\nFold {fold+1}--> ROC-AUC Score: {auc:.6f}\n")

    # Release per-fold objects before the next fold allocates its own.
    del X_train, y_train, X_test, y_test
    del X_train_pool, X_test_pool
    del cat_clf
    gc.collect()

auc_mean = np.mean(oof_aucs)
auc_std = np.std(oof_aucs)
print(f"\nAverage Fold ROC-AUC Score: {auc_mean:.6f} \xB1 {auc_std:.6f}\n")


# Final test prediction: plain average of the per-fold test probabilities.
test_pred_cat = np.mean(oof_preds, axis=0)

# Submission

In [None]:
# Submission template; its label column is overwritten with the model
# predictions before the file is written.
sub = pd.read_csv('/kaggle/input/playground-series-s4e10/sample_submission.csv')

In [None]:
# Weighted blend of model predictions. Only the CatBoost prediction is listed,
# with weight 1, so the result equals that prediction.
res = np.average([test_pred_cat], weights=[1], axis=0)

In [None]:
# Store the predictions under the label column and write the file without
# the DataFrame index.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv - confirm.
sub[target] = res
sub.to_csv('submission.csv', index=False)