In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/study-dir/loan_status_study_xgb.db
/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv


In [2]:
import pandas as pd

print(f'reading the csv files into pandas dataframes.')
train_df = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
sub_df = pd.read_csv('/kaggle/input/playground-series-s4e10/sample_submission.csv')


reading the csv files into pandas dataframes.


In [3]:
# train_df.info()

In [4]:
# test_df.info()

In [5]:
# train_df.isnull().any(), test_df.isnull().any()

In [6]:
cat_cols = train_df.select_dtypes('object').columns
num_cols = []
for col in train_df.select_dtypes(exclude='object').columns:
    if col not in ('id', 'loan_status'):
        num_cols.append(col)
# num_cols, cat_cols

In [7]:
# train_df['loan_status'].value_counts(normalize=True)

In [8]:
import matplotlib.pyplot as plt

def box_plots_num_cols(df, columns):
    df = df.copy()
    base_width = 10
    base_height = 5
    rows = len(columns) 
    cols = 1
    fig_width = cols * base_width
    fig_height = rows * base_height
    fig, axes = plt.subplots(rows, cols, figsize=(fig_width, fig_height))
    axes = axes.flatten() if rows > 1 else [axes]
    for i, col in enumerate(columns):
        axes[i].boxplot(df[col])  
        axes[i].set_title(col)
    for j in range(i+1, len(axes)):
        axes[j].axis('off')
    plt.tight_layout()

In [9]:
# box_plots_num_cols(train_df, num_cols)

In [10]:
from scipy import stats
import numpy as np

def remove_outliers(df:pd.DataFrame(), cols, beta):
    df = df.copy()
    for col in cols:
#         print(f'removing {col} outliers:\n')
        data = df[col]
        iqr = stats.iqr(data)
        q1 = np.percentile(data, 25)
        q3 = np.percentile(data, 75)
        outlier_low = q1 - beta * iqr
        outlier_high = q1 + beta * iqr
#         print(f'outlier_low:{outlier_low}, outlier_high: {outlier_high}')
        df = df[(data >= outlier_low) & (data <= outlier_high)]
    return df


In [11]:
print(f'removing outliers before oversampling')
df_clean = remove_outliers(train_df, num_cols, 1.5)
# box_plots_num_cols(df_clean, num_cols)
df_clean.shape, train_df.shape

removing outliers before oversampling


((24802, 13), (58645, 13))

In [12]:
def hist_plot_cat_cols(df:pd.DataFrame(), cat_cols=cat_cols):
    df = df.copy()
    fig, axes_cat = plt.subplots(1, len(cat_cols), figsize=(20, 10))
    for i, col in enumerate(cat_cols):
        axes_cat[i].hist(df[col])
        axes_cat[i].set_title(col)

In [13]:
# hist_plot_cat_cols(train_df)

In [14]:
# hist_plot_cat_cols(df_clean)

In [15]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

print('encoding the categorical features')
df_clean = df_clean.drop(columns=['id'])

ord_enc = OrdinalEncoder()
one_hot_enc = OneHotEncoder(sparse_output=False)

ordinal_cols = ['loan_grade']
one_hot_cols = [col for col in cat_cols if col != 'loan_grade']
remaining_cols = [col for col in df_clean.columns if col not in cat_cols]

encoder = ColumnTransformer(
            transformers=[
                ('ordinal_encoder', ord_enc, ordinal_cols),
                ('one_hot_encoder', one_hot_enc, one_hot_cols),
                ('passthrough', 'passthrough', remaining_cols )
            ]
)

encoded_data = encoder.fit_transform(df_clean)
one_hot_enc.fit(df_clean[one_hot_cols])
one_hot_encoded_cols = one_hot_enc.get_feature_names_out(one_hot_cols)
all_cols = ordinal_cols + list(one_hot_encoded_cols) + remaining_cols
df_encoded = pd.DataFrame(encoded_data, columns=all_cols)
df_encoded.info()

encoding the categorical features
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24802 entries, 0 to 24801
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   loan_grade                      24802 non-null  float64
 1   person_home_ownership_MORTGAGE  24802 non-null  float64
 2   person_home_ownership_OTHER     24802 non-null  float64
 3   person_home_ownership_OWN       24802 non-null  float64
 4   person_home_ownership_RENT      24802 non-null  float64
 5   loan_intent_DEBTCONSOLIDATION   24802 non-null  float64
 6   loan_intent_EDUCATION           24802 non-null  float64
 7   loan_intent_HOMEIMPROVEMENT     24802 non-null  float64
 8   loan_intent_MEDICAL             24802 non-null  float64
 9   loan_intent_PERSONAL            24802 non-null  float64
 10  loan_intent_VENTURE             24802 non-null  float64
 11  cb_person_default_on_file_N     24802 non-null  float64
 12

In [16]:
from imblearn.over_sampling import SMOTE, SVMSMOTE

print(f'oversampling using SVMSMOTE')
df_enc_copy = df_encoded.copy()

smote = SMOTE(random_state=32,  k_neighbors=10)
svm_smote = SVMSMOTE(random_state=32, k_neighbors=10,m_neighbors=10)

X = df_enc_copy.drop(columns=['loan_status'])
y = df_enc_copy.loc[:, 'loan_status']

X_smote, y_smote = smote.fit_resample(X, y)
X_svm_smote, y_svm_smote = svm_smote.fit_resample(X, y)
X_smote.shape, y_smote.shape, X_svm_smote.shape, y_svm_smote.shape

oversampling using SVMSMOTE


((46192, 20), (46192,), (46192, 20), (46192,))

In [17]:
smote_df = X_smote
smote_df['loan_status'] = y_smote

svm_smote_df = X_svm_smote
svm_smote_df['loan_status'] = y_svm_smote


In [18]:
# c = [col for col in smote_df.columns if col not in one_hot_encoded_cols]
# box_plots_num_cols(smote_df, c)

In [19]:
# c2 = [col for col in svm_smote_df.columns if col not in one_hot_encoded_cols]
# box_plots_num_cols(svm_smote_df, c2)

In [20]:
# smote_corr = smote_df.corr()
# svm_smote_corr = svm_smote_df.corr()

In [21]:
import seaborn as sns

def heatmap_corr(corr):
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title('Correlation Heatmap')
    plt.show()

In [22]:
# heatmap_corr(smote_corr)

In [23]:
# heatmap_corr(svm_smote_corr)

In [24]:
# high_smote_corr = smote_corr[(abs(smote_corr) > 0.7) & (abs(smote_corr) != 1.0)]
# heatmap_corr(high_smote_corr)

In [25]:
# high_svm_smote_corr = svm_smote_corr[(abs(svm_smote_corr) > 0.7) & (abs(svm_smote_corr) != 1.0)]
# heatmap_corr(high_svm_smote_corr)

In [26]:
print('dropping the column person home ownership rent')
df_final = svm_smote_df.drop(columns=[ 'person_home_ownership_RENT'])
df_final.info()

dropping the column person home ownership rent
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46192 entries, 0 to 46191
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   loan_grade                      46192 non-null  float64
 1   person_home_ownership_MORTGAGE  46192 non-null  float64
 2   person_home_ownership_OTHER     46192 non-null  float64
 3   person_home_ownership_OWN       46192 non-null  float64
 4   loan_intent_DEBTCONSOLIDATION   46192 non-null  float64
 5   loan_intent_EDUCATION           46192 non-null  float64
 6   loan_intent_HOMEIMPROVEMENT     46192 non-null  float64
 7   loan_intent_MEDICAL             46192 non-null  float64
 8   loan_intent_PERSONAL            46192 non-null  float64
 9   loan_intent_VENTURE             46192 non-null  float64
 10  cb_person_default_on_file_N     46192 non-null  float64
 11  cb_person_default_on_file_Y     46192 non-null

In [27]:
from sklearn.model_selection import StratifiedShuffleSplit

print('splitting the dataset into train and test')
X = df_final.drop(columns=['loan_status'])
y = df_final.loc[:, 'loan_status']

sss = StratifiedShuffleSplit(n_splits=1, random_state=32, test_size=0.3)
for tr, te in sss.split(X, y):
    X_train, y_train = X.iloc[tr], y.iloc[tr]
    X_test, y_test = X.iloc[te], y.iloc[te]
X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

splitting the dataset into train and test


((46192, 19), (46192,), (32334, 19), (32334,), (13858, 19), (13858,))

In [None]:
import optuna
from sklearn.metrics import make_scorer, f1_score, fbeta_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import xgboost as xgb
from xgboost import XGBClassifier
import numpy as np

sss2 = StratifiedShuffleSplit(n_splits=20, test_size=0.1, random_state=32)

def objective(trial:optuna.Trial):

    params = {
        'learning_rate':trial.suggest_float('learning_rate', 0.1, 0.5, log=True),
        'n_estimators':trial.suggest_int('n_estimators', 10, 1000),
        'gamma':trial.suggest_int('gamma', 0, 5),
        'subsample':trial.suggest_float('subsample', 0.5, 1.0),
        'lambda':trial.suggest_float('lambda', 0.0, 1.0),
        'alpha':trial.suggest_float('alpha', 0.0, 1.0),
    }
    model = XGBClassifier(random_state=42, booster='gbtree', device='cuda', **params)
    pipe = Pipeline([
        ('poly', PolynomialFeatures(interaction_only=True, include_bias=False, degree=3)),
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    scoring = {
        'f_3_score':make_scorer(fbeta_score, beta=3),
        'precision_score':make_scorer(precision_score),
        'recall_score':make_scorer(recall_score)
    }
    scores = cross_validate(pipe, X_train, y_train, cv=sss2, scoring=scoring)
    f_3_score_mean = np.mean(scores['test_f_3_score'])
    precision_score_mean = np.mean(scores['test_precision_score'])
    recall_score_mean = np.mean(scores['test_recall_score'])
    return (f_3_score_mean, precision_score_mean, recall_score_mean)


In [None]:
# print(f'starting the hyperparameter tuning using optuna with 50 trials with  using scoring_metrics:f3, precision, recall')

# study_df_file_name = 'loan_status_study_xgb.db'
# study_name = 'loan_status_study_xgb'

# print(f'saving the study db file name: {study_df_file_name}')
# print(f'study name: {study_name}')

# study = optuna.create_study(
#             directions=['maximize', 'maximize', 'maximize'],
#             study_name=study_name,
#             storage=f'sqlite:///{study_df_file_name}',
#             load_if_exists=True,
            
# )
# study.optimize(objective, n_jobs=-1, n_trials=50, show_progress_bar=True)

In [29]:
import optuna
study = optuna.load_study(study_name='loan_status_study_xgb', storage='sqlite://///kaggle/input/study-dir/loan_status_study_xgb.db')

In [30]:
for trial in study.best_trials:
    best_params = trial.params
    print(f'values: {trial.values}')
    break
print('found the best params:\n')
best_params

values: [0.9648503130916449, 0.9898552074454754, 0.962152133580705]
found the best params:



{'learning_rate': 0.1549767193916382,
 'n_estimators': 960,
 'gamma': 0,
 'subsample': 0.6733475491016273,
 'lambda': 0.22093230483031567,
 'alpha': 0.3917641584711654}

In [31]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

print('scaling the X train')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

final_model = XGBClassifier(random_state=42, verbosity=2, booster='gbtree', device='cuda', **best_params)
print(f'fitting the train data with the best hyperparams found from optuna.')
final_model.fit(X_train_scaled, y_train)

scaling the X train
fitting the train data with the best hyperparams found from optuna.




In [32]:
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, r2_score, precision_score, recall_score, fbeta_score
import pandas as pd

print(f'Calculating the scores on the test set: ')
X_test_scaled = scaler.transform(X_test)
y_pred = final_model.predict(X_test_scaled)

columns = ['f1', 'roc_auc',  'precision', 'recall', 'f_2_score', 'f_3_score']
scores = [f1_score(y_test, y_pred), roc_auc_score(y_test, y_pred),  precision_score(y_test, y_pred), recall_score(y_test, y_pred), fbeta_score(y_test, y_pred, beta=2), fbeta_score(y_test, y_pred, beta=3)]
res_df = pd.DataFrame([scores], columns=columns)
res_df.head()

Calculating the scores on the test set: 


Unnamed: 0,f1,roc_auc,precision,recall,f_2_score,f_3_score
0,0.974258,0.974527,0.98467,0.964064,0.968116,0.966086


In [33]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


In [34]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


print('encoding the final test data set')
ord_enc_test = OrdinalEncoder()
one_hot_enc_test = OneHotEncoder(sparse_output=False)

test_cat_cols = test_df.select_dtypes('object').columns
ordinal_cols = ['loan_grade']
one_hot_cols = [col for col in test_cat_cols if col != 'loan_grade']
remaining_cols = [col for col in test_df.columns if col not in list(test_cat_cols)]

encoder_test = ColumnTransformer(
            transformers=[
                ('ordinal_encoder', ord_enc_test, ordinal_cols),
                ('one_hot_encoder', one_hot_enc_test, one_hot_cols),
                ('passthrough', 'passthrough', remaining_cols )
            ]
)

encoded_data = encoder_test.fit_transform(test_df)
one_hot_enc_test.fit(test_df[one_hot_cols])
one_hot_encoded_cols = one_hot_enc_test.get_feature_names_out(one_hot_cols)
all_cols = ordinal_cols + list(one_hot_encoded_cols) + remaining_cols
test_df_encoded = pd.DataFrame(encoded_data, columns=all_cols)
test_df_encoded.info()

encoding the final test data set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   loan_grade                      39098 non-null  float64
 1   person_home_ownership_MORTGAGE  39098 non-null  float64
 2   person_home_ownership_OTHER     39098 non-null  float64
 3   person_home_ownership_OWN       39098 non-null  float64
 4   person_home_ownership_RENT      39098 non-null  float64
 5   loan_intent_DEBTCONSOLIDATION   39098 non-null  float64
 6   loan_intent_EDUCATION           39098 non-null  float64
 7   loan_intent_HOMEIMPROVEMENT     39098 non-null  float64
 8   loan_intent_MEDICAL             39098 non-null  float64
 9   loan_intent_PERSONAL            39098 non-null  float64
 10  loan_intent_VENTURE             39098 non-null  float64
 11  cb_person_default_on_file_N     39098 non-null  float64
 12 

In [35]:
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, r2_score, precision_score, recall_score, fbeta_score
import pandas as pd

test_df_encoded = test_df_encoded.drop(columns=['id', 'person_home_ownership_RENT'])
X_final_test_scaled = scaler.fit_transform(test_df_encoded)
y_pred = final_model.predict(X_final_test_scaled)

Calculating the scores on the final test set: 


In [36]:
test_2_df = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

print('preparing the final submission df')
final_submission_df = pd.DataFrame({
    'id':test_2_df['id'],
    'loan_status':y_pred
})

final_submission_df.head()

preparing the final submission df


Unnamed: 0,id,loan_status
0,58645,1
1,58646,1
2,58647,1
3,58648,1
4,58649,1


In [38]:
print(f'converting final submission df to csv file with name: final_submission.csv')
final_submission_df.to_csv('submission.csv', index=False)

converting final submission df to csv file with name: final_submission.csv
