In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
import optuna
from sklearn.base import clone  
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# Pandas display configuration: lift the limits on column width and row count
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)


In [2]:
# Read the competition train/test files and the additional "original" dataset
df = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
x_test = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
original_data = pd.read_csv('/kaggle/input/original-data/credit_risk_dataset.csv~/credit_risk_dataset.csv')

# Fill missing values in these two columns of the original dataset with the column median
for fill_column in ('person_emp_length', 'loan_int_rate'):
    column_median = original_data[fill_column].median()
    original_data[fill_column] = original_data[fill_column].fillna(column_median)

# Append the original rows below the competition training rows, renumbering the index
df = pd.concat([df, original_data], ignore_index=True)

In [3]:
# Preview the first rows of the concatenated training frame
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0.0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1.0,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2.0,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3.0,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4.0,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [4]:
def feature_engineering(df):
    """Encode the categorical loan columns as integers and add derived features.

    The input frame is not modified: all work is done on a copy, so the
    function can be called again on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``person_home_ownership``, ``loan_intent``,
        ``loan_grade``, ``cb_person_default_on_file``, ``person_income``,
        ``person_age``, ``person_emp_length``, ``loan_amnt``,
        ``loan_int_rate``, ``loan_percent_income`` and
        ``cb_person_cred_hist_length``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the four categorical columns hold integer codes
        and six derived ratio/product columns have been appended.

    Raises
    ------
    ValueError
        If a categorical column holds a value (including a missing value)
        that has no integer code in the tables below.
    """
    # Integer code for every known category.  The home-ownership codes skip
    # the value 2; that gap is kept so encoded values stay exactly as before.
    encodings = {
        'person_home_ownership': {
            'RENT': 0,
            'MORTGAGE': 1,
            'OWN': 3,
            'OTHER': 4,
        },
        'loan_intent': {
            'EDUCATION': 0,
            'MEDICAL': 1,
            'VENTURE': 2,
            'PERSONAL': 3,
            'DEBTCONSOLIDATION': 4,
            'HOMEIMPROVEMENT': 5,
        },
        'loan_grade': {
            'A': 0,
            'B': 1,
            'C': 2,
            'D': 3,
            'E': 4,
            'F': 5,
            'G': 6,
        },
        'cb_person_default_on_file': {
            'N': 0,
            'Y': 1,
        },
    }

    # Work on a copy so the caller's frame is never altered.
    df = df.copy()

    # Feature encoding
    for column, codes in encodings.items():
        encoded = df[column].map(codes)
        unmapped = encoded.isna()
        if unmapped.any():
            # Fail with the offending values instead of an opaque NaN-to-int cast error.
            unknown = df.loc[unmapped, column].unique().tolist()
            raise ValueError(
                f"Column '{column}' contains values without an encoding: {unknown}"
            )
        df[column] = encoded.astype(int)

    # New features (a zero denominator yields inf/NaN, as plain pandas division does)
    df['income_to_age_ratio'] = df['person_income'] / df['person_age']
    df['Employment_length_to_age_ratio'] = df['person_emp_length'] / df['person_age']
    df['income_loan_ratio'] = df['person_income'] / df['loan_amnt']
    df['loan_interest_amount'] = df['loan_amnt'] * df['loan_int_rate']
    df['loan_credit_hist_ratio'] = df['loan_amnt'] / df['cb_person_cred_hist_length']
    df['loan_income_interest_interaction'] = df['loan_percent_income'] * df['loan_int_rate']

    return df
    

In [5]:
# Engineer features on a copy and bind the result to a new name, so the raw
# `df` is never modified and this cell can be re-run safely (re-encoding an
# already-encoded frame would fail on the integer cast).
train_fe = feature_engineering(df.copy())
X = train_fe.drop(columns=['id', 'loan_status'])
Y = train_fe['loan_status']

# `drop` returns a new frame, so `x_test` keeps its `id` column for the submission
test = feature_engineering(x_test.drop(columns=['id']))

# Give the test matrix exactly the training column order; a missing column
# fails here with a KeyError instead of later inside the model.
test = test[X.columns]

In [6]:
# def objective(trial):
#     params = {
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 300),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),  
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 6), 
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),  
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1), 
#         'gamma': trial.suggest_float('gamma', 0, 2), 
#         'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 5.0)  
#     }

    
#     Xgb_model = XGBClassifier(eval_metric='auc', random_state=42, **params)
#     auc_scores = []
    
#     kfold = StratifiedKFold(n_splits=5, shuffle=True)
#     for train_index, test_index in kfold.split(X, Y):
#         model = clone(Xgb_model)
        
#         X_train, X_cv = X.iloc[train_index], X.iloc[test_index]
#         Y_train, Y_cv = Y.iloc[train_index], Y.iloc[test_index]
    
#         model.fit(X_train, Y_train)

#         predict_proba = model.predict_proba(X_cv)[:, 1]
#         score = roc_auc_score(Y_cv, predict_proba)
#         auc_scores.append(score)
    
#     return np.mean(auc_scores)

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)  # Adjust trials according to need

# best_params = study.best_params
# print(f'Best Parameters: {best_params}')


# model = XGBClassifier(
#     n_estimators=best_params['n_estimators'],
#     learning_rate=best_params['learning_rate'],
#     max_depth=best_params['max_depth'],
#     colsample_bytree=best_params['colsample_bytree'],
#     min_child_weight=best_params['min_child_weight'],
#     subsample=best_params['subsample'],
#     reg_lambda=best_params['reg_lambda'],
#     gamma=best_params['gamma'],  
#     scale_pos_weight=best_params['scale_pos_weight'],
#     random_state=42
# )

# model.fit(X,Y)

# pred = model.predict(test)
# result = pd.DataFrame({
#     'id': x_test['id'],
#     'loan_status': pred
# })
# result.to_csv('predictions.csv', index=False)

In [7]:

# XGBoost classifier with hard-coded hyperparameters
# (presumably taken from an earlier tuning run — TODO confirm)
model = XGBClassifier(
    max_depth=6,
    n_estimators=365,
    learning_rate=0.07910843677328683,
    colsample_bytree=0.7127414343531729,
    min_child_weight=8,
    subsample=0.9806063180496154,
    reg_lambda=0.35389413020007154,
    gamma=0.4740398195562466,
    scale_pos_weight=3.756051729328178,
    random_state=42
)

# Fit on the full training set
model.fit(X, Y)

# Write the positive-class probability instead of hard 0/1 labels: the model is
# evaluated with ROC AUC, which ranks by score, and thresholded labels discard
# that ranking information.
pred = model.predict_proba(test)[:, 1]
result = pd.DataFrame({
    'id': x_test['id'],
    'loan_status': pred
})
result.to_csv('predictions.csv', index=False)