In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [54]:
train_file = '../Data/train.csv'
test_file = '../Data/test.csv'

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

y = train_data.loan_paid_back
X = train_data.drop(columns=['loan_paid_back', 'id'])

X.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1


In [55]:
X.describe()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate
count,593994.0,593994.0,593994.0,593994.0,593994.0
mean,48212.202976,0.120696,680.916009,15020.297629,12.356345
std,26711.942078,0.068573,55.424956,6926.530568,2.008959
min,6002.43,0.011,395.0,500.09,3.2
25%,27934.4,0.072,646.0,10279.62,10.99
50%,46557.68,0.096,682.0,15000.22,12.37
75%,60981.32,0.156,719.0,18858.58,13.68
max,393381.74,0.627,849.0,48959.95,20.99


In [56]:
cols_with_missing = [col for col in X.columns if X[col].isnull().sum() > 0]
print(f"Columns with missing values: {cols_with_missing}")

Columns with missing values: []


In [57]:
X.drop(columns=['grade_subgrade', 'debt_to_income_ratio', 'interest_rate'], inplace=True)

In [58]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

cat_cols = [cname for cname in categorical_cols if X[cname].nunique() <= 10]

print(f"Categorical columns with low cardinality: {cat_cols}")

all_cols = numeric_cols + cat_cols

# print(X['grade_subgrade'].nunique())

Numeric columns: ['annual_income', 'credit_score', 'loan_amount']
Categorical columns: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose']
Categorical columns with low cardinality: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose']


In [59]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [60]:


OH_cat_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

OE_cat_transformer = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), cat_cols)
    ]
)

model = XGBRegressor(random_state=0)

OH_pipeline = Pipeline(steps=[
    ('preprocessor', OH_cat_transformer),
    ('model', model)
])

OE_pipeline = Pipeline(steps=[
    ('preprocessor', OE_cat_transformer),
    ('model', model)
])

In [61]:
estimators = [50, 100, 200, 300, 400]
learning_rates = [0.01, 0.05]

def evaluate_model(pipeline, X_train, y_train, X_valid, y_valid):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_valid)
    score = roc_auc_score(y_valid, y_pred)
    return score

In [62]:
for n_estimators in estimators:
    for learning_rate in learning_rates:
        OH_pipeline.set_params(model__n_estimators=n_estimators, model__learning_rate=learning_rate)
        OE_pipeline.set_params(model__n_estimators=n_estimators, model__learning_rate=learning_rate)
        
        oh_score = evaluate_model(OH_pipeline, X_train, y_train, X_valid, y_valid)
        oe_score = evaluate_model(OE_pipeline, X_train, y_train, X_valid, y_valid)
        
        print(f"n_estimators: {n_estimators}, learning_rate: {learning_rate} => OH AUC: {oh_score:.4f}, OE AUC: {oe_score:.4f}")

n_estimators: 50, learning_rate: 0.01 => OH AUC: 0.7881, OE AUC: 0.7865
n_estimators: 50, learning_rate: 0.05 => OH AUC: 0.7881, OE AUC: 0.7883
n_estimators: 100, learning_rate: 0.01 => OH AUC: 0.7881, OE AUC: 0.7868
n_estimators: 100, learning_rate: 0.05 => OH AUC: 0.7883, OE AUC: 0.7887
n_estimators: 200, learning_rate: 0.01 => OH AUC: 0.7882, OE AUC: 0.7881
n_estimators: 200, learning_rate: 0.05 => OH AUC: 0.7886, OE AUC: 0.7888
n_estimators: 300, learning_rate: 0.01 => OH AUC: 0.7881, OE AUC: 0.7886
n_estimators: 300, learning_rate: 0.05 => OH AUC: 0.7882, OE AUC: 0.7888
n_estimators: 400, learning_rate: 0.01 => OH AUC: 0.7881, OE AUC: 0.7886
n_estimators: 400, learning_rate: 0.05 => OH AUC: 0.7882, OE AUC: 0.7887


In [64]:
OE_pipeline.set_params(model__n_estimators=200, model__learning_rate=0.05)
OE_pipeline.fit(X, y)

preds = OE_pipeline.predict(test_data[all_cols])

submission = pd.DataFrame({
    'id': test_data['id'],
    'loan_paid_back': preds
})

submission.to_csv('./1st_submission.csv', index=False)