Import Libraries

In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

Load Data

In [65]:
# Read the labelled training split and the unlabelled test split.
# Paths point under /content (presumably a Colab session - adjust when running elsewhere).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Train_set.csv", "/content/Test_set.csv")
)


Preprocessing

In [66]:
target = 'default'

# Separate the label from the features; 'ID' is dropped from both feature
# matrices, and the test IDs are kept aside for the submission file.
y = train[target]
test_ids = test['ID']

# Missing values are replaced by the sentinel -999 in every column
# (object columns included, where it later becomes the string '-999').
X = train.drop(columns=['ID', target]).fillna(-999)
test_input = test.drop(columns=['ID']).fillna(-999)

# Integer-encode every object-typed column. Each encoder is fitted on the
# union of train and test values so that transform() never meets a category
# it has not seen.
cat_cols = X.select_dtypes(include='object').columns
for col in cat_cols:
    full_col_data = pd.concat([X[col], test_input[col]], axis=0).astype(str)
    le = LabelEncoder().fit(full_col_data)
    X[col] = le.transform(X[col].astype(str))
    test_input[col] = le.transform(test_input[col].astype(str))

Feature Engineering

In [67]:
# Must match the fill value used for missing data in the preprocessing cell.
MISSING_SENTINEL = -999


def add_ratio_features(frame):
    """Return a copy of ``frame`` with three ratio features appended.

    Added columns (in this order):
      * ``loan_to_income``     = loan_amnt / (annual_income + 1)
      * ``balance_to_limit``   = total_current_balance / (total_revolving_limit + 1)
      * ``credit_utilization`` = revolving_balance / (total_revolving_limit + 1)

    The ``+ 1`` keeps the denominator non-zero when the underlying amount is 0.

    Rows where either input equals ``MISSING_SENTINEL`` get the sentinel as the
    ratio as well. Without this, a missing numerator and denominator would give
    -999 / -998 ~= 1.0, which is indistinguishable from a genuine ratio.
    NOTE(review): assumes -999 never occurs as a real value in these columns.
    """

    def sentinel_aware_ratio(numerator, denominator):
        ratio = frame[numerator] / (frame[denominator] + 1)
        is_missing = (
            frame[numerator].eq(MISSING_SENTINEL)
            | frame[denominator].eq(MISSING_SENTINEL)
        )
        return ratio.mask(is_missing, MISSING_SENTINEL)

    return frame.assign(
        loan_to_income=sentinel_aware_ratio('loan_amnt', 'annual_income'),
        balance_to_limit=sentinel_aware_ratio('total_current_balance', 'total_revolving_limit'),
        credit_utilization=sentinel_aware_ratio('revolving_balance', 'total_revolving_limit'),
    )


# Apply the same transformation to both matrices so their columns stay aligned.
X = add_ratio_features(X)
test_input = add_ratio_features(test_input)

Feature Scaling

In [68]:
# Standardise every feature using statistics learned from the training matrix;
# the test matrix is transformed with those same statistics.
# Note: the scaler is fitted on all of X before the cross-validation split
# below, so each validation fold contributes to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
test_scaled = scaler.transform(test_input)

Model Training with Stratified K-Fold

In [69]:
# Stratified, shuffled 5-fold CV with a fixed seed for reproducible splits.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accumulates the fold-averaged probability of the positive class for each test row.
test_preds = np.zeros(len(test_scaled))
val_scores = []

# Class-imbalance weight: count(label 0) / count(label 1) over the full target.
# It does not depend on the fold, so it is computed once instead of per iteration.
# NOTE(review): indexing by 0 and 1 assumes the target is encoded as integers 0/1.
class_counts = y.value_counts()
pos_weight = class_counts[0] / class_counts[1]

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled, y)):
    print(f"Training fold {fold + 1}/{kf.n_splits}...")

    # X_scaled is a NumPy array (positional indexing); y is a Series, hence .iloc.
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = LGBMClassifier(
        n_estimators=1500,
        learning_rate=0.03,
        num_leaves=60,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=pos_weight,
        reg_alpha=0.2,
        reg_lambda=0.3,
        min_child_samples=25,
        random_state=42,
        verbose=-1
    )

    # No early-stopping callback is configured, so all n_estimators trees are
    # trained; the validation set is only evaluated, it does not stop training.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='binary_error'
    )

    # Out-of-fold accuracy using the classifier's default decision rule.
    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    val_scores.append(acc)
    print(f"Fold {fold + 1} Accuracy: {acc:.4f}")

    # Each fold contributes 1/n_splits of the final averaged test probability.
    test_preds += model.predict_proba(test_scaled)[:, 1] / kf.n_splits

print("\nAverage CV Accuracy:", np.mean(val_scores))

Training fold 1/5...
Fold 1 Accuracy: 0.8877
Training fold 2/5...
Fold 2 Accuracy: 0.8831
Training fold 3/5...
Fold 3 Accuracy: 0.8893
Training fold 4/5...
Fold 4 Accuracy: 0.8847
Training fold 5/5...
Fold 5 Accuracy: 0.8855

Average CV Accuracy: 0.8860948272801025


Final Thresholding and Submission

In [70]:
# Turn the fold-averaged probabilities into hard 0/1 labels at a 0.5 cut-off.
test_final = (test_preds >= 0.5).astype(int)

# Pair every test ID with its predicted label.
submission = pd.DataFrame({'ID': test_ids}).assign(default=test_final)

# Write the two-column file without the DataFrame index.
submission.to_csv("final_submission.csv", index=False)
print("✅ Submission file 'final_submission.csv' created successfully!")

✅ Submission file 'final_submission.csv' created successfully!
