In [26]:
import pandas as pd
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score


In [27]:
# Location of the training CSV under the Kaggle input directory.
TRAIN_CSV_PATH = "/kaggle/input/loan-pay-back-dataset-2025-2026-credit-to-kaggle/train.csv"  # adjust name if needed

# Read the file into a DataFrame and preview its first rows.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [28]:
# Candidate classifiers to compare. Each one is built with the same fixed
# seed (random_state=42) so repeated runs give repeatable results.

# Logistic regression, allowing the solver up to 1000 iterations.
logistic_model = LogisticRegression(
    max_iter=1000,
    random_state=42
)

# Random forest; only the seed is set here, the remaining hyperparameters
# are left at the library defaults.
random_forest_model = RandomForestClassifier(
    random_state=42
)

# Gradient-boosted trees evaluated with log loss.
# `use_label_encoder` is deliberately not passed: the flag is deprecated in
# XGBoost, and recent releases ignore it while emitting a
# "Parameters ... are not used" warning on every fit, which floods the
# output once the estimator is fitted repeatedly inside a grid search.
xgboost_model = XGBClassifier(
    eval_metric="logloss",
    random_state=42
)


In [29]:
# Hyperparameter search space for each estimator, keyed by the estimator's
# display name. Every listed value combination is one grid-search candidate.
param_grids = {
    # Regularisation parameter C for the linear model.
    "Logistic Regression": {"C": [0.01, 0.1, 1, 10]},
    # Ensemble size, tree depth limit and split threshold for the forest.
    "Random Forest": {
        "n_estimators": [100, 200], "max_depth": [10, 20], "min_samples_split": [2, 5],
    },
    # Number of boosting rounds, tree depth limit and learning rate.
    "XGBoost": {
        "n_estimators": [100, 200], "max_depth": [3, 6], "learning_rate": [0.05, 0.1],
    },
}


In [30]:
# Display name -> unfitted estimator. The training loop uses the same names
# to look up each estimator's search space in `param_grids`.
models = {}
models["Logistic Regression"] = logistic_model
models["Random Forest"] = random_forest_model
models["XGBoost"] = xgboost_model


In [31]:
# Shared cross-validation splitter: five shuffled folds with a fixed seed,
# so every grid search that receives `kf` is evaluated on the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [32]:
# Tune every candidate with a cross-validated grid search, then score the
# refitted winner on the X_test_scaled / y_test split.
# NOTE(review): X_train_scaled, y_train, X_test_scaled and y_test are not
# defined in the cells shown here; confirm they are created earlier in the
# notebook so that Restart & Run All still works.
results = []      # one summary record per model, in training order
best_models = {}  # model name -> best estimator found by its grid search

for model_name, model in models.items():
    print(f"\nüîπ Training {model_name}...")

    # Exhaustive search over this model's grid, using every available core.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring="accuracy",
        cv=kf,
        n_jobs=-1,
    )
    grid_search.fit(X_train_scaled, y_train)

    # Remember the best estimator found for this model.
    best_model = grid_search.best_estimator_
    best_models[model_name] = best_model

    # Accuracy of that estimator on the test features and labels.
    y_pred = best_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"‚úÖ {model_name} Accuracy: {accuracy:.4f}")

    results.append(
        {
            "Model": model_name,
            "Accuracy": accuracy,
            "Best Parameters": grid_search.best_params_,
        }
    )



üîπ Training Logistic Regression...
‚úÖ Logistic Regression Accuracy: 0.9009

üîπ Training Random Forest...




‚úÖ Random Forest Accuracy: 0.9020

üîπ Training XGBoost...
‚úÖ XGBoost Accuracy: 0.9041


In [33]:
# Turn the per-model summary records into a table (one row per model)
# and show it as the cell output.
results_df = pd.DataFrame(data=results)
results_df


Unnamed: 0,Model,Accuracy,Best Parameters
0,Logistic Regression,0.900858,{'C': 0.01}
1,Random Forest,0.902019,"{'max_depth': 20, 'min_samples_split': 5, 'n_e..."
2,XGBoost,0.904115,"{'learning_rate': 0.1, 'max_depth': 6, 'n_esti..."


In [34]:
# Rank the models by the "Accuracy" column, highest first, and read the
# name from the top row.
# NOTE(review): "Accuracy" is filled by the training loop from predictions
# on X_test_scaled, so the winner is chosen on the same data it is reported
# on; consider whether a cross-validation score should drive this choice.
ranked_results = results_df.sort_values(by="Accuracy", ascending=False)
best_model_name = ranked_results.iloc[0]["Model"]

# Fetch the tuned estimator stored under that name.
best_model = best_models[best_model_name]

print("üèÜ Best Model Selected:", best_model_name)


üèÜ Best Model Selected: XGBoost


In [35]:
# Write the selected model, the scaler and the list of feature column names
# to the working directory as three separate pickle files.
# NOTE(review): `scaler` and `X` are not defined in the cells shown here;
# confirm they are created earlier in the notebook.
joblib.dump(value=best_model, filename="best_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=X.columns.tolist(), filename="feature_columns.pkl")

print("‚úÖ Best model and related files saved successfully")


‚úÖ Best model and related files saved successfully
