In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [2]:
#loading
data_path="../data/processed/processed_loan_data.csv"
df=pd.read_csv(data_path)
print(df.shape)

(45000, 16)


In [None]:
#spliting
X=df.drop(columns=["loan_status"])
y=df["loan_status"]
x_train,x_test,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=42,stratify=y)


In [None]:
#defining parameter grids for hyperparameter tuning
param_grids = { 
    "Logistic Regression": {
         "C": [0.01, 0.1, 1, 10],
           "solver": ["liblinear", "lbfgs"] 
           },
    "Random Forest": { 
        "n_estimators": [100, 200, 300],
         "max_depth": [None, 10, 20],
         "min_samples_split": [2, 5, 10]
         },
         "Gradient Boosting": {
              "n_estimators": [100, 200],
             "learning_rate": [0.01, 0.1, 0.2],
             "max_depth": [3, 5, 7] 
             },
         "XGBoost": { 
             "n_estimators": [100, 200], 
             "learning_rate": [0.01, 0.1, 0.2],
               "max_depth": [3, 5, 7] 
               }
                 }

In [None]:
#defining base models
base_models = { 
    "Logistic Regression": LogisticRegression(max_iter=1000),
     "Random Forest": RandomForestClassifier(random_state=42),
     "Gradient Boosting": GradientBoostingClassifier(random_state=42), 
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
     }

In [6]:
optimized_results={}
best_models={}


In [7]:
#hyperparameter tuning and model optimization   
for name, model in base_models.items(): 
    print(f"\n Optimizing {name}...") 
    grid = GridSearchCV(model, param_grids[name], cv=3, scoring='f1', n_jobs=-1) 
    grid.fit(x_train, y_train) 
    best_model = grid.best_estimator_ 
    best_models[name] = best_model 
    y_pred = best_model.predict(x_test) 
    optimized_results[name] = { "Best Params": grid.best_params_, "Accuracy": accuracy_score(y_test, y_pred), "Precision": precision_score(y_test, y_pred), "Recall": recall_score(y_test, y_pred), "F1 Score": f1_score(y_test, y_pred) }




 Optimizing Logistic Regression...

 Optimizing Random Forest...

 Optimizing Gradient Boosting...

 Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
for name, model in best_models.items(): 
    filename = f"../models/optimized_{name.replace(' ', '_').lower()}_model.pkl" 
    joblib.dump(model, filename)

In [9]:
print(pd.DataFrame(optimized_results))

                          Logistic Regression  \
Best Params  {'C': 10, 'solver': 'liblinear'}   
Accuracy                             0.896556   
Precision                              0.7768   
Recall                                   0.75   
F1 Score                             0.763165   

                                                 Random Forest  \
Best Params  {'max_depth': None, 'min_samples_split': 2, 'n...   
Accuracy                                              0.929556   
Precision                                             0.899415   
Recall                                                   0.769   
F1 Score                                              0.829111   

                                             Gradient Boosting  \
Best Params  {'learning_rate': 0.2, 'max_depth': 5, 'n_esti...   
Accuracy                                              0.939333   
Precision                                             0.904338   
Recall                                      