<a href="https://colab.research.google.com/github/Olanle/Project-005-Loan-Approval-Prediction-with-Hyperparameter-Tuning/blob/main/005.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

df = pd.read_csv("/content/Loan Prediction.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
#Check for missing values
print(df.isnull().sum())

#Check unique values in categorical columns (optional)
for col in df.select_dtypes(include='object').columns:
    print(f"{col}: {df[col].unique()}")

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
Loan_ID: ['LP001002' 'LP001003' 'LP001005' 'LP001006' 'LP001008' 'LP001011'
 'LP001013' 'LP001014' 'LP001018' 'LP001020' 'LP001024' 'LP001027'
 'LP001028' 'LP001029' 'LP001030' 'LP001032' 'LP001034' 'LP001036'
 'LP001038' 'LP001041' 'LP001043' 'LP001046' 'LP001047' 'LP001050'
 'LP001052' 'LP001066' 'LP001068' 'LP001073' 'LP001086' 'LP001087'
 'LP001091' 'LP001095' 'LP001097' 'LP001098' 'LP001100' 'LP001106'
 'LP001109' 'LP001112' 'LP001114' 'LP001116' 'LP001119' 'LP001120'
 'LP001123' 'LP001131' 'LP001136' 'LP001137' 'LP001138' 'LP001144'
 'LP001146' 'LP001151' 'LP001155' 'LP001157' 'LP001164' 'LP001179'
 'LP001186' 'LP001194' 'LP001195' 'LP001197' 'LP001198' 'LP0011

In [6]:
#Define Features and Split Data

from sklearn.model_selection import train_test_split

# Define target variable (Loan_Status)
# Convert Y/N ‚Üí 1/0 if necessary
if df['Loan_Status'].dtype == object:
    df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

# Define features (X) and target (y)
target = 'Loan_Status'
features = [col for col in df.columns if col not in ['Loan_ID', target]]

X = df[features]
y = df[target]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Target distribution in train:\n", y_train.value_counts(normalize=True))


Train shape: (491, 11)
Test shape: (123, 11)
Target distribution in train:
 Loan_Status
1    0.686354
0    0.313646
Name: proportion, dtype: float64


In [7]:
#Preprocessing

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

#Identify column types
num_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
cat_features = ['Gender', 'Married', 'Dependents', 'Education',
                'Self_Employed', 'Credit_History', 'Property_Area']

#Numeric transformer: handle missing values + scale
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

#Categorical transformer: handle missing + encode categories
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

#Combine numeric and categorical transformers
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

#(Optional) test preprocessing on sample data
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

print("Transformed training shape:", X_train_prep.shape)
print("Transformed test shape:", X_test_prep.shape)


Transformed training shape: (491, 21)
Transformed test shape: (123, 21)


In [8]:
#STEP 4 ‚Äî Model Setup & Pipeline Integration

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

#Logistic Regression Pipeline
logreg_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])

#Random Forest Pipeline
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

#XGBoost Pipeline
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])


In [9]:
#Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV

#Define parameter grids for each model
param_grids = {
    'logreg': {
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['liblinear', 'lbfgs']
    },
    'rf': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'xgb': {
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
}

#Combine models with their names
models = {
    'logreg': logreg_pipe,
    'rf': rf_pipe,
    'xgb': xgb_pipe
}

#Dictionary to store best models
best_models = {}

for name, model in models.items():
    print(f"\nüîç Tuning {name.upper()} model...")
    grid_search = GridSearchCV(
        model,
        param_grids[name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    print(f"‚úÖ Best parameters for {name}: {grid_search.best_params_}")
    print(f"üìà Best CV Accuracy: {grid_search.best_score_:.4f}")
    best_models[name] = grid_search.best_estimator_



üîç Tuning LOGREG model...
‚úÖ Best parameters for logreg: {'classifier__C': 0.1, 'classifier__solver': 'liblinear'}
üìà Best CV Accuracy: 0.7984

üîç Tuning RF model...
‚úÖ Best parameters for rf: {'classifier__max_depth': 5, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
üìà Best CV Accuracy: 0.8004

üîç Tuning XGB model...
‚úÖ Best parameters for xgb: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
üìà Best CV Accuracy: 0.7984


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
#Model Evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

#Evaluate all best models
for name, model in best_models.items():
    print(f"\nüìä Evaluating {name.upper()} Model")

    y_pred = model.predict(X_test)

    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y_test, y_pred):.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



üìä Evaluating LOGREG Model
Accuracy:  0.8618
Precision: 0.8400
Recall:    0.9882
F1 Score:  0.9081

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.58      0.72        38
           1       0.84      0.99      0.91        85

    accuracy                           0.86       123
   macro avg       0.90      0.78      0.81       123
weighted avg       0.88      0.86      0.85       123

Confusion Matrix:
 [[22 16]
 [ 1 84]]

üìä Evaluating RF Model
Accuracy:  0.8537
Precision: 0.8317
Recall:    0.9882
F1 Score:  0.9032

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.55      0.70        38
           1       0.83      0.99      0.90        85

    accuracy                           0.85       123
   macro avg       0.89      0.77      0.80       123
weighted avg       0.87      0.85      0.84       123

Confusion Matrix:
 [[21 17]
 [ 1 84]]

üìä Evaluating XGB

In [12]:
# Save & Load Models

import joblib

# Assume the best model is XGBoost
final_model = best_models['logreg']


joblib.dump(final_model, 'loan_approval_model_joblib.pkl')
print("‚úÖ Model saved with Joblib")


‚úÖ Model saved with Joblib


In [17]:
loaded_model_joblib = joblib.load('loan_approval_model_joblib.pkl')
print("üîÅ Model loaded successfully with Joblib")

# Test one prediction
sample_pred = loaded_model_joblib.predict(X_test.iloc[:15])
print("üîç Sample Prediction:", sample_pred)

üîÅ Model loaded successfully with Joblib
üîç Sample Prediction: [0 1 1 1 1 1 1 1 1 1 1 1 1 1 0]
