Loading the Loan Application Data

In [2]:
import pandas as pd

# Read the loan records from the named worksheet of the Excel workbook
loan_workbook = 'data_Loan.xlsx'
loan_sheet = 'Sheet1'
df = pd.read_excel(loan_workbook, sheet_name=loan_sheet)



In [None]:
# Preview the first rows to sanity-check the loaded columns and values
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Graduate,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,2284,1,0,0,0,0,3902,1666.0,109,333,1,3,Y
1,2287,2,0,0,1,0,1500,1800.0,103,333,0,2,N
2,2288,1,1,2,0,0,2889,0.0,45,180,0,1,N
3,2296,1,0,0,0,0,2755,0.0,65,300,1,3,N
4,2297,1,0,0,1,0,2500,20000.0,103,333,1,2,Y


In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            247 non-null    int64  
 1   Gender             247 non-null    int64  
 2   Married            247 non-null    int64  
 3   Dependents         247 non-null    int64  
 4   Graduate           247 non-null    int64  
 5   Self_Employed      247 non-null    int64  
 6   ApplicantIncome    247 non-null    int64  
 7   CoapplicantIncome  247 non-null    float64
 8   LoanAmount         247 non-null    int64  
 9   Loan_Amount_Term   247 non-null    int64  
 10  Credit_History     247 non-null    int64  
 11  Property_Area      247 non-null    int64  
 12  Loan_Status        247 non-null    object 
dtypes: float64(1), int64(11), object(1)
memory usage: 25.2+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Graduate,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
count,247.0,247.0,247.0,247.0,247.0,247.0,247.0,247.0,247.0,247.0,247.0,247.0
mean,2544.161943,1.198381,0.643725,0.748988,0.744939,0.1417,5403.688259,1776.918704,152.62753,320.040486,0.753036,2.064777
std,302.300553,0.39959,0.47987,0.988629,0.43678,0.349451,6485.890426,3993.412132,89.516037,60.030399,0.432121,0.783373
min,1900.0,1.0,0.0,0.0,0.0,0.0,210.0,0.0,9.0,12.0,0.0,1.0
25%,2369.5,1.0,0.0,0.0,0.0,0.0,2752.5,0.0,100.0,333.0,1.0,1.0
50%,2560.0,1.0,1.0,0.0,1.0,0.0,3691.0,1250.0,130.0,333.0,1.0,2.0
75%,2784.5,1.0,1.0,1.0,1.0,0.0,5822.0,2241.0,176.5,333.0,1.0,3.0
max,2990.0,2.0,1.0,3.0,1.0,1.0,81000.0,41667.0,600.0,480.0,1.0,3.0


In [5]:

# Encode the Loan_Status column: 'Y' -> 1, 'N' -> 0.
# The mapping also sends 1 -> 1 and 0 -> 0 so that re-running this cell on an
# already-encoded column is a no-op. Series.map returns NaN for every value that
# is missing from the mapping, so a plain {'Y': 1, 'N': 0} map would wipe out the
# target on a second run.
loan_status_codes = {'Y': 1, 'N': 0, 1: 1, 0: 0}
loan_status_encoded = df['Loan_Status'].map(loan_status_codes)

# Fail loudly on unexpected labels instead of passing NaN targets on to the model
unmapped_rows = loan_status_encoded.isna()
if unmapped_rows.any():
    unexpected_labels = df.loc[unmapped_rows, 'Loan_Status'].unique().tolist()
    raise ValueError(f"Unexpected Loan_Status values: {unexpected_labels}")

df['Loan_Status'] = loan_status_encoded.astype('int64')


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score

Feature Selection

In [10]:
# Split the frame into predictors (X) and target (y).
# Loan_ID is dropped together with the target so it is not used as a feature.
X = df.drop(columns=['Loan_ID', 'Loan_Status'])
y = df['Loan_Status']

# Columns routed to the categorical branch of the preprocessor
categorical_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Graduate',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]

# Columns routed to the numerical branch of the preprocessor
numerical_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

In [12]:
# Preprocessing for numerical data.
# `numerical_transformer` is referenced by the ColumnTransformer below but was not
# defined anywhere in the notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): the original definition is not visible; median imputation followed by
# standardization is assumed here -- confirm against the intended preprocessing.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values with the column median
    ('scaler', StandardScaler())  # Standardize features by removing the mean and scaling to unit variance
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode; categories unseen during fit are ignored
])

# Combine preprocessing steps: each transformer is applied only to its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [13]:
# LightGBM classifier with a fixed seed; verbose=-1 is passed to keep its logging quiet
lgbm_classifier = LGBMClassifier(random_state=42, verbose=-1)

# Chain preprocessing and the classifier so both are fitted together.
# The step names matter: the grid-search parameters use the 'classifier__' prefix.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgbm_classifier),
])

Train and Test

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
holdout_fraction = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=holdout_fraction, random_state=split_seed
)


In [15]:
# Fit the full pipeline on the training split, then predict labels for the
# held-out rows (fit returns the fitted pipeline, so the calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_test)



In [18]:
# Score the baseline predictions against the held-out labels
baseline_report = classification_report(y_test, y_pred)
baseline_accuracy = accuracy_score(y_test, y_pred)

# Per-class precision/recall/F1 first, then overall accuracy
print("Classification Report:", baseline_report, sep="\n")
print("Accuracy:", baseline_accuracy)

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.69      0.62        13
           1       0.88      0.81      0.85        37

    accuracy                           0.78        50
   macro avg       0.72      0.75      0.73        50
weighted avg       0.80      0.78      0.79        50

Accuracy: 0.78


Hyperparameter Tuning

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate LightGBM settings for the grid search.
# The 'classifier__' prefix routes each value to the pipeline's 'classifier' step.
param_grid = {
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__num_leaves': [31, 50, 100],
}

# Exhaustive search over param_grid, scored by accuracy with 5-fold cross-validation
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)

# Run the search on the training split only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and report its parameter combination
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Score the tuned pipeline on the same held-out rows used for the baseline
y_pred_best = best_model.predict(X_test)
tuned_report = classification_report(y_test, y_pred_best)
tuned_accuracy = accuracy_score(y_test, y_pred_best)

print("Best Model Classification Report:", tuned_report, sep="\n")
print("Best Model Accuracy:", tuned_accuracy)



Best Parameters: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100, 'classifier__num_leaves': 31}
Best Model Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77        13
           1       0.92      0.92      0.92        37

    accuracy                           0.88        50
   macro avg       0.84      0.84      0.84        50
weighted avg       0.88      0.88      0.88        50

Best Model Accuracy: 0.88


