In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [4]:
loan_data = pd.read_csv('train.csv')
loan_data.head(10)
#displaying first 10 rows


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [5]:
# Checking NaN values
print(loan_data.isna().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [7]:
# Fill NaN values with appropriate values
loan_data.fillna({
    'Gender': loan_data['Gender'].mode()[0],
    'Married': loan_data['Married'].mode()[0],
    'Dependents': loan_data['Dependents'].mode()[0],
    'Self_Employed': loan_data['Self_Employed'].mode()[0],
    'Credit_History': loan_data['Credit_History'].mode()[0],
    'LoanAmount': loan_data['LoanAmount'].median(),
    'Loan_Amount_Term': loan_data['Loan_Amount_Term'].mode()[0]
}, inplace=True)

# Checking any missing values after filling
print(loan_data.isna().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [8]:
# Feature engineering
loan_data['TotalIncome'] = loan_data['ApplicantIncome'] + loan_data['CoapplicantIncome']
loan_data['LoanAmount_to_Income_Ratio'] = loan_data['LoanAmount'] / loan_data['TotalIncome']

In [9]:
# Selecting features and target
features = ['TotalIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'LoanAmount_to_Income_Ratio']
X = loan_data[features]
y = LabelEncoder().fit_transform(loan_data['Loan_Status'])

In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:

# Hyperparameter tuning using GridSearchCV
param_grid = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']}
grid_search = GridSearchCV(LogisticRegression(max_iter=200), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)



In [12]:
# Evaluating the tuned model
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

In [13]:
# Model evaluation
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_classification_rep = classification_report(y_test, y_pred_tuned)
tuned_confusion_mat = confusion_matrix(y_test, y_pred_tuned)

In [14]:
# Displaying results
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", tuned_accuracy)
print("Classification Report:\n", tuned_classification_rep)
print("Confusion Matrix:\n", tuned_confusion_mat)

Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.7804878048780488
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.42      0.57        43
           1       0.76      0.97      0.85        80

    accuracy                           0.78       123
   macro avg       0.83      0.70      0.71       123
weighted avg       0.81      0.78      0.75       123

Confusion Matrix:
 [[18 25]
 [ 2 78]]



## Conclusion Report:

The purpose of this project was to build a predictive model to determine whether a loan would be approved based on various applicant features. We used Logistic Regression as the machine learning model and applied hyperparameter tuning to improve its performance.

The data preprocessing included handling missing values and feature engineering to derive important features such as 'TotalIncome' and 'LoanAmount_to_Income_Ratio'. These features were instrumental in improving the model's prediction accuracy.

The Logistic Regression model, after hyperparameter tuning, achieved an accuracy of 78.05%. While the model performed well in predicting loan approvals, it faced challenges in correctly identifying loan rejections, as indicated by the lower recall for Class 0 (Loan Not Approved). The best parameters found through hyperparameter tuning were: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}.

Overall, the model provided valuable insights into the factors influencing loan approval. However, there is still room for improvement. 
