In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import warnings
import joblib

warnings.filterwarnings('ignore')

url = 'https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Loan%20Application%20Status/loan_prediction.csv?raw=true'
data = pd.read_csv(url)

print("First few rows of the dataset:")
print(data.head())


print("\nBasic information about the dataset:")
print(data.info())
print("\nSummary statistics of the dataset:")
print(data.describe())

missing_values = data.isnull().sum()
print("\nMissing values in the dataset:")
print(missing_values[missing_values > 0])

plt.figure(figsize=(8, 6))
sns.countplot(data['Loan_Status'])
plt.title('Distribution of Loan Status')
plt.show()

plt.figure(figsize=(10, 6))
sns.histplot(data['ApplicantIncome'], bins=30, kde=True)
plt.title('Distribution of Applicant Income')
plt.show()

plt.figure(figsize=(14, 10))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

data['Gender'].fillna(data['Gender'].mode()[0], inplace=True)
data['Married'].fillna(data['Married'].mode()[0], inplace=True)
data['Dependents'].fillna(data['Dependents'].mode()[0], inplace=True)
data['Self_Employed'].fillna(data['Self_Employed'].mode()[0], inplace=True)
data['LoanAmount'].fillna(data['LoanAmount'].median(), inplace=True)
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0], inplace=True)
data['Credit_History'].fillna(data['Credit_History'].mode()[0], inplace=True)

label_encoder = LabelEncoder()
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

data.drop(['Loan_ID'], axis=1, inplace=True)

X = data.drop(['Loan_Status'], axis=1)
y = data['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'RandomForestClassifier': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{name} - Accuracy Score:', accuracy_score(y_test, y_pred))
    print(f'{name} - Classification Report:\n', classification_report(y_test, y_pred))
    print(f'{name} - Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)
print('Best Model - Accuracy Score:', accuracy_score(y_test, y_pred))
print('Best Model - Classification Report:\n', classification_report(y_test, y_pred))
print('Best Model - Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

joblib.dump(best_model, 'best_model_loan_status.pkl')

print("Model saved successfully.")


First few rows of the dataset:
   LP001002  Male   No  0      Graduate No.1  5849     0.1  Unnamed: 8    360  \
0  LP001003  Male  Yes  1      Graduate   No  4583  1508.0       128.0  360.0   
1  LP001005  Male  Yes  0      Graduate  Yes  3000     0.0        66.0  360.0   
2  LP001006  Male  Yes  0  Not Graduate   No  2583  2358.0       120.0  360.0   
3  LP001008  Male   No  0      Graduate   No  6000     0.0       141.0  360.0   
4  LP001011  Male  Yes  2      Graduate  Yes  5417  4196.0       267.0  360.0   

     1  Urban  Y  
0  1.0  Rural  N  
1  1.0  Urban  Y  
2  1.0  Urban  Y  
3  1.0  Urban  Y  
4  1.0  Urban  Y  

Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   LP001002    613 non-null    object 
 1   Male        600 non-null    object 
 2   No          610 non-null    object 
 3   0           598

KeyError: 'Loan_Status'

<Figure size 800x600 with 0 Axes>