In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/loan-approval-prediction-dataset/loan_approval_dataset.csv


# **Import Libraries**

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE

In [3]:
import sklearn
# Print the installed scikit-learn version. The recorded output of this cell is
# 1.2.2 (not 1.4.2); estimators persisted with joblib are only expected to load
# reliably under the same scikit-learn version that produced them.
print(sklearn.__version__)

1.2.2


# **Load Dataset**

In [5]:
# Read the loan-approval CSV from the Kaggle input mount into a DataFrame
# (point the path at your own copy of the file when running elsewhere)
df = pd.read_csv(
    '/kaggle/input/loan-approval-prediction-dataset/loan_approval_dataset.csv'
)

In [6]:
# First rows, to eyeball the columns and raw values
print(df.head())

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Number of missing values per column
print(df.isnull().sum())

   loan_id   no_of_dependents      education  self_employed   income_annum  \
0        1                  2       Graduate             No        9600000   
1        2                  0   Not Graduate            Yes        4100000   
2        3                  3       Graduate             No        9100000   
3        4                  3       Graduate             No        8200000   
4        5                  5   Not Graduate            Yes        9800000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   
2      29700000          20           506                    7100000   
3      30700000           8           467                   18200000   
4      24200000          20           382                   12400000   

    commercial_assets_value   luxury_assets_value   bank_asset_value  \
0                  1760000

# **Data Preprocessing**

In [8]:
# Show each column name wrapped in quotes so stray leading/trailing
# whitespace in the headers becomes visible
for column_name in df.columns:
    print("'{}'".format(column_name))

' no_of_dependents'
' education'
' self_employed'
' income_annum'
' loan_amount'
' loan_term'
' cibil_score'
' residential_assets_value'
' commercial_assets_value'
' luxury_assets_value'
' bank_asset_value'
' loan_status'


In [9]:
# Column names as a plain Python list (before any header cleanup)
print(list(df.columns))

[' no_of_dependents', ' education', ' self_employed', ' income_annum', ' loan_amount', ' loan_term', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value', ' loan_status']


In [10]:
# The CSV headers carry a leading space (e.g. ' loan_status'); normalise them
# so columns can be addressed by their plain names.
df.columns = df.columns.str.strip()

# `loan_id` is present in the df.head() output above but absent from the
# recorded column listings, so it was removed by a cell that is no longer in
# the notebook. Drop it explicitly so a fresh "Restart & Run All" does not feed
# the identifier to the model as a feature. errors='ignore' keeps the cell
# safe to re-run once the column is already gone.
# NOTE(review): assumes loan_id is a pure row identifier - confirm.
df = df.drop(columns=['loan_id'], errors='ignore')

In [11]:
# Column names as a plain Python list, to confirm the header cleanup
print(list(df.columns))

['no_of_dependents', 'education', 'self_employed', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value', 'loan_status']


In [12]:
# Distinct raw labels found in the target column
raw_status_values = df['loan_status'].unique()
print(raw_status_values)

[' Approved' ' Rejected']


In [13]:
# Encode the target as integers: Approved -> 1, Rejected -> 0
status_codes = {'Approved': 1, 'Rejected': 0}
status = df['loan_status']

# Only encode while the column still holds text, so re-running this cell is a
# no-op instead of failing on the `.str` accessor of an already-numeric column.
if not pd.api.types.is_numeric_dtype(status):
    # Raw labels carry a leading space (' Approved'); strip before mapping
    status = status.str.strip()

    # Series.map() silently turns any label missing from the mapping into NaN;
    # fail loudly instead so a corrupted target never reaches the model.
    unexpected = sorted(set(status.dropna()) - set(status_codes))
    if unexpected:
        raise ValueError(f"Unexpected loan_status values: {unexpected}")

    df['loan_status'] = status.map(status_codes)

In [14]:
# Distinct values in the target column after encoding
encoded_status_values = df['loan_status'].unique()
print(encoded_status_values)

[1 0]


In [15]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)


no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64


# **Encode Categorical Variables**

**We need to convert `education` and `self_employed` into numeric form using LabelEncoder:**

In [16]:
from sklearn.preprocessing import LabelEncoder

# Fit a separate LabelEncoder per categorical column and keep each one, so
# every mapping can be inspected (`classes_`) or reversed (`inverse_transform`)
# later. Re-fitting a single shared encoder would discard the education mapping.
label_encoders = {}
for column in ('education', 'self_employed'):
    values = df[column]

    # Raw values carry a leading space (e.g. ' Graduate'); strip it so the
    # learned classes are the clean labels. Skipped when the column is already
    # numeric, which keeps the cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(values):
        values = values.str.strip()

    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(values)
    label_encoders[column] = encoder

# LabelEncoder assigns codes in sorted label order, so with the values shown in
# df.head(): education Graduate -> 0, Not Graduate -> 1; self_employed No -> 0,
# Yes -> 1.

# Backward compatibility: `le` previously ended up fitted on `self_employed`.
le = label_encoders['self_employed']


# **Define Features & Target**

In [17]:
# Target: the loan_status column; features: every remaining column
y = df['loan_status']
X = df.drop(columns=['loan_status'])

# **Split the Data**

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


# **Feature Scaling**

In [22]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# NOTE(review): the classifier trained later in this notebook is fit on
# X_train / X_test, not on these scaled arrays - confirm whether the scaled
# copies are still needed.

# **Model Training and Evaluation**

# **Train the Model**

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest with a fixed seed and fit it on the training split
# (the unscaled feature frame X_train)
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)


# **Evaluate the Model**

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the held-out split with the trained model
y_pred = model.predict(X_test)

# Compute every metric first, then report them together
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)


Accuracy: 0.977751756440281
Confusion Matrix:
 [[306  12]
 [  7 529]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       318
           1       0.98      0.99      0.98       536

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.98       854
weighted avg       0.98      0.98      0.98       854



In [25]:
import joblib

# **Save the model:**

In [26]:
# Persist the fitted classifier to the working directory
joblib.dump(value=model, filename='loan_approval_model.pkl')

['loan_approval_model.pkl']

In [27]:
# Persist the fitted StandardScaler alongside the model
# NOTE(review): `model` was fit on the unscaled X_train, so inputs should not be
# passed through this scaler before calling the saved model - confirm whether
# this artifact is needed at all.
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']

# **Load the Model Later:**

In [29]:
# Restore the persisted classifier from disk
loaded_model = joblib.load(filename='loan_approval_model.pkl')

# Score the held-out features with the restored model
predictions = loaded_model.predict(X_test)
