In [1]:
import os

import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [18]:

# Read the loan-approval dataset from the CSV file into a dataframe
csv_path = 'new_loan_approval_data.csv'
df = pd.read_csv(csv_path)
# Print the column labels to confirm which fields were loaded
column_labels = df.columns
print(column_labels)


Index(['dependents', 'education', 'employment', 'annual_income', 'loan_amount',
       'loan_term', 'cibil_score', 'loan_status'],
      dtype='object')


In [3]:
# Count the missing entries in every column and print the per-column totals
missing_per_column = df.isnull().sum()
print(missing_per_column)
# Preview the first rows of the dataframe (rendered as the cell output)
df.head()

dependents       0
 education       0
employment       0
annual_income    0
 loan_amount     0
 loan_term       0
 cibil_score     0
 loan_status     0
dtype: int64


Unnamed: 0,dependents,education,employment,annual_income,loan_amount,loan_term,cibil_score,loan_status
0,2,Graduate,No,9600000,29900000,12,778,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,Rejected
2,3,Graduate,No,9100000,29700000,20,506,Rejected
3,3,Graduate,No,8200000,30700000,8,467,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,Rejected


In [19]:
# Trim leading/trailing whitespace from the values of each text column
for text_column in ('education', 'employment', 'loan_status'):
    df[text_column] = df[text_column].str.strip()


In [20]:
# Look up the education value at row label 1 with a single label-based access
df.loc[1, 'education']

'Not Graduate'

In [21]:
# Initialize LabelEncoder
le = LabelEncoder()

# Encode categorical columns
df['education'] = le.fit_transform(df['education'])
df['employment'] = le.fit_transform(df['employment'])
df['loan_status'] = le.fit_transform(df['loan_status'])

# Check the transformed data
print(df.head())


   dependents  education  employment  annual_income  loan_amount  loan_term  \
0           2          0           0        9600000     29900000         12   
1           0          1           1        4100000     12200000          8   
2           3          0           0        9100000     29700000         20   
3           3          0           0        8200000     30700000          8   
4           5          1           1        9800000     24200000         20   

   cibil_score  loan_status  
0          778            0  
1          417            1  
2          506            1  
3          467            1  
4          382            1  


In [22]:
# Predictor columns used as model inputs; loan_status is the prediction target
feature_columns = [
    'dependents',
    'education',
    'employment',
    'annual_income',
    'loan_amount',
    'loan_term',
    'cibil_score',
]
X = df[feature_columns]
y = df['loan_status']

In [23]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


In [24]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Train a logistic-regression classifier (default settings) on the scaled
# training features; fit() returns the estimator itself
model = LogisticRegression().fit(X_train_scaled, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed
model

In [26]:
# Predict the encoded loan_status label for every row of the scaled test set
y_pred = model.predict(X_test_scaled)

In [27]:
# Overall share of test rows whose predicted label matches the true label,
# printed to two decimal places
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall, f1-score and support
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1843
           1       0.89      0.89      0.89      1157

    accuracy                           0.91      3000
   macro avg       0.91      0.91      0.91      3000
weighted avg       0.91      0.91      0.91      3000



In [28]:
# Persist the fitted preprocessing objects and the trained model to disk.
MODEL_DIR = '../models'
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (no error if it is already there).
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the encoder
# NOTE: `le` was last fitted on loan_status, so this artifact carries only the
# target-label mapping.
joblib.dump(le, f'{MODEL_DIR}/label_encoder.joblib')
# Save the scaler
joblib.dump(scaler, f'{MODEL_DIR}/scaler.joblib')
# Save the model (last expression: the list of written paths is displayed)
joblib.dump(model, f'{MODEL_DIR}/logistic_regression_model.joblib')

['../models/logistic_regression_model.joblib']