Bank Loan Prediciton 

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Load datasets
data_train = pd.read_csv('loan-train.csv')
data_test = pd.read_csv('loan-test.csv')

# Display dataset information
print("Training Data Info:")
print(data_train.info())
print("Test Data Info:")
print(data_test.info())



Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366

In [35]:
# Handle missing values in training and test sets
data_train.fillna(data_train.mean(numeric_only=True), inplace=True)
data_train.fillna(data_train.mode().iloc[0], inplace=True)
data_test.fillna(data_test.mean(numeric_only=True), inplace=True)
data_test.fillna(data_test.mode().iloc[0], inplace=True)

In [36]:
# Encode categorical variables
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
encoder = LabelEncoder()
for col in categorical_columns:
    if col in data_train.columns:
        data_train[col] = encoder.fit_transform(data_train[col])
    if col in data_test.columns:
        data_test[col] = encoder.transform(data_test[col])


In [37]:
# Encode target variable in training data
data_train['Loan_Status'] = encoder.fit_transform(data_train['Loan_Status'])

# Split datasets into features and target
X = data_train.drop(['Loan_Status', 'Loan_ID'], axis=1)
y = data_train['Loan_Status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = data_test.drop(['Loan_ID'], axis=1)



In [38]:
# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


In [39]:
# Train and evaluate Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_val, rf_pred))




Random Forest Accuracy: 0.7723577235772358
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.42      0.56        43
           1       0.75      0.96      0.85        80

    accuracy                           0.77       123
   macro avg       0.81      0.69      0.70       123
weighted avg       0.79      0.77      0.75       123



In [40]:
print("Random Forest Predictions on Test Data:")
print(rf_pred)


Random Forest Predictions on Test Data:
[1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1
 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1
 0 1 1 1 1 1 1 1 0 1 1 1]


In [41]:
# Train and evaluate SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_val)
svm_accuracy = accuracy_score(y_val, svm_pred)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:")
print(classification_report(y_val, svm_pred))


SVM Accuracy: 0.7886178861788617
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123



In [42]:
print("SVM Predictions on Test Data:")
print(svm_pred)



SVM Predictions on Test Data:
[1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1
 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1]


In [43]:
# Save the Random Forest model
import joblib
joblib.dump(rf_model, 'loan_prediction_rf_model.pkl')

# Save the SVM model
joblib.dump(svm_model, 'loan_prediction_svm_model.pkl')

['loan_prediction_svm_model.pkl']