In [1]:
# Basic Import
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
# Modelling
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier



In [2]:
# Load the raw loan table from the CSV shipped alongside the notebook.
df = pd.read_csv(filepath_or_buffer='data/Loan_Data (1).csv')

In [3]:
# Keep only fully populated rows: a row is discarded as soon as any of its
# columns is missing.
df = df.dropna(axis=0, how="any")

In [4]:
# Target: the loan decision column.
y = df['Loan_Status']
# Features: everything except the target and the Loan_ID column.
X = df.drop(columns=['Loan_Status', 'Loan_ID'])

In [5]:
# Show the distinct target labels so the encoding applied in the next cell can
# be checked against them.
print(y.unique())

['N' 'Y']


In [6]:
# Encode the target as integers: 'N' -> 0, 'Y' -> 1.
# The labels are read from df['Loan_Status'] instead of from `y` itself so the
# cell can be re-run safely: mapping an already-encoded `y` a second time would
# find neither 0 nor 1 in the dictionary and turn every label into NaN.
y = df['Loan_Status'].map({'N': 0, 'Y': 1})

# Series.map yields NaN for any value missing from the dictionary; stop here
# with a clear message rather than failing later inside a model's fit().
assert y.notna().all(), "Loan_Status contains labels other than 'N' and 'Y'"

In [7]:
# Split the feature columns by dtype.
# Selecting on "number" (instead of "everything that is not object") keeps
# bool, datetime, category and string-dtype columns out of the numeric group,
# where they would otherwise be z-scored and standard-scaled. Those columns go
# to the categorical group and are one-hot encoded instead.
num_features = X.select_dtypes(include="number").columns
cat_features = X.select_dtypes(exclude="number").columns

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Z-score outlier filter: a row is kept only if every numeric feature lies
# within `threshold` standard deviations of the training mean.
threshold = 3

# Location and spread are estimated on the training split only, then reused for
# the test split so both are measured on the same scale.
train_means = X_train[num_features].mean()
# A constant column has a standard deviation of exactly 0. Dividing by it gives
# NaN z-scores, NaN fails the `<= threshold` comparison, and every row would be
# dropped. Substituting 1 gives such a column a z-score of 0 for training rows.
train_stds = X_train[num_features].std().replace(0, 1)

# --- training split -------------------------------------------------------
z_scores_train = (X_train[num_features] - train_means) / train_stds

# Despite the name, True marks a row to KEEP (no feature exceeds the threshold).
outlier_mask_train = (np.abs(z_scores_train) <= threshold).all(axis=1)

X_train_cleaned = X_train[outlier_mask_train]
y_train_cleaned = y_train[outlier_mask_train]

# --- test split -----------------------------------------------------------
# NOTE(review): test rows are filtered too, so the metrics reported further
# down describe only test rows that pass this filter, not the full hold-out
# set. Confirm that this is the intended evaluation protocol.
z_scores_test = (X_test[num_features] - train_means) / train_stds

outlier_mask_test = (np.abs(z_scores_test) <= threshold).all(axis=1)

X_test_cleaned = X_test[outlier_mask_test]
y_test_cleaned = y_test[outlier_mask_test]

In [10]:


# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones.
num_transformer = StandardScaler()
# handle_unknown="ignore": a category that was not present when the encoder was
# fitted (for example a rare level that landed only in the test split) is
# encoded as an all-zero group instead of making transform() raise.
oh_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", num_transformer, num_features),
    ]
)

In [11]:
# Learn the encoder categories and scaler statistics from the cleaned training
# rows only.
preprocessor.fit(X_train_cleaned)

# Push both partitions through the same fitted transformer (train first, then
# test).
X_train_transformed, X_test_transformed = (
    preprocessor.transform(frame) for frame in (X_train_cleaned, X_test_cleaned)
)

In [12]:
# Reduce dimensionality with PCA. A fractional n_components asks scikit-learn
# to keep as many leading components as are needed to explain that share (95%)
# of the variance.
pca = PCA(n_components=0.95)  
# The projection is learned from the training matrix only ...
X_train_pca = pca.fit_transform(X_train_transformed)
# ... and then applied unchanged to the test matrix.
X_test_pca = pca.transform(X_test_transformed)

In [13]:
# Linear-kernel SVM trained on the PCA-projected training features.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_pca, y_train_cleaned)

# Predict the test rows that passed the outlier filter and score them.
y_pred_svm = svm_model.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test_cleaned, y_pred=y_pred_svm)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_true=y_test_cleaned, y_pred=y_pred_svm))

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test_cleaned, y_pred=y_pred_svm))

Model Accuracy: 82.42%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.38      0.56        26
           1       0.80      1.00      0.89        65

    accuracy                           0.82        91
   macro avg       0.90      0.69      0.72        91
weighted avg       0.86      0.82      0.79        91


Confusion Matrix:
[[10 16]
 [ 0 65]]


In [14]:


# Logistic regression on the PCA-projected training features.
log_reg_model = LogisticRegression(random_state=42).fit(X_train_pca, y_train_cleaned)

# Predict the filtered test rows and measure accuracy.
y_pred_log_reg = log_reg_model.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test_cleaned, y_pred=y_pred_log_reg)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_true=y_test_cleaned, y_pred=y_pred_log_reg))

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test_cleaned, y_pred=y_pred_log_reg))


Model Accuracy: 82.42%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.38      0.56        26
           1       0.80      1.00      0.89        65

    accuracy                           0.82        91
   macro avg       0.90      0.69      0.72        91
weighted avg       0.86      0.82      0.79        91


Confusion Matrix:
[[10 16]
 [ 0 65]]


In [15]:


# Random forest (default hyper-parameters, fixed seed) on the PCA-projected
# training features.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_pca, y_train_cleaned)

# Predict the filtered test rows and measure accuracy.
y_pred_rf = rf_model.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test_cleaned, y_pred=y_pred_rf)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_true=y_test_cleaned, y_pred=y_pred_rf))

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test_cleaned, y_pred=y_pred_rf))


Model Accuracy: 83.52%

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.54      0.65        26
           1       0.84      0.95      0.89        65

    accuracy                           0.84        91
   macro avg       0.83      0.75      0.77        91
weighted avg       0.83      0.84      0.82        91


Confusion Matrix:
[[14 12]
 [ 3 62]]


In [16]:


# Gradient boosting (default hyper-parameters, fixed seed) on the PCA-projected
# training features.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train_pca, y_train_cleaned)

# Predict the filtered test rows and measure accuracy.
y_pred_gb = gb_model.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test_cleaned, y_pred=y_pred_gb)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_true=y_test_cleaned, y_pred=y_pred_gb))

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test_cleaned, y_pred=y_pred_gb))


Model Accuracy: 79.12%

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.50      0.58        26
           1       0.82      0.91      0.86        65

    accuracy                           0.79        91
   macro avg       0.75      0.70      0.72        91
weighted avg       0.78      0.79      0.78        91


Confusion Matrix:
[[13 13]
 [ 6 59]]


In [17]:


# K-nearest neighbours (default settings) on the PCA-projected training
# features.
knn_model = KNeighborsClassifier().fit(X_train_pca, y_train_cleaned)

# Predict the filtered test rows and measure accuracy.
y_pred_knn = knn_model.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test_cleaned, y_pred=y_pred_knn)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_true=y_test_cleaned, y_pred=y_pred_knn))

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test_cleaned, y_pred=y_pred_knn))


Model Accuracy: 82.42%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.38      0.56        26
           1       0.80      1.00      0.89        65

    accuracy                           0.82        91
   macro avg       0.90      0.69      0.72        91
weighted avg       0.86      0.82      0.79        91


Confusion Matrix:
[[10 16]
 [ 0 65]]
