In [7]:
# ==========================================
# Loan Default Prediction using KNN
# ==========================================

import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ------------------------------------------
# 1. Create Dataset
# ------------------------------------------
data = {
    "Age": [28,45,35,50,30,42,26,48,38,55],
    "Annual_Income": [6.5,12,8,15,7,10,5.5,14,9,16],
    "Credit_Score": [720,680,750,640,710,660,730,650,700,620],
    "Loan_Amount": [5,10,6,12,5,9,4,11,7,13],
    "Loan_Term": [5,10,7,15,5,10,4,12,8,15],
    "Employment_Type": [
        "Salaried","Self-Employed","Salaried","Self-Employed",
        "Salaried","Salaried","Salaried","Self-Employed",
        "Salaried","Self-Employed"
    ],
    "Default": [0,1,0,1,0,1,0,1,0,1]
}

df = pd.DataFrame(data)

# ------------------------------------------
# 2. Encode Categorical Feature
# ------------------------------------------
le = LabelEncoder()
df["Employment_Type"] = le.fit_transform(df["Employment_Type"])
# Salaried = 0, Self-Employed = 1

# ------------------------------------------
# 3. Feature / Target Split
# ------------------------------------------
X = df.drop("Default", axis=1)
y = df["Default"]

# ------------------------------------------
# 4. Feature Scaling (Important for KNN)
# ------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ------------------------------------------
# 5. Train-Test Split (Class Imbalance Aware)
# ------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

# ------------------------------------------
# 6. Train KNN Model
# ------------------------------------------
knn = KNeighborsClassifier(n_neighbors=3, weights="distance")
knn.fit(X_train, y_train)

# ------------------------------------------
# 7. Identify High-Risk Customers
# ------------------------------------------
df["Predicted_Default"] = knn.predict(X_scaled)
df["Default_Probability"] = knn.predict_proba(X_scaled)[:, 1]

high_risk = df[df["Predicted_Default"] == 1]

print("\nHigh-Risk Customers:")
print(high_risk)

# ------------------------------------------
# 8. Pattern Analysis
# ------------------------------------------
print("Patterns leading to loan default:")
print("- Lower credit score (<680)")
print("- Higher loan amounts (>9 lakhs)")
print("- Longer loan terms (>10 years)")
print("- Self Employed customers show higher default frequency\n")

# ------------------------------------------
# 9. Credit Score & Income Influence
# ------------------------------------------
print("Influence of credit score and income:")
print("- Credit score is a strong negative indicator: lower scores increase default risk.")
print("- Higher income helps, but does not fully offset risk if loan amount and terms are high.\n")

# ------------------------------------------
# 10. Banking Policy Recommendation
# ------------------------------------------
print("Suggested bank policies:")
print("- Set stricter credit score threshholds for long term or high value loans.")
print("Require additional guarantees for Self-Employed applicants")
print("Cap loan amount on income-to-loan ratio.\n")

# ------------------------------------------
# 11. Compare KNN with Decision Tree
# ------------------------------------------
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(X_train, y_train)

knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)

print("\nKNN Performance:")
print("Accuracy:", accuracy_score(y_test, knn_pred))
print(confusion_matrix(y_test, knn_pred))
print(classification_report(y_test, knn_pred))

print("\nDecision Tree Performance:")
print("Accuracy:", accuracy_score(y_test, dt_pred))
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

# --------------------------------------------
# 12. Loan Amount Dominates Distance Effect
# --------------------------------------------
print("Effect if loan amount dominates distance:")
print("- Model will focus mostly on loan size, ignoring credit score and income.")
print("This leads to biased predictions and reduced accuracy\n")

# ---------------------------------------------------
# 13. KNN in real time loan approval?
# ---------------------------------------------------
print("Should KNN be used in real time approval?")
print("- Not ideal due to slow prediction time on large data sets.")
print("Better alternatives: Decision Trees, Random Forests, Gradient Boosting.")




High-Risk Customers:
   Age  Annual_Income  Credit_Score  Loan_Amount  Loan_Term  Employment_Type  \
1   45           12.0           680           10         10                1   
3   50           15.0           640           12         15                1   
5   42           10.0           660            9         10                0   
7   48           14.0           650           11         12                1   
9   55           16.0           620           13         15                1   

   Default  Predicted_Default  Default_Probability  
1        1                  1                  1.0  
3        1                  1                  1.0  
5        1                  1                  1.0  
7        1                  1                  1.0  
9        1                  1                  1.0  
Patterns leading to loan default:
- Lower credit score (<680)
- Higher loan amounts (>9 lakhs)
- Longer loan terms (>10 years)
- Self Employed customers show higher default freque

In [None]:
# ==========================================
# Loan Default Prediction using KNN
# ==========================================

import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ------------------------------------------
# 1. Create Dataset
# ------------------------------------------
data = {
    "Age": [28,45,35,50,30,42,26,48,38,55],
    "Annual_Income": [6.5,12,8,15,7,10,5.5,14,9,16],
    "Credit_Score": [720,680,750,640,710,660,730,650,700,620],
    "Loan_Amount": [5,10,6,12,5,9,4,11,7,13],
    "Loan_Term": [5,10,7,15,5,10,4,12,8,15],
    "Employment_Type": [
        "Salaried","Self-Employed","Salaried","Self-Employed",
        "Salaried","Salaried","Salaried","Self-Employed",
        "Salaried","Self-Employed"
    ],
    "Default": [0,1,0,1,0,1,0,1,0,1]
}

df = pd.DataFrame(data)

# ------------------------------------------
# 2. Encode Categorical Feature
# ------------------------------------------
le = LabelEncoder()
df["Employment_Type"] = le.fit_transform(df["Employment_Type"])
# Salaried = 0, Self-Employed = 1

# ------------------------------------------
# 3. Feature / Target Split
# ------------------------------------------
X = df.drop("Default", axis=1)
y = df["Default"]

# ------------------------------------------
# 4. Feature Scaling (Important for KNN)
# ------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ------------------------------------------
# 5. Train-Test Split (Class Imbalance Aware)
# ------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

# ------------------------------------------
# 6. Train KNN Model
# ------------------------------------------
knn = KNeighborsClassifier(n_neighbors=3, weights="distance")
knn.fit(X_train, y_train)

# ------------------------------------------
# 7. Identify High-Risk Customers
# ------------------------------------------
df["Predicted_Default"] = knn.predict(X_scaled)
df["Default_Probability"] = knn.predict_proba(X_scaled)[:, 1]

high_risk = df[df["Predicted_Default"] == 1]

print("\nHigh-Risk Customers:")
print(high_risk)

# ------------------------------------------
# 8. Pattern Analysis
# ------------------------------------------
print("Patterns leading to loan default:")
print("-Lower credit score (<680)")
print("-Higher loan amounts (>9 lakhs)")
print("-Longer loan terms (>10 years)")
print("-Self Employed customers show higher default frequency\n")

# ------------------------------------------
# 9. Credit Score & Income Influence
# ------------------------------------------
print("Influence of credit score and income:")
print("-Credit score is a strong negative indicator: lower scores increase default risk.")
print("-Higher income helps, but does not fully offset risk if loan amount and terms are high.\n")

# ------------------------------------------
# 10. Banking Policy Recommendation
# ------------------------------------------
print("Suggested bank policies:")
print("-Set stricter credit score threshholds for long term or high value loans.")
print("Require additional guarantees for Self-Employed applicants")
print("Cap loan amount on income-to-loan ratio.\n")

# ------------------------------------------
# 11. Compare KNN with Decision Tree
# ------------------------------------------
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(X_train, y_train)

knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)

print("\nKNN Performance:")
print("Accuracy:", accuracy_score(y_test, knn_pred))
print(confusion_matrix(y_test, knn_pred))
print(classification_report(y_test, knn_pred))

print("\nDecision Tree Performance:")
print("Accuracy:", accuracy_score(y_test, dt_pred))
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))





High-Risk Customers:
   Age  Annual_Income  Credit_Score  Loan_Amount  Loan_Term  Employment_Type  \
1   45           12.0           680           10         10                1   
3   50           15.0           640           12         15                1   
5   42           10.0           660            9         10                0   
7   48           14.0           650           11         12                1   
9   55           16.0           620           13         15                1   

   Default  Predicted_Default  Default_Probability  
1        1                  1                  1.0  
3        1                  1                  1.0  
5        1                  1                  1.0  
7        1                  1                  1.0  
9        1                  1                  1.0  

Pattern Analysis (Averages):
          Age  Annual_Income  Credit_Score  Loan_Amount  Loan_Term  \
Default                                                              
0        




KNN Prediction Time: 1.619896650314331
Decision Tree Prediction Time: 0.41158342361450195
