In [6]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load data without header
df = pd.read_csv('loan_data.csv', header=None)

# assign column names
df.columns = ['Age', 'Annual_Income', 'Credit_Score', 'Loan_Amount', 'Loan_Term', 'Employment_Type', 'Loan_Status']

# Check if the data looks correct
print("Cleaned Columns:", df.columns)
print(df.head())

# Separate target and features
X = df.drop('Loan_Status', axis=1)

# Identify numerical and categorical columns
numerical_features = ['Age', 'Annual_Income', 'Credit_Score', 'Loan_Amount', 'Loan_Term']
categorical_features = ['Employment_Type']

# Handle missing values using SimpleImputer (mean for numerical, mode for categorical)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  # Standard scaling for numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding for categorical features
])

# Combine preprocessing for numerical and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handling Class Imbalance using class weights (use in KNN and Decision Trees)
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(zip(np.unique(y), class_weights))

# KNN Classifier
knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

# Train KNN Model
knn.fit(X_train, y_train)

# Make Predictions
y_pred_knn = knn.predict(X_test)

# Evaluate KNN Model
print("\nKNN Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

# Decision Tree Classifier
dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(class_weight=class_weights_dict, random_state=42))
])

# Train Decision Tree Model
dt.fit(X_train, y_train)

# Make Predictions
y_pred_dt = dt.predict(X_test)

# Evaluate Decision Tree Model
print("\nDecision Tree Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

# Compare Models Based on Accuracy
print("\nComparison of KNN vs Decision Tree:")
print(f"KNN Accuracy: {accuracy_score(y_test, y_pred_knn)}")
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt)}")


Cleaned Columns: Index(['Age', 'Annual_Income', 'Credit_Score', 'Loan_Amount', 'Loan_Term',
       'Employment_Type', 'Loan_Status'],
      dtype='object')
   Age  Annual_Income  Credit_Score  Loan_Amount  Loan_Term Employment_Type  \
0   28            6.5           720            5          5        Salaried   
1   45           12.0           680           10         10   Self-Employed   
2   35            8.0           750            6          7        Salaried   
3   50           15.0           640           12         15   Self-Employed   
4   30            7.0           710            5          5        Salaried   

   Loan_Status  
0            0  
1            1  
2            0  
3            1  
4            0  

KNN Model Evaluation:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/neighbors/_classification.py", line 239, in fit
    return self._fit(X, y)
           ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/neighbors/_base.py", line 478, in _fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: ' Salaried'
