In [14]:
# Loan-default classification: chi-square feature selection, then an accuracy comparison of seven classifiers
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt

# 1. Feature selection

def selectkbest(indep_X, dep_Y, n):
    """Keep the ``n`` features with the highest chi-square score.

    Parameters
    ----------
    indep_X : pandas.DataFrame
        Feature table. Its ``.columns`` attribute is read to report the names
        of the retained features, so a plain array is not accepted here.
    dep_Y : array-like
        Class labels, one per row of ``indep_X``.
    n : int or "all"
        Forwarded unchanged to ``SelectKBest`` as ``k``.

    Returns
    -------
    The reduced feature matrix produced by ``SelectKBest.transform``,
    containing only the retained columns.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    selector.fit(indep_X, dep_Y)
    selected_columns = indep_X.columns[selector.get_support()]
    # Report how many features were actually kept; the message previously
    # said "top 5" no matter which value of n the caller passed.
    print(f"Selected top {len(selected_columns)} features:", selected_columns.tolist())
    return selector.transform(indep_X)

# 2. Split and scale
def split_scalar(indep_X, dep_Y):
    """Hold out 25% of the rows and standardize the features.

    The split is seeded with ``random_state=0``. The scaler is fitted on the
    training portion only and then applied to both portions, so the held-out
    rows do not contribute to the scaling statistics.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature matrices
        standardized and the label parts returned as produced by the split.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    scaler = StandardScaler().fit(train_X)
    return scaler.transform(train_X), scaler.transform(test_X), train_y, test_y

# 3. Confusion matrix and accuracy

def cm_prediction(classifier, X_test, y_test):
    """Score an already-fitted classifier on the held-out data.

    Parameters
    ----------
    classifier
        Fitted estimator exposing ``predict``.
    X_test, y_test
        Held-out features and their true labels.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, X_test, y_test, confusion_matrix)``
        where ``report`` is the output of ``classification_report``. The
        classifier and the test data are passed through unchanged.
    """
    predicted = classifier.predict(X_test)
    return (
        classifier,
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
        X_test,
        y_test,
        confusion_matrix(y_test, predicted),
    )

# 4. Classifiers with GridSearchCV where applicable

def logistic_gridsearch(X_train, y_train, X_test, y_test):
    """Grid-search a logistic regression and evaluate the best candidate.

    The search uses 5-fold cross-validation on the training data over the
    solver and ``multi_class`` settings, with an L2 penalty and balanced
    class weights fixed. The winning estimator is then scored on the test
    data by ``cm_prediction``, whose tuple is returned as-is.
    """
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases -- confirm against the installed version.
    search_space = dict(
        penalty=['l2'],
        class_weight=['balanced'],
        multi_class=['ovr', 'auto'],
        solver=['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    )
    search = GridSearchCV(estimator=LogisticRegression(), param_grid=search_space, cv=5)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    return cm_prediction(best_model, X_test, y_test)

def svm_linear_gridsearch(X_train, y_train, X_test, y_test):
    """Grid-search a linear-kernel SVM and evaluate the best candidate.

    The regularization strength ``C`` is tuned with 5-fold cross-validation
    on the training data. The winning estimator is then scored on the test
    data by ``cm_prediction``, whose tuple is returned as-is.
    """
    # `gamma` is a coefficient of the rbf/poly/sigmoid kernels only and has
    # no effect on a linear kernel. Searching over it merely fitted every C
    # twice with identical scores, so it is left at its default here.
    param_grid = {'C': [0.1, 1, 10]}
    grid = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5)
    grid.fit(X_train, y_train)
    return cm_prediction(grid.best_estimator_, X_test, y_test)

def svm_NL_gridsearch(X_train, y_train, X_test, y_test):
    """Grid-search an RBF-kernel SVM and evaluate the best candidate.

    ``C`` and ``gamma`` are tuned with 5-fold cross-validation on the
    training data. The winning estimator is then scored on the test data by
    ``cm_prediction``, whose tuple is returned as-is.
    """
    search_space = dict(C=[0.1, 1, 10], gamma=['scale', 'auto'])
    search = GridSearchCV(estimator=SVC(kernel='rbf'), param_grid=search_space, cv=5)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    return cm_prediction(best_model, X_test, y_test)

def Navie(X_train, y_train, X_test, y_test):
    """Fit a Gaussian naive Bayes model and evaluate it on the test data.

    No hyper-parameters are searched; the default ``GaussianNB`` is fitted on
    the training data and handed to ``cm_prediction``, whose tuple is
    returned as-is.
    """
    fitted = GaussianNB().fit(X_train, y_train)
    return cm_prediction(fitted, X_test, y_test)

def knn_gridsearch(X_train, y_train, X_test, y_test):
    """Grid-search a k-nearest-neighbours classifier and evaluate the winner.

    The neighbour count is fixed at 5 and the metric at Minkowski; the
    weighting scheme and the neighbour-search algorithm are tuned with 5-fold
    cross-validation on the training data. The winning estimator is then
    scored on the test data by ``cm_prediction``, whose tuple is returned
    as-is.
    """
    search_space = dict(
        n_neighbors=[5],
        weights=['uniform', 'distance'],
        algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
        metric=['minkowski'],
    )
    search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=search_space, cv=5)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    return cm_prediction(best_model, X_test, y_test)

def Decision_gridsearch(X_train, y_train, X_test, y_test, random_state=0):
    """Grid-search a decision tree and evaluate the best candidate.

    The split criterion, ``max_features`` and the splitter strategy are tuned
    with 5-fold cross-validation on the training data. The winning estimator
    is then scored on the test data by ``cm_prediction``, whose tuple is
    returned as-is.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the search.
    X_test, y_test
        Held-out features and labels used for the final evaluation.
    random_state : int or None, default 0
        Seed forwarded to ``DecisionTreeClassifier``. It defaults to the same
        seed the train/test split in this file uses; pass ``None`` to get the
        earlier unseeded behaviour.
    """
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2'],
        'splitter': ['best', 'random']
    }
    # Every candidate subsamples features (and half of them pick random
    # split points), so an unseeded tree reports a different accuracy on
    # each run. Seeding makes the comparison table reproducible.
    tree = DecisionTreeClassifier(random_state=random_state)
    grid = GridSearchCV(tree, param_grid, cv=5)
    grid.fit(X_train, y_train)
    return cm_prediction(grid.best_estimator_, X_test, y_test)

def random_gridsearch(X_train, y_train, X_test, y_test, random_state=0):
    """Grid-search a random forest and evaluate the best candidate.

    The split criterion, ``max_features`` and the number of trees are tuned
    with 5-fold cross-validation on the training data. The winning estimator
    is then scored on the test data by ``cm_prediction``, whose tuple is
    returned as-is.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the search.
    X_test, y_test
        Held-out features and labels used for the final evaluation.
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestClassifier``. It defaults to the same
        seed the train/test split in this file uses; pass ``None`` to get the
        earlier unseeded behaviour.
    """
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2'],
        'n_estimators': [10, 100]
    }
    # A forest is randomized by construction; without a seed the selected
    # hyper-parameters and the reported accuracy drift from run to run.
    forest = RandomForestClassifier(random_state=random_state)
    grid = GridSearchCV(forest, param_grid, cv=5)
    grid.fit(X_train, y_train)
    return cm_prediction(grid.best_estimator_, X_test, y_test)

# 5. Create results dataframe
def selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    """Collect the first accuracy of each model into a one-row table.

    Each argument is an indexable sequence; only element ``0`` is read. The
    result is a DataFrame with the single row ``'ChiSquare'`` and one column
    per model, in the same order as the parameters.
    """
    model_labels = ['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random']
    per_model = (acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)
    first_scores = [scores[0] for scores in per_model]

    summary = pd.DataFrame(index=['ChiSquare'], columns=model_labels)
    summary.loc['ChiSquare'] = first_scores
    return summary

# ==== Main Execution ====
# Load the preprocessed loan data and one-hot encode it, dropping the first
# level of every encoded column.
dataset1 = pd.read_csv("Preprocessed_Loan_Default.csv")
df2 = pd.get_dummies(dataset1.copy(), drop_first=True)

# The encoded 'Loan_Status_Non-Default' column is the label; all remaining
# columns are candidate features.
dep_Y = df2['Loan_Status_Non-Default']
indep_X = df2.drop(columns='Loan_Status_Non-Default')

# NOTE(review): the selector is fitted on every row before the train/test
# split below, so the held-out rows also influence which columns are kept.
kbest = selectkbest(indep_X, dep_Y, 5)
X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

# One accuracy list per model, in the order selectk_Classification expects.
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = [], [], [], [], [], [], []

# Train and evaluate the models in a fixed order. `classifier`, `Accuracy`
# and `report` are rebound on each pass and end up describing the last model.
for trainer, scores in (
    (logistic_gridsearch, acclog),
    (svm_linear_gridsearch, accsvml),
    (svm_NL_gridsearch, accsvmnl),
    (knn_gridsearch, accknn),
    (Navie, accnav),
    (Decision_gridsearch, accdes),
    (random_gridsearch, accrf),
):
    classifier, Accuracy, report, _, _, _ = trainer(X_train, y_train, X_test, y_test)
    scores.append(Accuracy)

result = selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)
print("\nModel Accuracy Results:\n")
print(result)


Selected top 5 features: ['Income', 'Credit_Score', 'Existing_Loan_Balance', 'Loan_Amount', 'Interest_Rate']

Model Accuracy Results:

          Logistic    SVMl   SVMnl     KNN   Navie Decision  Random
ChiSquare   0.5136  0.7996  0.7996  0.7616  0.7996   0.6688  0.7956
