In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score

In [31]:
# Load the dataset from a CSV file; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv('knn34.csv')

In [32]:
# Identify and encode categorical columns.
# Every object-dtype column of `data` is overwritten in place with integer
# codes. The fitted encoder for each column is kept in `label_encoders` so the
# codes can be mapped back to the original values with `inverse_transform`.
# NOTE(review): if 'Credit_Score' is stored as text it is encoded here as well;
# the cutoff logic further down presumes the positive class ends up coded as 1
# -- confirm against the raw labels.
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

# Split the data into features (X) and target (y)
X = data.drop(columns='Credit_Score')
y = data['Credit_Score']

# Split the data into training and test sets (70 % / 30 %, fixed seed so the
# partition is reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features.
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to evaluate performance
def evaluate_performance(y_test, y_pred, cutoff, method):
    """Summarise binary-classification performance for one set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels. Label 0 is treated as the negative class and label 1 as
        the positive class.
    y_pred : array-like
        Predicted labels (0/1), same length as ``y_test``.
    cutoff : float
        Probability threshold that was used to produce ``y_pred``. It is only
        recorded in the result, not used in any computation.
    method : str
        Name of the model. It is only recorded in the result.

    Returns
    -------
    dict
        Keys ``'Method'``, ``'Cutoff'``, ``'ErrorRate'``, ``'Sensitivity'``,
        ``'Specificity'`` and ``'Accuracy'``.
    """
    # Pin the label order: without `labels`, confusion_matrix sizes the matrix
    # from the labels actually present, so a split in which only one class
    # occurs would yield a 1x1 matrix and break the 4-way unpacking below.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    # Guard against empty classes so the ratios never divide by zero.
    sensitivity = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    accuracy = accuracy_score(y_test, y_pred)
    error_rate = 1 - accuracy

    performance_measures = {
        'Method': method,
        'Cutoff': cutoff,
        'ErrorRate': error_rate,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Accuracy': accuracy
    }

    return performance_measures

# KNN
# For each probability cutoff, pick the odd k in 1..99 with the highest test
# accuracy and record the performance measures of that (cutoff, k) pair.
# NOTE(review): k is selected on the same test split the reported metrics are
# computed on, so those metrics are optimistically biased; consider selecting
# k by cross-validation on the training split instead.
print("KNN")
cutoffs = [0.6, 0.7, 0.8]
result_columns = ["Method", "Cutoff", "ErrorRate", "Sensitivity", "Specificity", "Accuracy"]

# Only odd k are tried; k can never exceed the number of training samples.
max_k = min(100, len(X_train_scaled))
k_values = range(1, max_k + 1, 2)

# Fitting and predict_proba do not depend on the cutoff, so do them once per k
# and reuse the positive-class probabilities for every cutoff.
proba_by_k = {}
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    proba_by_k[k] = knn.predict_proba(X_test_scaled)[:, 1]

records = []
for cutoff in cutoffs:
    # Sentinel below any attainable accuracy, so some k is always selected.
    best_score = -1.0
    best_k = None

    for k, proba in proba_by_k.items():
        y_pred = (proba >= cutoff).astype(int)
        score = accuracy_score(y_test, y_pred)
        if score > best_score:  # strict '>' keeps the smallest k on ties
            best_score = score
            best_k = k

    print(f"Best K: {best_k}")
    y_pred_proba = proba_by_k[best_k]
    y_pred = (y_pred_proba >= cutoff).astype(int)
    records.append(evaluate_performance(y_test, y_pred, cutoff, 'KNN'))

# Build the results table once instead of concatenating row by row.
result_knn = pd.DataFrame(records, columns=result_columns)

print(result_knn)

KNN
Best K: 5
Best K: 7
Best K: 1
  Method  Cutoff  ErrorRate  Sensitivity  Specificity  Accuracy
0    KNN     0.6   0.201884     0.797506     0.798740  0.798116
1    KNN     0.7   0.216892     0.678030     0.890648  0.783108
2    KNN     0.8   0.225034     0.791824     0.757713  0.774966
