# In [None]:
# Step-by-Step Python Code for Credit Card Fraud Detection Using kNN

#1. Load and Explore Data
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt

# Read the transactions into a DataFrame
data = pd.read_csv("creditcard.csv")

# Count rows per label to see how imbalanced the 'Class' column is
class_counts = data['Class'].value_counts()
print("Class distribution:\n", class_counts)

#2. Train-Test Split
# Split data into features (X) and target (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split BEFORE scaling (80% training, 20% testing) so that no statistics from
# the test rows reach the scaler. Stratify to keep the class ratio identical
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, stratify=y
)

#3. Scale Time and Amount Columns
# Scale only 'Time' and 'Amount'. The scaler is fit on the training rows only
# and the learned mean/std are then applied to the test rows; fitting on the
# full dataset would leak test-set information into training.
# NOTE: `data` and `X` are left unscaled; only X_train / X_test are transformed.
scale_cols = ['Time', 'Amount']
scaler = StandardScaler()

# Work on explicit copies so the column assignment below never writes through
# to (or warns about) the frame the split was taken from.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

#4. Apply SMOTE to Handle Class Imbalance
# Oversample the training partition only; the test partition is not resampled
smote = SMOTE(random_state=123)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Count samples per class label after resampling to confirm the result
balanced_counts = np.bincount(y_train_balanced)
print("Class distribution after SMOTE:\n", balanced_counts)

#5. Determine the Optimal k Value
# Choose k on a validation split carved out of the training data. The test set
# is deliberately NOT used here: picking k by test accuracy and then reporting
# metrics on that same test set would make the final numbers optimistic.
k_values = range(1, 21, 2)  # Odd values from 1 to 19

# Hold out 20% of the training rows for validation, preserving the class ratio.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=123, stratify=y_train
)

# Oversample only the fitting portion so every validation row is a real,
# unseen transaction (no synthetic points derived from validation data).
X_fit_balanced, y_fit_balanced = SMOTE(random_state=123).fit_resample(X_fit, y_fit)

accuracies = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_fit_balanced, y_fit_balanced)
    accuracies.append(accuracy_score(y_val, knn.predict(X_val)))

# Find the best k with the highest validation accuracy
# (np.argmax returns the first maximum, so ties resolve to the smallest k)
best_index = int(np.argmax(accuracies))
best_k = k_values[best_index]
best_accuracy = accuracies[best_index]
print("Best k:", best_k, "with accuracy:", best_accuracy)

#6. Plot Accuracy vs. k Values
# Draw the accuracy curve and mark the selected k with a dashed vertical line
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, accuracies, marker='o', color='b')
ax.axvline(x=best_k, linestyle='--', color='g', label=f'Best k={best_k}')
ax.set_title("Accuracy vs. k in kNN")
ax.set_xlabel("k (Number of Neighbors)")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

#7. Train Final Model with Best k and Evaluate
# Fit kNN with the selected k on the balanced training data, then predict
# labels for the test partition (fit() returns the fitted estimator itself)
knn_best = KNeighborsClassifier(n_neighbors=best_k).fit(
    X_train_balanced, y_train_balanced
)
y_pred_best = knn_best.predict(X_test)

#8. Calculate and Interpret Performance Metrics
# Build the confusion matrix first; specificity is derived from its first row
conf_matrix = confusion_matrix(y_test, y_pred_best)

# Row 0 holds the samples whose true label is the first class:
# correctly kept as that class vs. wrongly predicted as the other class
true_neg, false_pos = conf_matrix[0]
specificity = true_neg / (true_neg + false_pos)

# Remaining scores come straight from scikit-learn
recall = recall_score(y_test, y_pred_best)
precision = precision_score(y_test, y_pred_best)
accuracy = accuracy_score(y_test, y_pred_best)

print("Confusion Matrix:\n", conf_matrix)
print(f"Accuracy: {accuracy:.6f}")
print(f"Sensitivity (Recall): {recall:.6f}")
print(f"Specificity: {specificity:.6f}")
print(f"Precision: {precision:.6f}")

