In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence FutureWarning messages for the rest of the run.
warnings.filterwarnings("ignore", category=FutureWarning)

# Step 1: Load the dataset
csv_path = "/Users/wegdan/Desktop/Team6.csv"
dataset = pd.read_csv(csv_path, low_memory=False)

# Step 2: Prepare the data
# Predictor columns, kept in the exact order they are fed to the model.
feature_columns = [
    'longitude',
    'latitude',
    'police_force',
    'number_of_vehicles',
    'number_of_casualties',
    'local_authority_district',
    'local_authority_ons_district',
    'local_authority_highway',
    'first_road_number',
    'speed_limit',
    'pedestrian_crossing_human_control',
    'pedestrian_crossing_physical_facilities',
    'light_conditions',
    'road_surface_conditions',
    'special_conditions_at_site',
    'carriageway_hazards',
    'did_police_officer_attend_scene_of_accident',
    'trunk_road_flag',
    'hour',
    'second_road',
    'week_number',
    # NOTE(review): the prefixed names below look like one-hot indicator
    # columns produced upstream -- confirm against the preprocessing step.
    'first_road_class_A(M)',
    'first_road_class_B',
    'first_road_class_C',
    'first_road_class_Motorway',
    'first_road_class_Unclassified',
    'road_type_One way street/Slip road',
    'road_type_Roundabout',
    'road_type_Single carriageway',
    'junction_detail_Mini-roundabout',
    'junction_detail_More than 4 arms (not roundabout)',
    'junction_detail_Not at junction or within 20 metres',
    'junction_detail_Other junction',
    'junction_detail_Private drive or entrance',
    'junction_detail_Roundabout',
    'junction_detail_Slip road',
    'junction_detail_T or staggered junction',
    'junction_control_Auto traffic signal',
    'junction_control_Give way or uncontrolled',
    'junction_control_Stop sign',
    'urban_or_rural_area_Unallocated',
    'urban_or_rural_area_Urban',
    'date_August',
    'date_December',
    'date_February',
    'date_January',
    'date_July',
    'date_June',
    'date_March',
    'date_May',
    'date_November',
    'date_October',
    'date_September',
    'day_of_week_Monday',
    'day_of_week_Saturday',
    'day_of_week_Sunday',
    'day_of_week_Thursday',
    'day_of_week_Tuesday',
    'day_of_week_Wednesday',
    'period_of_day_evening',
    'period_of_day_morning',
    'period_of_day_night',
    'moist',
    'high_wind',
]
X = dataset.loc[:, feature_columns]
y = dataset['accident_severity']  # prediction target

# Step 3: Split the data into training and test sets
# 80% train / 20% test, seeded so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Scale the data
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Define the number of bootstrap iterations
n_iterations = 1

# Step 6: Perform bootstrap resampling and evaluate model performance
#
# For every bootstrap iteration a KNN model is fitted on a resampled copy of
# the scaled training set and scored on the fixed test set.  Collected per
# iteration: test accuracy, accuracy on the bootstrap sample itself, the
# confusion matrix, and a one-vs-rest ROC curve + AUC for each class.
accuracy_scores = []
train_accuracy_scores = []  # accuracy on the bootstrap sample the model was fitted on
confusion_matrices = []
roc_auc_scores = []  # per iteration: list of one-vs-rest AUCs, one per class
roc_tpr_curves = []  # per iteration: list of TPR curves interpolated on mean_fpr

# The classes to evaluate are a property of the test set, not of the
# iteration, so compute them once.
classes = np.unique(y_test)

# Fixed label set for the confusion matrix so that every iteration yields a
# matrix of the same shape (required for averaging in Step 9).
cm_labels = np.union1d(classes, np.unique(y_train))

# Common FPR grid: raw ROC curves have a different number of points per
# iteration/class, so they are interpolated onto this grid before averaging.
mean_fpr = np.linspace(0.0, 1.0, 101)

for iteration in range(n_iterations):
    # Create a bootstrap sample (seeded per iteration so runs are
    # reproducible, matching the seeded train/test split).
    X_boot, y_boot = resample(X_train_scaled, y_train, random_state=42 + iteration)

    # Fit the KNN model (8 neighbours, uniform weights, p=1 Minkowski
    # i.e. Manhattan distance, brute-force neighbour search).
    model = KNeighborsClassifier(weights='uniform', p=1, n_neighbors=8, algorithm='brute')
    model.fit(X_boot, y_boot)

    # Make predictions on the test set
    y_pred = model.predict(X_test_scaled)

    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

    # Calculate training accuracy score (on the bootstrap sample)
    y_pred_train = model.predict(X_boot)
    train_accuracy = accuracy_score(y_boot, y_pred_train)
    train_accuracy_scores.append(train_accuracy)

    # Calculate confusion matrix
    confusion_matrices.append(confusion_matrix(y_test, y_pred, labels=cm_labels))

    # Calculate ROC curve and AUC for each class (one-vs-rest).
    # predict_proba does not depend on the class being scored, so it is
    # computed once per iteration instead of once per class.
    y_pred_proba = model.predict_proba(X_test_scaled)
    roc_auc_scores_class = []
    tpr_curves_class = []

    for class_ in classes:
        y_test_binary = np.where(y_test == class_, 1, 0)

        # Columns of predict_proba follow model.classes_; pick the column
        # belonging to *this* class rather than a fixed column index.
        class_column = np.flatnonzero(model.classes_ == class_)
        if class_column.size:
            y_pred_prob = y_pred_proba[:, class_column[0]]
        else:
            # The class never appeared in the bootstrap sample, so the model
            # assigns it zero probability everywhere.
            y_pred_prob = np.zeros(len(y_test_binary))

        fpr, tpr, _thresholds = roc_curve(y_test_binary, y_pred_prob)
        roc_auc = auc(fpr, tpr)
        roc_auc_scores_class.append(roc_auc)

        # Resample the curve on the shared grid so curves can be averaged.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0  # every ROC curve starts at the origin
        tpr_curves_class.append(interp_tpr)

    roc_auc_scores.append(roc_auc_scores_class)
    roc_tpr_curves.append(tpr_curves_class)

# Step 7: Calculate the mean and standard deviation of accuracy scores
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

# Step 8: Calculate the mean and standard deviation of training accuracy scores
mean_train_accuracy = np.mean(train_accuracy_scores)
std_train_accuracy = np.std(train_accuracy_scores)

# Step 9: Calculate the mean confusion matrix
mean_confusion_matrix = np.mean(confusion_matrices, axis=0)

# Step 10: Plot the mean confusion matrix as a figure
plt.figure(figsize=(8, 6))
sns.heatmap(
    mean_confusion_matrix,
    annot=True,
    cmap='Blues',
    fmt='.0f',
    xticklabels=cm_labels,
    yticklabels=cm_labels,
)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Mean Confusion Matrix for KNN')
plt.show()

# Step 11: Calculate the mean ROC curve and AUC for each class
# Both averages run over the bootstrap iterations (axis 0).
roc_auc_scores_mean = np.mean(roc_auc_scores, axis=0)
mean_tpr_curves = np.mean(roc_tpr_curves, axis=0)

# Step 12: Plot the mean ROC curves for each class
# Each class gets its own averaged curve; the AUC in the legend is the mean
# of the per-iteration AUCs computed on the un-interpolated curves.
plt.figure()
for class_, mean_tpr, roc_auc in zip(classes, mean_tpr_curves, roc_auc_scores_mean):
    plt.plot(mean_fpr, mean_tpr, label=f'Class {class_} vs Rest (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Plot the random classifier curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Mean Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Steps 13-14: Report the per-iteration scores together with their mean and
# standard deviation -- first for the test set, then for the bootstrap
# (training) samples.
report_rows = (
    ("Accuracy_KNN:", accuracy_scores),
    ("Mean Accuracy_KNN:", mean_accuracy),
    ("Standard Deviation of Accuracy_KNN:", std_accuracy),
    ("Train Accuracy_KNN:", train_accuracy_scores),
    ("Mean Train Accuracy_KNN:", mean_train_accuracy),
    ("Standard Deviation of Train Accuracy_KNN:", std_train_accuracy),
)
for row_label, row_value in report_rows:
    print(row_label, row_value)
