# Importing Packages and Inicial Data Preview

In [None]:
# Importing packages

import pandas as pd
import numpy as np

from pprint import pprint

import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from itertools import combinations
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,precision_recall_fscore_support, make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

In [2]:
# loading dataset and handle a subset of it

df = pd.read_csv("US_Accidents_March23_sampled_500k.csv")

# Preprocessing

In [12]:
# Droping columns that are not relevant for the model

columns = ['ID','Source','End_Lat','End_Lng','End_Time','Start_Time','Description','Airport_Code','Country','Weather_Timestamp',
           'Civil_Twilight','Nautical_Twilight','Astronomical_Twilight','Timezone','Wind_Direction','Pressure(in)','Zipcode',
           'Precipitation(in)','Humidity(%)','Wind_Chill(F)','Temperature(F)','Sunrise_Sunset','Street','County',
           'State','City']
df1 = df.drop(columns=columns)
df1.head(5)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Visibility(mi),Wind_Speed(mph),Weather_Condition,Amenity,Bump,Crossing,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Duration(min),Year,Month,Weekday,Day,Hour
0,0,30.641211,-91.153481,0.0,10.0,5.0,Clear,False,False,False,...,False,False,True,False,45.033333,2019,6,2,12,10
1,0,38.990562,-77.39907,0.056,10.0,5.0,Clear,False,False,False,...,False,False,False,False,139.65,2022,12,5,3,23
2,0,34.661189,-120.492822,0.022,10.0,13.0,Clear,False,False,False,...,False,False,True,False,129.75,2022,8,5,20,13
3,0,43.680592,-92.993317,1.054,10.0,15.0,Snow,False,False,False,...,False,False,False,False,120.316667,2022,2,0,21,17
4,0,35.395484,-118.985176,0.046,10.0,0.0,Clear,False,False,False,...,False,False,False,False,147.15,2020,12,4,4,1


### Preparing the data before and after the Data Splitting

#####  First Data Splitting

In [16]:
# Checking the class distribution before balancing
print("Before balancing:", Counter(df1['Severity']))

X = df1.drop(columns=['Severity'])
y = df1['Severity']

Before balancing: Counter({0: 402416, 1: 97584})


##### Balancing and Undersampling 

In [17]:
# Random Undersampling first to reduce dataset size

undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=17)
X_resampled, y_resampled = undersample.fit_resample(X, y)

print("After Undersampling:", Counter(y_resampled))

After Undersampling: Counter({0: 139405, 1: 97584})


##### Data Splitting into Train and Test

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

### Tomek Links

In [20]:
# Apply Tomek Links to get better class separation
# Try to not do this step and see if the results get better/worse

tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X_train_encoded, y_train)  

print("After Tomek Links:", Counter(y_train))

After Tomek Links: Counter({0: 111753, 1: 77838})


##### Standardized the Data

In [21]:
# Doing Standardization after splitting to avoid data leakage

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_tomek)
X_test_scaled = scaler.transform(X_test_encoded) 

##### PCA on the Data

In [22]:
# Using PCA 

pca = PCA(n_components=20) 

X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.transform(X_test_scaled)

# Models Implementation

### Testing Hyperparameter Tuning With Optuna Optimizations

In [None]:
# Defining tuning optimization for SVM

def objective_svc(trial):
    params = {
        "C": trial.suggest_loguniform("C", 1e-3, 100),
        "gamma": trial.suggest_loguniform("gamma", 1e-4, 1e-1),
        "kernel": "rbf", "linear"
        "class_weight": "balanced",
        "probability": True
    }

    model = SVC(**params)

    scores = cross_validate(model, X_train_scaled2, y_resampled2, scoring=scoring, cv=cv)
    return scores["test_accuracy"].mean(), scores["test_precision"].mean(), scores["test_recall"].mean()

study_svc = optuna.create_study(directions=["maximize", "maximize", "maximize"], study_name="SVC")
study_svc.optimize(objective_svc, n_trials=30)

vis.plot_pareto_front(study_svc, target_names=["Accuracy", "Precision", "Recall"]).show()

Running GridSearch for Decision Tree...
Running GridSearch for KNN...
Running GridSearch for Naïve Bayes...
{'Decision Tree': {'Accuracy': np.float64(0.6949839885132318),
                   'Best Model': DecisionTreeClassifier(max_depth=11, min_samples_leaf=2),
                   'Best Parameters': {'max_depth': 11,
                                       'min_samples_leaf': 2,
                                       'min_samples_split': 2},
                   'F1-score': np.float64(0.6948744131047853),
                   'Precision': np.float64(0.6948729047893455),
                   'Recall': np.float64(0.6949839885132318)},
 'KNN': {'Accuracy': np.float64(0.727098730544232),
         'Best Model': KNeighborsClassifier(metric='euclidean', n_neighbors=19, weights='distance'),
         'Best Parameters': {'metric': 'euclidean',
                             'n_neighbors': 19,
                             'weights': 'distance'},
         'F1-score': np.float64(0.7265706639647359),
        

In [None]:
# Defining tuning optimization for RF

def objective_rf(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "class_weight": "balanced",
        "n_jobs": -1
    }

    model = RandomForestClassifier(**params)

    scores = cross_validate(model, X_train_scaled2, y_resampled2, scoring=scoring, cv=cv)
    return scores["test_accuracy"].mean(), scores["test_precision"].mean(), scores["test_recall"].mean()

study_rf = optuna.create_study(directions=["maximize", "maximize", "maximize"], study_name="RF")
study_rf.optimize(objective_rf, n_trials=30)

vis.plot_pareto_front(study_rf, target_names=["Accuracy", "Precision", "Recall"]).show()

In [None]:
# Defining tuning optimization for NN

def create_model(trial):
    model = Sequential()
    model.add(Dense(trial.suggest_int("units1", 64, 256), activation="relu", input_shape=(X_train_scaled2.shape[1],)))
    model.add(Dropout(trial.suggest_float("dropout1", 0.2, 0.5)))

    for i in range(trial.suggest_int("n_layers", 1, 3)):
        model.add(Dense(trial.suggest_int(f"units_{i}", 32, 128), activation="relu"))
        model.add(Dropout(trial.suggest_float(f"dropout_{i}", 0.2, 0.5)))

    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer=Adam(learning_rate=trial.suggest_loguniform("lr", 1e-4, 1e-2)),
                  loss="binary_crossentropy",
                  metrics=["accuracy", "precision", "recall"])
    return model

def objective_nn(trial):
    n_units1 = trial.suggest_int("n_units1", 32, 128)
    n_units2 = trial.suggest_int("n_units2", 16, 64)
    dropout1 = trial.suggest_float("dropout1", 0.2, 0.6)
    dropout2 = trial.suggest_float("dropout2", 0.2, 0.6)
    learning_rate = trial.suggest_loguniform("lr", 1e-4, 1e-2)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    epochs = 20

    model = KerasClassifier(
        lambda: create_model(n_units1, n_units2, dropout1, dropout2, learning_rate),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0
    )

    classes = np.unique(y_resampled2)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_resampled2)
    class_weights = dict(zip(classes, weights))

    scores = cross_validate(model, X_train_scaled2, y_resampled2,
                            scoring=scoring, cv=cv, fit_params={"class_weight": class_weights})

    return scores["test_accuracy"].mean(), scores["test_precision"].mean(), scores["test_recall"].mean()

study_nn = optuna.create_study(directions=["maximize", "maximize", "maximize"], study_name="NN")
study_nn.optimize(objective_nn, n_trials=30)

vis.plot_pareto_front(study_nn, target_names=["Accuracy", "Precision", "Recall"]).show()

In [None]:
# Best parameters for each model

best_trial_svm = max(study_svc.best_trials, key=lambda t: sum(t.values))
print("SVC Best Params:", best_trial_svm.params)
print("Accuracy:", best_trial_svm.values[0])
print("Precision:", best_trial_svm.values[1])
print("Recall:", best_trial_svm.values[2])

best_trial_rf = max(study_rf.best_trials, key=lambda t: sum(t.values))
print("SVC Best Params:", best_trial_rf.params)
print("Accuracy:", best_trial_rf.values[0])
print("Precision:", best_trial_rf.values[1])
print("Recall:", best_trial_rf.values[2])

best_trial_nn = max(study_nn.best_trials, key=lambda t: sum(t.values))
print("SVC Best Params:", best_trial_nn.params)
print("Accuracy:", best_trial_nn.values[0])
print("Precision:", best_trial_nn.values[1])
print("Recall:", best_trial_nn.values[2])

NameError: name 'KNeighborsClassifier' is not defined

In [None]:
# Best model and the parameters tuned
print("The best model obtained (SVC) and it's params:", best_trial_svc.params)

cm = confusion_matrix(y_test, y_pred_model)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
plt.show()