In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    roc_auc_score
)
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path

# Load the processed accident table into `accident_dfs`. The path is relative,
# so it is resolved against the directory the notebook kernel is running in.
data_path = Path("../data/processed/processed_data.csv")
accident_dfs = pd.read_csv(data_path)


# Disabled exploratory checks: value counts of MAN_COLL and per-column missing
# counts.
# NOTE(review): `pre_crash_features` and `target` are not defined in the cells
# shown here — define them before re-enabling this block, or remove it.
# print(accident_dfs['MAN_COLL'].value_counts())
# print(f"\nMissing values:")
# # Check the missing values
# for col in pre_crash_features + [target]:
#     print(f"  {col}: {accident_dfs[col].isna().sum()}")
    


NameError: name 'pd' is not defined

## Prepare the dataframe

In [None]:
# Disabled — presumably an earlier draft of the modelling-frame construction.
# NOTE(review): `pre_crash_features` and `target` are not defined in the visible
# cells; `df_model` is built from `all_features` in the following code cell.
# df_model = accident_dfs[pre_crash_features + [target, 'STATENAME']].copy()
# df_model = df_model.dropna()
# print(f"Rows: {len(df_model)}")

## Feature engineering, preprocessing (scaling + one-hot encoding), and logistic regression baseline

In [None]:
from sklearn.compose import ColumnTransformer

def add_cyclic_pair(frame, source_col, period, sin_col, cos_col):
    """Write sine/cosine encodings of a periodic column into `frame` in place.

    `frame[source_col]` is scaled by `2 * pi / period` and the sine and cosine
    of that angle are stored under `sin_col` and `cos_col` respectively.
    """
    angle = 2 * np.pi * frame[source_col] / period
    frame[sin_col] = np.sin(angle)
    frame[cos_col] = np.cos(angle)


# Cyclic time features are added directly to the loaded frame.
add_cyclic_pair(accident_dfs, 'HOUR', 24, 'HOUR_SIN', 'HOUR_COS')
add_cyclic_pair(accident_dfs, 'DAY_WEEK', 7, 'DAY_SIN', 'DAY_COS')
add_cyclic_pair(accident_dfs, 'MONTH', 12, 'MONTH_SIN', 'MONTH_COS')

# Feature groups routed to the two preprocessing branches below.
categorical_features = ['WEATHER', 'LGT_COND', 'FUNC_SYS', 'RUR_URB', 'MAN_COLL']
numeric_features = ['HOUR_SIN', 'HOUR_COS', 'DAY_SIN', 'DAY_COS', 'MONTH_SIN', 'MONTH_COS']
all_features = [*categorical_features, *numeric_features]

# Modelling frame: features + raw target + grouping column, complete rows only.
model_columns = [*all_features, 'FATALS', 'STATENAME']
df_model = accident_dfs[model_columns].copy().dropna()

# Binary target: 1 when a crash has more than one fatality, else 0.
df_model['SEVERITY'] = df_model['FATALS'].gt(1).astype(int)

# Hold out whole states so the test rows come from states never seen in training.
test_states = ['Texas', 'Florida', 'Ohio']
in_test_states = df_model['STATENAME'].isin(test_states)
train_df = df_model[~in_test_states]
test_df = df_model[in_test_states]

X_train, y_train = train_df[all_features], train_df['SEVERITY']
X_test, y_test = test_df[all_features], test_df['SEVERITY']

# Numeric branch: median imputation followed by standardisation.
# (Rows with missing values were already dropped above, so on this frame the
# imputers have nothing to fill.)
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group to its branch; any other column is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# Preprocessing + class-weighted logistic regression in a single estimator.
logistic_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', logistic_model),
])
clf_pipeline.fit(X_train, y_train)

# Hard labels and the predicted probability of the positive (column 1) class.
y_pred = clf_pipeline.predict(X_test)
y_prob = clf_pipeline.predict_proba(X_test)[:, 1]

# Text report on the held-out states.
banner = '=' * 50
lr_roc_auc = roc_auc_score(y_test, y_prob)
lr_report = classification_report(y_test, y_pred, target_names=['LOW', 'HIGH'])
lr_confusion = confusion_matrix(y_test, y_pred)

print(f"\n{banner}")
print("  FATALITY SEVERITY PREDICTION — RESULTS")
print(f"{banner}")
print(f"\nROC-AUC Score: {lr_roc_auc:.4f}")
print("\nClassification Report:")
print(lr_report)
print("Confusion Matrix:")
print(lr_confusion)


  FATALITY SEVERITY PREDICTION — RESULTS

ROC-AUC Score: 0.6987

Classification Report:
              precision    recall  f1-score   support

         LOW       0.96      0.68      0.79     30842
        HIGH       0.13      0.61      0.21      2363

    accuracy                           0.67     33205
   macro avg       0.54      0.65      0.50     33205
weighted avg       0.90      0.67      0.75     33205

Confusion Matrix:
[[20847  9995]
 [  910  1453]]


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# ── Evaluation figure for the logistic-regression pipeline ──
# Every number plotted here is computed from `y_test`, `y_pred` and `y_prob`
# (produced by the logistic-regression cell) instead of being typed in by hand,
# so the figure cannot drift out of sync with the model that was actually fit.
class_names = ['LOW', 'HIGH']

# `labels=[0, 1]` pins the layout to 2x2 with row/column 0 = LOW, 1 = HIGH.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
report = classification_report(
    y_test, y_pred, labels=[0, 1], target_names=class_names, output_dict=True
)
roc_auc = roc_auc_score(y_test, y_prob)
accuracy = cm.trace() / cm.sum()  # correct predictions / all predictions

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# ── Left: Classification Metrics Bar Chart ──
# ROC-AUC and accuracy are model-level scores, so both classes show the same
# value for them; precision / recall / F1 are per class.
metrics = ['ROC-AUC', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
per_class_keys = ['precision', 'recall', 'f1-score']
low_values = [roc_auc, accuracy, *(report['LOW'][key] for key in per_class_keys)]
high_values = [roc_auc, accuracy, *(report['HIGH'][key] for key in per_class_keys)]

x = np.arange(len(metrics))
width = 0.35
low_color, high_color = '#3b82f6', '#ef4444'

bars1 = axes[0].bar(x - width/2, low_values, width, label='LOW Severity',
                     color=low_color, edgecolor='white')
bars2 = axes[0].bar(x + width/2, high_values, width, label='HIGH Severity',
                     color=high_color, edgecolor='white')

# Print each bar's value just above it, in the bar's own colour.
for bars, text_color in ((bars1, low_color), (bars2, high_color)):
    for bar in bars:
        axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                     f'{bar.get_height():.2f}', ha='center', fontsize=8,
                     fontweight='bold', color=text_color)

axes[0].set_ylabel('Score', fontsize=11)
axes[0].set_title('Logistic Regression — Classification Metrics',
                   fontsize=12, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(metrics, fontsize=9)
axes[0].set_ylim(0, 1.1)
axes[0].legend(fontsize=9, loc='upper right')
axes[0].spines['top'].set_visible(False)
axes[0].spines['right'].set_visible(False)
# Dotted reference line at a score of 0.5.
axes[0].axhline(y=0.5, color='gray', linestyle=':', linewidth=0.8, alpha=0.5)

# ── Right: Confusion Matrix ──
im = axes[1].imshow(cm, cmap='OrRd', aspect='auto')

labels = [['True Negative\n(Correct LOW)', 'False Positive\n(False Alarm)'],
          ['False Negative\n(Missed HIGH)', 'True Positive\n(Caught HIGH)']]

# Cells in the darker half of the colour range get white text, the rest black;
# the cut-off follows the data rather than a fixed count.
text_threshold = (cm.min() + cm.max()) / 2

for i in range(2):
    for j in range(2):
        color = 'white' if cm[i, j] > text_threshold else 'black'
        axes[1].text(j, i, f'{cm[i,j]:,}\n{labels[i][j]}',
                     ha='center', va='center', fontsize=10,
                     fontweight='bold', color=color)

axes[1].set_xticks([0, 1])
axes[1].set_yticks([0, 1])
axes[1].set_xticklabels(['Predicted LOW', 'Predicted HIGH'], fontsize=10)
axes[1].set_yticklabels(['Actual LOW', 'Actual HIGH'], fontsize=10)
axes[1].set_title('Confusion Matrix', fontsize=12, fontweight='bold')

plt.tight_layout(w_pad=4)

# Create the output folder on first use so saving does not fail on a fresh checkout.
figure_path = Path('../outputs/figures/model_results.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Random-forest counterpart of the logistic-regression pipeline, trained and
# scored on the same train/test split.
#
# The preprocessor is cloned rather than reused: a Pipeline without `memory`
# fits its steps in place, so passing the very same `preprocessor` object would
# make this fit silently refit the transformer that `clf_pipeline` also holds.
# `clone` yields an unfitted copy with identical parameters, so the results are
# the same while the two pipelines stay independent.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight='balanced',
        random_state=42
    ))
])

rf_pipeline.fit(X_train, y_train)

# Hard labels and the predicted probability of the positive (column 1) class.
y_pred_rf = rf_pipeline.predict(X_test)
y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

print(f"\n{'='*50}")
print(f"  RANDOM FOREST — RESULTS")
print(f"{'='*50}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob_rf):.4f}")
print(f"\n{classification_report(y_test, y_pred_rf, target_names=['LOW', 'HIGH'])}")


  RANDOM FOREST — RESULTS
ROC-AUC: 0.7021

              precision    recall  f1-score   support

         LOW       0.95      0.76      0.84     30842
        HIGH       0.14      0.52      0.22      2363

    accuracy                           0.74     33205
   macro avg       0.55      0.64      0.53     33205
weighted avg       0.90      0.74      0.80     33205

