# Preprocessing and Feature Selection

In [52]:
from enum import IntEnum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [59]:
# Put the parent of the current working directory on the import path so that
# the project-level `config` module can be imported from this notebook.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from config import Config

# Read the cleaned FARS table and preview the first rows.
data_path = Config.FARS_CLEANED_DIR / "fars.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
df.head()

Unnamed: 0,ST_CASE,STATENAME,PEDS,PERNOTMVIT,VE_TOTAL,VE_FORMS,PVH_INVL,PERSONS,PERMVIT,COUNTYNAME,...,NOT_MIN,ARR_HOUR,ARR_MIN,HOSP_HR,HOSP_MN,FATALS,TOTAL_HARM,ALCOHOL_DRIVER_INVOLVED,DRUG_DRIVER_INVOLVED,INTOXICATED_DRIVER_INVOLVED
0,10001,Alabama,0,0,2,1,1,1,1,TALLADEGA (121),...,,6,15,88,88,1,4.0,False,False,False
1,10002,Alabama,0,0,1,1,0,2,2,WALKER (127),...,,0,59,88,88,2,8.0,False,False,False
2,10003,Alabama,0,0,2,2,0,2,2,CHILTON (21),...,,23,10,99,99,1,6.0,False,True,True
3,10004,Alabama,0,0,1,1,0,2,2,BALDWIN (3),...,,13,14,88,88,1,4.0,False,False,False
4,10005,Alabama,0,0,2,2,0,2,2,JEFFERSON (73),...,,7,28,88,88,1,6.0,False,False,False


### 2. Feature Engineering and Further Data Cleaning

We need to update/convert a few columns:
- ST_CASE is not needed anymore. 
- RAIL identifies if the crash occurred in or near a rail grade crossing. It is either 0000000 (not applicable), xxxxxxA (federal code for rail grade crossing) or 9999999 (unknown). Thus we will convert this to a binary feature. Treat 0000000 (N/A) and 9999999 (Unknown) as 0, everything else (actual codes) as 1
- Ensure severity metrics are numeric
- Convert time to a cyclical encoding (the model needs to see the day/night cycle). For example, 23:00 is close to 01:00 on the clock, but the raw number 23 is far from 1.

In [60]:
def _rail_crossing_flag(rail):
    """Return a 0/1 array: 1 where RAIL holds an actual crossing identifier.

    '0000000' (not applicable), '9999999' (unknown) and missing values map to 0.
    The codes are normalised before comparison because a CSV round-trip can
    turn the all-digit sentinels into numbers (0, 9999999, 9999999.0); a plain
    string comparison would then misclassify them as real crossings.
    """
    rail_text = (
        rail.astype(str)
        .str.strip()
        .str.replace(r'\.0$', '', regex=True)  # undo float formatting such as '9999999.0'
    )
    is_missing = rail.isna() | rail_text.str.lower().eq('nan')
    # zfill restores leading zeros lost when the sentinel was parsed as a number
    is_sentinel = rail_text.str.zfill(7).isin(['0000000', '9999999'])
    return np.where(is_missing | is_sentinel, 0, 1)


def _cyclical_hour(hour):
    """Return (sin, cos) of the hour of day mapped onto a 24-hour circle.

    Values outside 0-24 become NaN instead of being encoded: the FARS files use
    sentinel codes for unknown times (99 per the FARS manual), and feeding those
    into sin/cos would silently place them at a real time of day.
    NaNs are left for the numeric imputation step that runs before modelling.
    """
    hour = pd.to_numeric(hour, errors='coerce')
    hour = hour.where(hour.between(0, 24))
    angle = 2 * np.pi * hour / 24
    return np.sin(angle), np.cos(angle)


# Rail Crossing Binary Feature
df['IS_RAIL_CROSSING'] = _rail_crossing_flag(df['RAIL'])

# Severity & On-Scene Metrics: force numeric, treating unparseable values as 0
severity_cols = ['FATALS', 'PERSONS', 'TOTAL_HARM']
for col in severity_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Cyclical Time Encoding
df['HOUR_SIN'], df['HOUR_COS'] = _cyclical_hour(df['HOUR'])

df.head()

Unnamed: 0,ST_CASE,STATENAME,PEDS,PERNOTMVIT,VE_TOTAL,VE_FORMS,PVH_INVL,PERSONS,PERMVIT,COUNTYNAME,...,HOSP_HR,HOSP_MN,FATALS,TOTAL_HARM,ALCOHOL_DRIVER_INVOLVED,DRUG_DRIVER_INVOLVED,INTOXICATED_DRIVER_INVOLVED,IS_RAIL_CROSSING,HOUR_SIN,HOUR_COS
0,10001,Alabama,0,0,2,1,1,1,1,TALLADEGA (121),...,88,88,1,4.0,False,False,False,0,1.0,6.123234000000001e-17
1,10002,Alabama,0,0,1,1,0,2,2,WALKER (127),...,88,88,2,8.0,False,False,False,0,0.0,1.0
2,10003,Alabama,0,0,2,2,0,2,2,CHILTON (21),...,99,99,1,6.0,False,True,True,0,-0.5,0.8660254
3,10004,Alabama,0,0,1,1,0,2,2,BALDWIN (3),...,88,88,1,4.0,False,False,False,0,-0.258819,-0.9659258
4,10005,Alabama,0,0,2,2,0,2,2,JEFFERSON (73),...,88,88,1,6.0,False,False,False,0,0.965926,-0.258819


#### Save the feature-engineered dataset
This will be used for the actual model training.

In [61]:
from config import Config

# Write the feature-engineered frame (without the row index) into the cleaned
# FARS directory. Note: a later cell writes to this same file name again.
output_path = Config.FARS_CLEANED_DIR / "fars_model_ready.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"Saved preprocessed data to {output_path}")

Saved preprocessed data to /Users/amadeusrieck/Library/Mobile Documents/com~apple~CloudDocs/Documents/Studium/Masterstudium/ERASMUS/Data_Science_Found/US_accidents_project/data/processed/fars/fars_model_ready.csv


Now we need to select the features. We define categorical (e.g. STATENAME, ROUTENAME, ...) and numerical (e.g. FATALS, IS_RAIL_CROSSING, ...) features separately to make processing easier. Moreover, since we will use a Random Forest to give each feature an importance score, we encode the high-cardinality features with target encoding, because RF importance scores are very sensitive to cardinality.

In [62]:
# Categorical columns; all of them are target-encoded (the LabelEncoder path was dropped).
target_enc_cols = [
    'STATENAME',
    'ROUTENAME',
    'HARM_EVNAME',
    'REL_ROADNAME',
    'FUNC_SYSNAME',
    'RUR_URBNAME',
    'MONTH',
    'DAY_WEEKNAME',
    'MAN_COLLNAME',
    'LGT_CONDNAME',
    'WEATHERNAME',
]

# Column the model should predict.
target = 'INTOXICATED_DRIVER_INVOLVED'

# Numeric columns; the cyclic hour pair replaces the raw 'HOUR' value.
num_features = [
    'IS_RAIL_CROSSING',
    'FATALS',
    'PERSONS',
    'TOTAL_HARM',
    'HOUR_SIN',
    'HOUR_COS',
]

# Feature matrix (categoricals first, then numerics) and label vector for feature selection.
X = df[[*target_enc_cols, *num_features]].copy()
y = df[target]

We must split NOW, before encoding, to prevent leakage

In [63]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Target Encoding (extended)
We calculate the mean of the target (Impairment) for each category ONLY in the Training set. Then we map those values to the Test set.

In [64]:
# Target encoding: replace every categorical column with the mean of the target
# observed for that category in the TRAINING split, then apply the same mapping
# to the test split so no test labels leak into the features.
#
# NOTE(review): training rows are encoded with means that include their own
# label (no smoothing / out-of-fold scheme), so rare categories can look more
# predictive on train than they are — worth confirming this is acceptable.
global_mean = y_train.mean()  # fallback for categories absent from train (and for missing categories)

# Work on explicit copies so the column assignments below never write into a
# view of the frame the split was taken from.
X_train = X_train.copy()
X_test = X_test.copy()

for col in target_enc_cols:
    if col not in X_train.columns:
        # Already encoded by an earlier run of this cell; skipping keeps the
        # cell re-runnable instead of failing with a KeyError.
        continue

    # 1. Mean target per category, computed from the training rows only.
    #    Grouping y_train by the aligned column avoids copying X_train each iteration.
    category_means = y_train.groupby(X_train[col]).mean()

    # 2. Map the means into a new column (e.g. STATENAME_risk_score);
    # 3. categories without a training mean fall back to the global average.
    new_col = col + '_risk_score'
    X_train[new_col] = X_train[col].map(category_means).fillna(global_mean)
    X_test[new_col] = X_test[col].map(category_means).fillna(global_mean)

    # 4. Drop the original text column
    X_train = X_train.drop(columns=[col])
    X_test = X_test.drop(columns=[col])

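#### Label Encoding (disabled)
**Kept for reference only.** An earlier version encoded the categorical features with sklearn's `LabelEncoder`. All categorical features are now target-encoded above, so the cell below is wrapped in a string and is not executed. For the numerical features, we impute any remaining missing values with the median (see the next cell).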

In [None]:
# DISABLED: earlier LabelEncoder approach. The whole cell is a bare string
# literal, so none of the code inside it runs.
# Notes for anyone reviving it:
#   * `label_enc_cols` is not defined in the visible cells of this notebook.
#   * X_train[col] is overwritten with integer codes before `combined_data` is
#     built, so the second `le.fit` sees encoded train values mixed with raw
#     test labels; train and test would then be encoded with different mappings.
'''
# Standard encoding for low-cardinality columns
le = LabelEncoder()
for col in label_enc_cols:
    # Fit on train, transform both (handling potential new labels with error bypass or string casting)
    # Using astype(str) ensures uniformity
    le.fit(X_train[col].astype(str))
    X_train[col] = le.transform(X_train[col].astype(str))
    
    # For test, we must handle unseen labels carefully. 
    # A quick hack for this demo is using fit_transform on test separately or ignoring errors,
    # but strictly, we should map unseen to a value. 
    # Here we re-fit on combined data just for the encoder structure to prevent crashing:
    combined_data = pd.concat([X_train[col], X_test[col]], axis=0).astype(str)
    le.fit(combined_data)
    X_test[col] = le.transform(X_test[col].astype(str))
'''

Fill missing values of numeric features with the median.

In [65]:
# Fill numeric NaNs
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_train.median())

X_train.head()

Unnamed: 0,IS_RAIL_CROSSING,FATALS,PERSONS,TOTAL_HARM,HOUR_SIN,HOUR_COS,STATENAME_risk_score,ROUTENAME_risk_score,HARM_EVNAME_risk_score,REL_ROADNAME_risk_score,FUNC_SYSNAME_risk_score,RUR_URBNAME_risk_score,MONTH_risk_score,DAY_WEEKNAME_risk_score,MAN_COLLNAME_risk_score,LGT_CONDNAME_risk_score,WEATHERNAME_risk_score
38482,0,3,4,12.0,0.258819,0.965926,0.338263,0.314861,0.364888,0.273092,0.30115,0.319533,0.331464,0.340361,0.355408,0.438605,0.347347
6342,0,1,3,10.0,0.5,0.866025,0.449713,0.31482,0.364888,0.273092,0.38127,0.319533,0.321332,0.41424,0.3055,0.372441,0.347347
57967,0,1,3,4.0,-0.5,0.866025,0.416496,0.343885,0.364888,0.273092,0.346224,0.319533,0.321332,0.298489,0.355408,0.372441,0.347347
99284,0,1,1,4.0,0.965926,-0.258819,0.311164,0.314861,0.075529,0.273092,0.337007,0.319533,0.367656,0.305132,0.331295,0.279677,0.347347
84755,0,1,1,4.0,-0.965926,0.258819,0.30912,0.355883,0.075529,0.273092,0.30115,0.319533,0.313342,0.41424,0.331295,0.372441,0.347347


#### Train & Evaluate
To see which features are important, we run a Random Forest:

In [66]:
# Fit a 100-tree forest on the encoded training split (fixed seed, all cores).
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Pair each column with the forest's importance score, highest first.
importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': rf.feature_importances_}
)
importances = importances.sort_values(by='Importance', ascending=False)

print(importances)

                    Feature  Importance
6      STATENAME_risk_score    0.128089
8    HARM_EVNAME_risk_score    0.121779
12         MONTH_risk_score    0.099091
5                  HOUR_COS    0.078806
13  DAY_WEEKNAME_risk_score    0.072044
2                   PERSONS    0.066743
4                  HOUR_SIN    0.061230
7      ROUTENAME_risk_score    0.059513
10  FUNC_SYSNAME_risk_score    0.058903
3                TOTAL_HARM    0.057374
16   WEATHERNAME_risk_score    0.042158
9   REL_ROADNAME_risk_score    0.041829
14  MAN_COLLNAME_risk_score    0.040806
15  LGT_CONDNAME_risk_score    0.038695
11   RUR_URBNAME_risk_score    0.022750
1                    FATALS    0.009621
0          IS_RAIL_CROSSING    0.000570


In [71]:
# Percentage of rows for which each impairment flag is set.
impairment_flags = [
    'ALCOHOL_DRIVER_INVOLVED',
    'DRUG_DRIVER_INVOLVED',
    'INTOXICATED_DRIVER_INVOLVED',
]
df[impairment_flags].mean().mul(100)

ALCOHOL_DRIVER_INVOLVED        22.186370
DRUG_DRIVER_INVOLVED           18.868266
INTOXICATED_DRIVER_INVOLVED    34.527889
dtype: float64

#### Test Drug Data

In [68]:
# Count each value of the drug flag, with missing values reported as their own bucket.
df.loc[:, 'DRUG_DRIVER_INVOLVED'].value_counts(dropna=False)

DRUG_DRIVER_INVOLVED
False    85237
True     19823
Name: count, dtype: int64

### Check actual DRUGRESNAME categories in dataset

In [69]:
import glob

# Collect every <subfolder>/drugs.csv under the raw FARS directory.
# Sorted so the concatenation order does not depend on filesystem listing order.
paths = sorted(glob.glob(str(Config.FARS_RAW_DIR / "*" / "drugs.csv")))
if not paths:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        f"No drugs.csv files found under {Config.FARS_RAW_DIR} (expected <subfolder>/drugs.csv)"
    )

# The raw files are read as ISO-8859-1 and stacked into one frame.
dfs = [pd.read_csv(p, encoding="ISO-8859-1") for p in paths]
df_drugs = pd.concat(dfs, ignore_index=True)

# Inspect the 50 most frequent result labels. Labels are shown as stored, so
# spellings that differ only in case are counted as separate categories.
df_drugs['DRUGRESNAME'].value_counts(dropna=False).head(50)

DRUGRESNAME
Test Not Given                                                  200825
Tested, No Drugs Found/Negative                                  43317
Not Reported                                                     30870
None Detected/Below Threshold                                    20483
Reported as Unknown if Tested for Drugs                          14386
Other Drug                                                       11174
Delta 9-tetrahydrocannabinol [THC]                                9212
11-nor-9-carboxy-delta-9- tetrahydrocannabinol (Carboxy THC)      8020
Tetrahydrocannabinols (THC)                                       6751
Amphetamine                                                       6494
Methamphetamine                                                   6296
METHAMPHETAMINE                                                   4911
AMPHETAMINE                                                       4654
11-hydroxy-delta-9-tetrahydrocannabinol (Hydroxy-THC)            

#### NOW save fars_model_ready.csv again to use for Modelling

In [72]:
# Persist the encoded data for the modelling step.
# This overwrites the file written by the earlier save cell (same path) with
# the encoded TRAINING rows plus their target column.
output_path = Config.FARS_CLEANED_DIR / "fars_model_ready.csv"
final_df = pd.concat([X_train, y_train], axis=1)  # or df with encoded variables
final_df.to_csv(output_path, index=False)
print("Saved model-ready dataset:", output_path)

# Also keep the held-out split. It was encoded with training-only statistics,
# so it is the only leakage-free data for a final evaluation and would
# otherwise be lost when this notebook's kernel stops.
test_output_path = Config.FARS_CLEANED_DIR / "fars_model_ready_test.csv"
final_test_df = pd.concat([X_test, y_test], axis=1)
final_test_df.to_csv(test_output_path, index=False)
print("Saved held-out test split:", test_output_path)

Saved model-ready dataset: /Users/amadeusrieck/Library/Mobile Documents/com~apple~CloudDocs/Documents/Studium/Masterstudium/ERASMUS/Data_Science_Found/US_accidents_project/data/processed/fars/fars_model_ready.csv
