# Preprocessing and Feature Selection

In [49]:
from enum import IntEnum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [50]:
# Put the parent of the notebook's working directory on the import path so
# the project-local `config` module can be resolved.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from config import Config  # must follow the sys.path tweak above

# Read the cleaned FARS table and preview its first rows.
data_path = Config.FARS_CLEANED_DIR / "fars.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,ST_CASE,STATENAME,PERSONS,ROUTENAME,FUNC_SYSNAME,RUR_URBNAME,MONTH,DAY,DAY_WEEKNAME,HOUR,...,TYP_INTNAME,REL_ROADNAME,WRK_ZONE,LGT_CONDNAME,WEATHERNAME,SCH_BUSNAME,RAIL,TOTAL_HARM,DRUNK_DRIVERS,DRIVERS_ON_DRUGS
0,10001,Alabama,1,U.S. Highway,Major Collector,Rural,1,4,Wednesday,22.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Clear,No,0,4.0,0.0,0.0
1,10002,Alabama,1,County,Local,Rural,3,20,Monday,2.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Clear,No,0,4.0,1.0,1.0
2,10003,Alabama,1,County,Major Collector,Rural,3,18,Saturday,22.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Clear,No,0,4.0,1.0,0.0
3,10004,Alabama,3,County,Local,Rural,3,17,Friday,19.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Rain,No,0,10.0,1.0,0.0
4,10005,Alabama,4,Interstate,Interstate,Rural,3,17,Friday,14.0,...,Not an Intersection,On Roadside,0,Daylight,Rain,No,0,12.0,0.0,1.0


### 1. Target Variable Definition

Since the dataset currently includes DRUNK_DRIVERS and DRIVERS_ON_DRUGS and we want to predict if the driver was "in some way impaired", we have to combine both features into a single binary class.

In [51]:
# Binary target: 1 when at least one of the two impairment counters is
# positive for the row, 0 otherwise (missing counters compare as not positive).
target = 'IMPAIRED'
df[target] = (
    df[['DRUNK_DRIVERS', 'DRIVERS_ON_DRUGS']]
    .gt(0)
    .any(axis=1)
    .astype(int)
)

### 2. Feature Engineering and Further Data Cleaning

We need to update/convert a few columns:
- ST_CASE is not needed anymore. 
- RAIL identifies if the crash occurred in or near a rail grade crossing. It is either 0000000 (not applicable), xxxxxxA (federal code for rail grade crossing) or 9999999 (unknown). Thus we will convert this to a binary feature. Treat 0000000 (N/A) and 9999999 (Unknown) as 0, everything else (actual codes) as 1
- Ensure severity metrics are numeric
- Convert time to a cyclical encoding so the model can see the "night cycle". For example, 23:00 is close to 01:00 on the clock, but the raw number 23 is far from 1.

In [62]:
# Rail Crossing Binary Feature
# RAIL is a 7-character code: '0000000' (not applicable), '9999999' (unknown)
# or an actual crossing identifier. When the CSV reader parses the column
# numerically the sentinels arrive as 0 / 9999999 (or 0.0 / 9999999.0) and no
# longer equal the zero-padded literals, so the text is normalised first.
rail_text = df['RAIL'].astype(str).str.strip()
rail_codes = (
    rail_text
    .str.replace(r'\.0+$', '', regex=True)  # '0.0' -> '0' (float-parsed values)
    .str.zfill(7)                           # '0'   -> '0000000' (int-parsed values)
)
no_crossing = (
    df['RAIL'].isna()
    | rail_text.eq('nan')
    | rail_codes.isin(['0000000', '9999999'])
)
df['IS_RAIL_CROSSING'] = np.where(no_crossing, 0, 1)

# Severity & On-Scene Metrics
# Anything that cannot be parsed as a number becomes NaN and is then set to 0.
severity_cols = ['FATALS', 'PERSONS', 'TOTAL_HARM']
for col in severity_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Cyclical Time Encoding
# Project the hour onto a 24-hour circle so late-night and early-morning hours
# end up close together.
# NOTE(review): an out-of-range HOUR (e.g. an "unknown" sentinel) would be
# encoded as if it were a real time of day - confirm the cleaned data has none.
hour_angle = 2 * np.pi * df['HOUR'] / 24
df['HOUR_SIN'] = np.sin(hour_angle)
df['HOUR_COS'] = np.cos(hour_angle)

df.head()

Unnamed: 0,ST_CASE,STATENAME,PERSONS,ROUTENAME,FUNC_SYSNAME,RUR_URBNAME,MONTH,DAY,DAY_WEEKNAME,HOUR,...,WEATHERNAME,SCH_BUSNAME,RAIL,TOTAL_HARM,DRUNK_DRIVERS,DRIVERS_ON_DRUGS,IMPAIRED,IS_RAIL_CROSSING,HOUR_SIN,HOUR_COS
0,10001,Alabama,1,U.S. Highway,Major Collector,Rural,1,4,Wednesday,22.0,...,Clear,No,0,4.0,0.0,0.0,0,0,-0.5,0.866025
1,10002,Alabama,1,County,Local,Rural,3,20,Monday,2.0,...,Clear,No,0,4.0,1.0,1.0,1,0,0.5,0.866025
2,10003,Alabama,1,County,Major Collector,Rural,3,18,Saturday,22.0,...,Clear,No,0,4.0,1.0,0.0,1,0,-0.5,0.866025
3,10004,Alabama,3,County,Local,Rural,3,17,Friday,19.0,...,Rain,No,0,10.0,1.0,0.0,1,0,-0.965926,0.258819
4,10005,Alabama,4,Interstate,Interstate,Rural,3,17,Friday,14.0,...,Rain,No,0,12.0,0.0,1.0,1,0,-0.5,-0.866025


#### Save the feature-engineered dataset
This will be used for the actual model training.

In [63]:
# Write the feature-engineered frame into the cleaned-data directory for the
# model-training step. `Config` is already imported by the data-loading cell,
# so it is not imported a second time here.
output_path = Config.FARS_CLEANED_DIR / "fars_model_ready.csv"
df.to_csv(output_path, index=False)
print(f"Saved preprocessed data to {output_path}")

Saved preprocessed data to /Users/rafaelgufler/Documents/master/project/US_accidents_project/data/processed/fars/fars_model_ready.csv


Now we need to select the features. We define categorical (e.g. STATENAME, ROUTENAME, ...) and numerical (e.g. FATALS, IS_RAIL_CROSSING, ...) features separately to ensure easier processing. Moreover, since we will use a Random Forest to give features an importance score, we have to encode the high-cardinality features with target encoding, since RF is very sensitive to this.

In [53]:
# Columns grouped by the encoding they receive further down.

# Categoricals that are replaced by a per-category mean of the target.
target_enc_cols = ['STATENAME', 'ROUTENAME', 'HARM_EVNAME', 'REL_ROADNAME']

# Categoricals that are turned into integer label codes.
# NOTE(review): MONTH looks numeric in the preview; label-encoding its string
# form orders the codes lexicographically ('1', '10', '11', '12', '2', ...)
# rather than chronologically - confirm that this is intended.
label_enc_cols = [
    'FUNC_SYSNAME',
    'RUR_URBNAME',
    'MONTH',
    'DAY_WEEKNAME',
    'MAN_COLLNAME',
    'LGT_CONDNAME',
    'WEATHERNAME',
]

# Numeric inputs; the sine/cosine pair stands in for the raw 'HOUR' column.
num_features = [
    'IS_RAIL_CROSSING',
    'FATALS',
    'PERSONS',
    'TOTAL_HARM',
    'HOUR_SIN',
    'HOUR_COS',
]

# Feature matrix (an independent copy of the selected columns) and target.
X = df.loc[:, target_enc_cols + label_enc_cols + num_features].copy()
y = df[target]

We must split NOW, before encoding, to prevent leakage

In [54]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Target Encoding
We calculate the mean of the target (Impairment) for each category ONLY in the Training set. Then we map those values to the Test set.

In [55]:
# Target encoding: replace each listed categorical column by the mean of the
# target observed for that category in the TRAIN split.
#
# NOTE(review): every training row is encoded with a mean that includes its
# own label, which can flatter these columns in the importance ranking below;
# out-of-fold encoding would avoid that - confirm whether it matters here.

# Fallback for categories that have no mean (unseen in train, or missing).
global_mean = y_train.mean()

for col in target_enc_cols:
    # Mean target per category, computed on TRAIN only. Grouping y_train by
    # the feature column directly avoids copying the whole training frame on
    # every iteration just to attach the target.
    category_means = y_train.groupby(X_train[col]).mean()

    # Name of the encoded column, e.g. STATENAME_risk_score.
    new_col = col + '_risk_score'

    # Map both splits through the train-derived means, fall back to the
    # global train mean where no mapping exists, then drop the text column.
    X_train = X_train.assign(
        **{new_col: X_train[col].map(category_means).fillna(global_mean)}
    ).drop(columns=[col])
    X_test = X_test.assign(
        **{new_col: X_test[col].map(category_means).fillna(global_mean)}
    ).drop(columns=[col])

#### Label Encoding
For the categorical features, we have to use an encoding. For this we will use the LabelEncoder from sklearn. For the numerical features, we will impute missing values with the median, if there still are any.

In [56]:
# Standard encoding for low-cardinality columns
le = LabelEncoder()
for col in label_enc_cols:
    # Fit on train, transform both (handling potential new labels with error bypass or string casting)
    # Using astype(str) ensures uniformity
    le.fit(X_train[col].astype(str))
    X_train[col] = le.transform(X_train[col].astype(str))
    
    # For test, we must handle unseen labels carefully. 
    # A quick hack for this demo is using fit_transform on test separately or ignoring errors,
    # but strictly, we should map unseen to a value. 
    # Here we re-fit on combined data just for the encoder structure to prevent crashing:
    combined_data = pd.concat([X_train[col], X_test[col]], axis=0).astype(str)
    le.fit(combined_data)
    X_test[col] = le.transform(X_test[col].astype(str))

Fill missing values of numeric features with the median.

In [60]:
# Fill numeric NaNs
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_train.median())

X_train.head()

Unnamed: 0,FUNC_SYSNAME,RUR_URBNAME,MONTH,DAY_WEEKNAME,MAN_COLLNAME,LGT_CONDNAME,WEATHERNAME,IS_RAIL_CROSSING,FATALS,PERSONS,TOTAL_HARM,HOUR_SIN,HOUR_COS,STATENAME_risk_score,ROUTENAME_risk_score,HARM_EVNAME_risk_score,REL_ROADNAME_risk_score
15159,3,1,3,5,8,4,2,0,1,3,4.0,-0.5,-0.866025,0.277193,0.402837,0.399118,0.371508
15041,2,1,10,3,9,4,11,0,1,1,4.0,0.5,-0.866025,0.277193,0.367894,0.376344,0.472025
4475,3,4,1,4,9,4,2,0,1,1,4.0,-0.866025,-0.5,0.311111,0.303833,0.301676,0.371508
6562,7,1,6,4,7,4,2,0,1,3,6.0,0.866025,-0.5,0.564394,0.402837,0.399118,0.371508
36433,0,4,7,2,2,0,2,0,1,6,4.0,-0.5,0.866025,0.672978,0.410346,0.399118,0.371508


#### Train & Evaluate
To see which features are important, we run a Random Forest:

In [61]:
# Fit a seeded 100-tree random forest on the encoded training split.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Feature Importance
# One row per training column with the forest's importance score, sorted so
# the highest-scoring column comes first.
importance_table = {
    'Feature': X_train.columns,
    'Importance': rf.feature_importances_,
}
importances = pd.DataFrame(importance_table).sort_values(
    by='Importance', ascending=False
)

print(importances)

                    Feature  Importance
13     STATENAME_risk_score    0.196038
2                     MONTH    0.107698
3              DAY_WEEKNAME    0.087187
11                 HOUR_SIN    0.074749
12                 HOUR_COS    0.072210
14     ROUTENAME_risk_score    0.069732
0              FUNC_SYSNAME    0.066776
15   HARM_EVNAME_risk_score    0.064645
10               TOTAL_HARM    0.052423
9                   PERSONS    0.048372
6               WEATHERNAME    0.038713
5              LGT_CONDNAME    0.036402
4              MAN_COLLNAME    0.026014
1               RUR_URBNAME    0.024692
16  REL_ROADNAME_risk_score    0.024107
8                    FATALS    0.009380
7          IS_RAIL_CROSSING    0.000862
