# Preprocessing and Feature Selection

In [18]:
from enum import IntEnum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from pathlib import Path
import sys

In [19]:
project_root = Path().absolute().parent
sys.path.insert(0, str(project_root))

from config import Config

data_path = Config.FARS_CLEANED_DIR / "fars.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,ST_CASE,STATENAME,PERSONS,ROUTENAME,FUNC_SYSNAME,RUR_URBNAME,MONTH,DAY,DAY_WEEKNAME,HOUR,...,TYP_INTNAME,REL_ROADNAME,WRK_ZONE,LGT_CONDNAME,WEATHERNAME,SCH_BUSNAME,RAIL,TOTAL_HARM,DRUNK_DRIVERS,DRIVERS_ON_DRUGS
0,10001,Alabama,1,U.S. Highway,Major Collector,Rural,1,4,Wednesday,22.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Clear,No,0,4.0,0.0,0.0
1,10002,Alabama,1,County,Local,Rural,3,20,Monday,2.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Clear,No,0,4.0,1.0,1.0
2,10003,Alabama,1,County,Major Collector,Rural,3,18,Saturday,22.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Clear,No,0,4.0,1.0,0.0
3,10004,Alabama,3,County,Local,Rural,3,17,Friday,19.0,...,Not an Intersection,On Roadside,0,Dark - Not Lighted,Rain,No,0,10.0,1.0,0.0
4,10005,Alabama,4,Interstate,Interstate,Rural,3,17,Friday,14.0,...,Not an Intersection,On Roadside,0,Daylight,Rain,No,0,12.0,0.0,1.0


### 1. Target Variable Definition

Since the dataset currently includes DRUNK_DRIVERS and DRIVERS_ON_DRUGS and we want to predict if the driver was "in some way impaired", we have combine both features into a single binary class.

In [None]:
# Create the binary target: 1 if impaired, 0 otherwise
target = 'IMPAIRED'
df = df[target] = ((df['DRUNK_DRIVERS'] > 0) | (df['DRIVERS_ON_DRUGS'] > 0)).astype(int)
# df = df.drop(columns=['DRUNK_DRIVERS', 'DRIVERS_ON_DRUGS'])

### 2. Feature Engineering and Further Data Cleaning

We need to update/convert a few columns:
- ST_CASE is not needed anymore. 
- RAIL identifies if the crash occurred in or near a rail grade crossing. It is either 0000000 (not applicable), xxxxxxA (federal code for rail grade crossing) or 9999999 (unknown). Thus we will convert this to a binary feature. Treat 0000000 (N/A) and 9999999 (Unknown) as 0, everything else (actual codes) as 1
- Ensure severity metrics are numeric
- Convert time to cyclical encoding (model needs to see "night cycle"). For example, 23.00 is far from 01.00, but the number 23 is far from 1

In [10]:
# Rail Crossing Binary Feature
df['IS_RAIL_CROSSING'] = np.where(
    df['RAIL'].astype(str).isin(['0000000', '9999999']), 
    0, 
    1
)

# Severity & On-Scene Metrics
severity_cols = ['FATALS', 'PERSONS', 'TOTAL_HARM']
for col in severity_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Cyclical Time Encoding
df['HOUR_SIN'] = np.sin(2 * np.pi * df['HOUR'] / 24)
df['HOUR_COS'] = np.cos(2 * np.pi * df['HOUR'] / 24)

KeyError: 'RAIL'