# Preprocessing and Feature Selection

In [16]:
from enum import IntEnum
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [17]:
project_root = Path().absolute().parent
sys.path.insert(0, str(project_root))

from config import Config

data_path = Config.FARS_CLEANED_DIR / "fars.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,ST_CASE,STATENAME,PEDS,PERNOTMVIT,VE_TOTAL,VE_FORMS,PVH_INVL,PERSONS,PERMVIT,COUNTYNAME,...,RAIL,NOT_HOUR,NOT_MIN,ARR_HOUR,ARR_MIN,HOSP_HR,HOSP_MN,FATALS,TOTAL_HARM,INTOXICATED_DRIVER_INVOLVED
0,10001,Alabama,0,0,2,1,1,1,1,TALLADEGA (121),...,0,6.0,,6,15,88,88,1,4.0,True
1,10002,Alabama,0,0,1,1,0,2,2,WALKER (127),...,0,0.0,,0,59,88,88,2,8.0,False
2,10003,Alabama,0,0,2,2,0,2,2,CHILTON (21),...,0,,,23,10,99,99,1,6.0,True
3,10004,Alabama,0,0,1,1,0,2,2,BALDWIN (3),...,0,13.0,,13,14,88,88,1,4.0,True
4,10005,Alabama,0,0,2,2,0,2,2,JEFFERSON (73),...,0,7.0,,7,28,88,88,1,6.0,False


### 2. Feature Engineering and Further Data Cleaning

We need to update/convert a few columns:
- ST_CASE is not needed anymore. 
- RAIL identifies if the crash occurred in or near a rail grade crossing. It is either 0000000 (not applicable), xxxxxxA (federal code for rail grade crossing) or 9999999 (unknown). Thus we will convert this to a binary feature. Treat 0000000 (N/A) and 9999999 (Unknown) as 0, everything else (actual codes) as 1
- Ensure severity metrics are numeric
- Convert time to cyclical encoding (model needs to see "night cycle"). For example, 23.00 is far from 01.00, but the number 23 is far from 1

In [18]:
# Rail Crossing Binary Feature
df['IS_RAIL_CROSSING'] = np.where(
    df['RAIL'].astype(str).isin(['0000000', '9999999', 'nan']), 
    0, 
    1
)

# Severity & On-Scene Metrics
severity_cols = ['FATALS', 'PERSONS', 'TOTAL_HARM']
for col in severity_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Cyclical Time Encoding
df['HOUR_SIN'] = np.sin(2 * np.pi * df['HOUR'] / 24)
df['HOUR_COS'] = np.cos(2 * np.pi * df['HOUR'] / 24)

df.head()

Unnamed: 0,ST_CASE,STATENAME,PEDS,PERNOTMVIT,VE_TOTAL,VE_FORMS,PVH_INVL,PERSONS,PERMVIT,COUNTYNAME,...,ARR_HOUR,ARR_MIN,HOSP_HR,HOSP_MN,FATALS,TOTAL_HARM,INTOXICATED_DRIVER_INVOLVED,IS_RAIL_CROSSING,HOUR_SIN,HOUR_COS
0,10001,Alabama,0,0,2,1,1,1,1,TALLADEGA (121),...,6,15,88,88,1,4.0,True,0,1.0,6.123234000000001e-17
1,10002,Alabama,0,0,1,1,0,2,2,WALKER (127),...,0,59,88,88,2,8.0,False,0,0.0,1.0
2,10003,Alabama,0,0,2,2,0,2,2,CHILTON (21),...,23,10,99,99,1,6.0,True,0,-0.5,0.8660254
3,10004,Alabama,0,0,1,1,0,2,2,BALDWIN (3),...,13,14,88,88,1,4.0,True,0,-0.258819,-0.9659258
4,10005,Alabama,0,0,2,2,0,2,2,JEFFERSON (73),...,7,28,88,88,1,6.0,False,0,0.965926,-0.258819


#### Save the feature-engineered dataset
This will be used fot the actual model training.

In [None]:
from config import Config
output_path = Config.FARS_CLEANED_DIR / "fars_model_ready.csv"
df.to_csv(output_path, index=False)
print(f"Saved preprocessed data to {output_path}")

Saved preprocessed data to c:\Users\aarthi\Documents\Uni\Y1\FDS\Project\US_accidents_project\data\processed\fars\fars_model_ready.csv


Now we need to select the features. We define categorical (ex. STATENAME, ROUTENAME,...) and numerical (ex. FATALS, IS_RAIL_CROSSING, ...) features separately to ensure easier processing. Moreover, since we will use a Random Forest to give features an importance score, we have to encode the high cardiandlty features with target encoding, since RF is very sensitiv to this.

In [20]:
target_enc_cols = ['STATENAME', 'ROUTENAME', 'HARM_EVNAME', 'REL_ROADNAME']
target = 'INTOXICATED_DRIVER_INVOLVED'

label_enc_cols = ['FUNC_SYSNAME', 'RUR_URBNAME', 'MONTH', 'DAY_WEEKNAME', 
                  'MAN_COLLNAME', 'LGT_CONDNAME', 'WEATHERNAME']

num_features = [
    'IS_RAIL_CROSSING', 'FATALS', 'PERSONS', 'TOTAL_HARM', 
    'HOUR_SIN', 'HOUR_COS' # Using cyclic hours instead of raw 'HOUR'
]

# Create X and y for feature selection
X = df[target_enc_cols + label_enc_cols + num_features].copy()
y = df[target]

We must split NOW, before encoding, to prevent leakage

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Target Encoding
We calculate the mean of the target (Impairment) for each category ONLY in the Training set. Then we map those values to the Test set.

In [22]:
global_mean = y_train.mean() # Fallback for new categories seen in test but not train

for col in target_enc_cols:
    # 1. Calculate average impairment per category (e.g., per State) in TRAIN
    # We combine X_train and y_train temporarily to group them
    train_temp = X_train.copy()
    train_temp['target'] = y_train
    
    # Calculate mappings
    category_means = train_temp.groupby(col)['target'].mean()
    
    # 2. Map these means to the columns
    # Create new column name (e.g., STATENAME_risk)
    new_col = col + '_risk_score'
    
    X_train[new_col] = X_train[col].map(category_means)
    X_test[new_col] = X_test[col].map(category_means)
    
    # 3. Handle Missing Values
    # If a category in Test wasn't in Train, fill with the global average
    X_train[new_col] = X_train[new_col].fillna(global_mean)
    X_test[new_col] = X_test[new_col].fillna(global_mean)
    
    # 4. Drop the original text column
    X_train = X_train.drop(columns=[col])
    X_test = X_test.drop(columns=[col])

#### Label Encoding
For the categorical features, we have to use an encoding. For this we will use the LabelEncoder from sklearn. For the numerical features, we will impute missing values with the median, if there still are any.

In [23]:
# Standard encoding for low-cardinality columns
le = LabelEncoder()
for col in label_enc_cols:
    # Fit on train, transform both (handling potential new labels with error bypass or string casting)
    # Using astype(str) ensures uniformity
    le.fit(X_train[col].astype(str))
    X_train[col] = le.transform(X_train[col].astype(str))
    
    # For test, we must handle unseen labels carefully. 
    # A quick hack for this demo is using fit_transform on test separately or ignoring errors,
    # but strictly, we should map unseen to a value. 
    # Here we re-fit on combined data just for the encoder structure to prevent crashing:
    combined_data = pd.concat([X_train[col], X_test[col]], axis=0).astype(str)
    le.fit(combined_data)
    X_test[col] = le.transform(X_test[col].astype(str))

Fill missing values of numeric features with the median.

In [24]:
# Fill numeric NaNs
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_train.median())

X_train.head()

Unnamed: 0,FUNC_SYSNAME,RUR_URBNAME,MONTH,DAY_WEEKNAME,MAN_COLLNAME,LGT_CONDNAME,WEATHERNAME,IS_RAIL_CROSSING,FATALS,PERSONS,TOTAL_HARM,HOUR_SIN,HOUR_COS,STATENAME_risk_score,ROUTENAME_risk_score,HARM_EVNAME_risk_score,REL_ROADNAME_risk_score
38482,6,4,1,0,2,1,2,0,3,4,12.0,0.258819,0.965926,0.61027,0.558939,0.519336,0.523942
6342,7,4,2,2,0,0,2,0,1,3,10.0,0.5,0.866025,0.690374,0.530437,0.519336,0.523942
57967,0,4,2,5,2,0,2,0,1,3,4.0,-0.5,0.866025,0.713701,0.559712,0.519336,0.523942
99284,3,4,6,1,11,4,2,0,1,1,4.0,0.965926,-0.258819,0.475297,0.558939,0.544411,0.523942
84755,6,4,3,2,11,0,2,0,1,1,4.0,-0.965926,0.258819,0.592413,0.558749,0.544411,0.523942


#### Train & Evaluate
To see what features are imporant, we run a Random Forest:

In [25]:
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Feature Importance
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(importances)

                    Feature  Importance
13     STATENAME_risk_score    0.171280
2                     MONTH    0.114846
3              DAY_WEEKNAME    0.092923
11                 HOUR_SIN    0.074881
14     ROUTENAME_risk_score    0.072680
12                 HOUR_COS    0.069817
0              FUNC_SYSNAME    0.069221
15   HARM_EVNAME_risk_score    0.062559
10               TOTAL_HARM    0.054677
9                   PERSONS    0.045576
6               WEATHERNAME    0.042786
4              MAN_COLLNAME    0.036953
5              LGT_CONDNAME    0.033430
1               RUR_URBNAME    0.025658
16  REL_ROADNAME_risk_score    0.022820
8                    FATALS    0.009099
7          IS_RAIL_CROSSING    0.000797
