# Step 2 - Feature Engineering for FARS Analysis

This is the code for the second step of our pipeline.
In summary:
* Create interpretable features for FARS Data
* Create categories of some features for a better understanding/analysis
* Create features covering diverse areas (e.g., temporal, demographic, environmental)

Feature engineering was an additional step we decided to include in order to have enhanced features that can provide more information.

A limitation of this approach is that, because there are many features, they can be combined or transformed in a very large number of ways. Due to time constraints and the massive amount of data, we engineered a substantial set of features, but not every possible one.

In [1]:
from pathlib import Path
from typing import Optional
import polars as pl

### How to run the code:

1) Run libraries
2) Run all the sections in order (top to bottom)
3) Run the "Use" section
4) Optional: review the code of each section

### Features

Within this section, all the different types of features are created:

##### Temporal features
- Features created:
    - TIME_OF_DAY: Morning/Afternoon/Evening/Night based on HOUR
    - WEEKEND_FLAG: Weekend/Weekday based on DAY_WEEK
    - RUSH_HOUR: 1 if weekday and HOUR is in 7-9 or 16-19 (bounds inclusive), else 0
    - SEASON: Winter/Spring/Summer/Fall based on MONTH
    - HOLIDAY_PERIOD: 1 if major holiday period, else 0
    - NIGHT: 1 if TIME_OF_DAY is Night, else 0

In [2]:
def create_temporal_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add time-based features derived from HOUR, DAY_WEEK, MONTH and DAY.

    A feature is only created when its source column(s) are present in
    ``df``; missing inputs are skipped and simply lower the count that is
    printed at the end.

    Features created:
        TIME_OF_DAY: "Morning" (HOUR 6-11), "Afternoon" (12-17),
            "Evening" (18-21), "Night" (22-24 or 0-5); null otherwise.
        WEEKEND_FLAG: "Weekend" for DAY_WEEK 1 or 7, "Weekday" for 2-6,
            null otherwise.
        RUSH_HOUR: 1 when DAY_WEEK is 2-6 and HOUR is 7-9 or 16-19
            (bounds inclusive), else 0.
        SEASON: "Winter" (MONTH 12, 1, 2), "Spring" (3-5), "Summer" (6-8),
            "Fall" (9-11); null otherwise.
        HOLIDAY_PERIOD: 1 for Dec 22-31, Jan 1-3, Jul 1-7 and Nov 22-28,
            else 0.
        NIGHT: 1 when TIME_OF_DAY equals "Night", else 0.

    Args:
        df: Input frame.

    Returns:
        The frame with the temporal feature columns added.
    """
    features_added = 0

    # Time of Day
    if "HOUR" in df.columns:
        hour = pl.col("HOUR")
        df = df.with_columns(
            pl.when(hour.is_between(6, 11))
            .then(pl.lit("Morning"))
            .when(hour.is_between(12, 17))
            .then(pl.lit("Afternoon"))
            .when(hour.is_between(18, 21))
            .then(pl.lit("Evening"))
            # Both night ranges are bounded so that out-of-range values
            # (e.g. an "unknown hour" sentinel such as 99, if the input still
            # carries one) fall through to null instead of becoming "Night".
            # 24 is accepted in case midnight is encoded that way.
            .when(((hour >= 22) & (hour <= 24)) | ((hour >= 0) & (hour < 6)))
            .then(pl.lit("Night"))
            .otherwise(None)
            .alias("TIME_OF_DAY")
        )
        features_added += 1

    # Weekend / Weekday Flag
    if "DAY_WEEK" in df.columns:
        day_week = pl.col("DAY_WEEK")
        df = df.with_columns(
            pl.when(day_week.is_in([1, 7]))  # 1=Sun, 7=Sat
            .then(pl.lit("Weekend"))
            .when(day_week.is_between(2, 6))
            .then(pl.lit("Weekday"))
            .otherwise(None)
            .alias("WEEKEND_FLAG")
        )
        features_added += 1

    # Rush Hour (weekdays only; hour bounds are inclusive)
    if "HOUR" in df.columns and "DAY_WEEK" in df.columns:
        hour = pl.col("HOUR")
        is_weekday = pl.col("DAY_WEEK").is_between(2, 6)
        in_peak_hours = hour.is_between(7, 9) | hour.is_between(16, 19)
        df = df.with_columns(
            pl.when(is_weekday & in_peak_hours)
            .then(1)
            .otherwise(0)
            .alias("RUSH_HOUR")
        )
        features_added += 1

    # Season (USA, as FARS data is from there)
    if "MONTH" in df.columns:
        month = pl.col("MONTH")
        df = df.with_columns(
            pl.when(month.is_in([12, 1, 2]))
            .then(pl.lit("Winter"))
            .when(month.is_in([3, 4, 5]))
            .then(pl.lit("Spring"))
            .when(month.is_in([6, 7, 8]))
            .then(pl.lit("Summer"))
            .when(month.is_in([9, 10, 11]))
            .then(pl.lit("Fall"))
            .otherwise(None)
            .alias("SEASON")
        )
        features_added += 1

    # Holiday Period (USA, as FARS data is from there)
    if "MONTH" in df.columns and "DAY" in df.columns:
        month = pl.col("MONTH")
        day = pl.col("DAY")
        # Day ranges are bounded to real calendar days so that an
        # out-of-range DAY value cannot be flagged as a holiday.
        is_holiday = (
            ((month == 12) & day.is_between(28, 31))    # New Year (December part)
            | ((month == 1) & day.is_between(1, 3))     # New Year (January part)
            | ((month == 7) & day.is_between(1, 7))     # July 4th
            | ((month == 11) & day.is_between(22, 28))  # Thanksgiving
            | ((month == 12) & day.is_between(22, 28))  # Christmas
        )
        df = df.with_columns(
            pl.when(is_holiday)
            .then(1)
            .otherwise(0)
            .alias("HOLIDAY_PERIOD")
        )
        features_added += 1

    # Night flag (a null TIME_OF_DAY yields 0 through the otherwise branch)
    if "TIME_OF_DAY" in df.columns:
        df = df.with_columns(
            pl.when(pl.col("TIME_OF_DAY") == "Night")
            .then(1)
            .otherwise(0)
            .alias("NIGHT")
        )
        features_added += 1

    print(f"Created {features_added} temporal features")
    return df

##### Demographic features
- Features created:
    - AGE_GROUP: Teen/Young/Adult/Senior based on AGE
    - YOUNG_DRIVER: 1 if AGE < 25, else 0
    - SENIOR_DRIVER: 1 if AGE >= 65, else 0
    - MALE: 1 if SEX == 1, else 0

In [3]:
def create_demographic_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add age- and sex-based features derived from AGE and SEX.

    A feature is only created when its source column is present in ``df``.

    Features created:
        AGE_GROUP: "Teen" (AGE 16-19), "Young" (20-24), "Adult" (25-64),
            "Senior" (65-120); null otherwise.
        YOUNG_DRIVER: 1 if AGE < 25, else 0. There is no lower bound, so
            ages below 16 are flagged as well.
        SENIOR_DRIVER: 1 if AGE is 65-120, else 0.
        MALE: 1 if SEX == 1, else 0.

    Args:
        df: Input frame.

    Returns:
        The frame with the demographic feature columns added.
    """
    # Upper bound for a plausible age. Without it, large "not reported" /
    # "unknown" sentinel codes (e.g. 998/999, if the input still carries
    # them) would be classified as seniors.
    max_valid_age = 120

    features_added = 0

    if "AGE" in df.columns:
        age = pl.col("AGE")
        is_senior = age.is_between(65, max_valid_age)

        df = df.with_columns([
            # Age Groups
            pl.when(age.is_between(16, 19))
            .then(pl.lit("Teen"))
            .when(age.is_between(20, 24))
            .then(pl.lit("Young"))
            .when(age.is_between(25, 64))
            .then(pl.lit("Adult"))
            .when(is_senior)
            .then(pl.lit("Senior"))
            .otherwise(None)
            .alias("AGE_GROUP"),
            # Young Driver Flag
            pl.when(age < 25)
            .then(1)
            .otherwise(0)
            .alias("YOUNG_DRIVER"),
            # Senior Driver Flag
            pl.when(is_senior)
            .then(1)
            .otherwise(0)
            .alias("SENIOR_DRIVER"),
        ])
        features_added += 3

    # Male Driver Flag
    if "SEX" in df.columns:
        df = df.with_columns(
            pl.when(pl.col("SEX") == 1)
            .then(1)
            .otherwise(0)
            .alias("MALE")
        )
        features_added += 1

    print(f"Created {features_added} demographic features")
    return df

##### Risk Factor features

These features introduce potential bias because they represent factors already known to be associated with crash risk. Alcohol use, drug involvement, speeding, and lack of restraint use are penalized in most places due to their known relationship with fatal accidents. While these features are valuable for exploratory analysis and understanding crash patterns, they should be carefully considered for clustering to avoid circular reasoning (we do not want to rediscover what is already established rather than reveal novel latent patterns).

- Features created:
    - ALCOHOL_INVOLVED: 1 if alcohol detected, else 0
    - DRUG_INVOLVED: 1 if drugs detected, else 0
    - SPEEDING: 1 if speed-related, else 0
    - NO_RESTRAINT: 1 if no restraint used, else 0
    - RISK_COUNT: Total count of risk factors present

In [4]:
def create_risk_factor_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add binary risk-factor flags and their row-wise total.

    A flag is only created when its source column is present in ``df``.
    Null, "not reported" and "unknown" source values are treated as absence
    of the risk factor (flag = 0).

    Features created:
        ALCOHOL_INVOLVED: 1 if DRINKING == 1, else 0.
        DRUG_INVOLVED: 1 if DRUGS == 1, else 0.
        SPEEDING: 1 if SPEEDREL is 1-5, else 0.
        NO_RESTRAINT: 1 if REST_USE is one of the "none used" codes, else 0.
        RISK_COUNT: sum of the flags above plus YOUNG_DRIVER, using whichever
            of them exist in the frame; created only when at least two exist.
            YOUNG_DRIVER is not created here, so it must already be in ``df``
            (e.g. from ``create_demographic_features``) to be counted.

    Args:
        df: Input frame.

    Returns:
        The frame with the risk-factor feature columns added.
    """
    # NOTE(review): code values below follow the FARS coding as recalled from
    # the Analytical User's Manual - confirm against the manual for the data
    # year(s) actually loaded.
    # REST_USE: 7 = "None Used" (2010-2018), 20 = "None Used/Not Applicable"
    # (2019 onward).
    no_restraint_codes = [7, 20]
    # SPEEDREL: 1-5 are the "Yes" variants; 8/9 are unknown / not reported and
    # must not count as speeding.
    speeding_code_range = (1, 5)

    def binary_flag(condition: pl.Expr, name: str) -> pl.Expr:
        # 1 where the condition is true, 0 where it is false or null.
        return pl.when(condition).then(1).otherwise(0).alias(name)

    flags = []

    # Alcohol Involved (nulls/not reported are treated as absence, 0)
    if "DRINKING" in df.columns:
        flags.append(binary_flag(pl.col("DRINKING") == 1, "ALCOHOL_INVOLVED"))

    # Drug Involved
    if "DRUGS" in df.columns:
        flags.append(binary_flag(pl.col("DRUGS") == 1, "DRUG_INVOLVED"))

    # Speeding (bounded so unknown codes are not counted as speeding)
    if "SPEEDREL" in df.columns:
        flags.append(
            binary_flag(
                pl.col("SPEEDREL").is_between(*speeding_code_range),
                "SPEEDING",
            )
        )

    # No Restraint: only the explicit "none used" codes count. Previously
    # every non-null REST_USE (including belted occupants) was flagged.
    if "REST_USE" in df.columns:
        flags.append(
            binary_flag(
                pl.col("REST_USE").is_in(no_restraint_codes),
                "NO_RESTRAINT",
            )
        )

    if flags:
        df = df.with_columns(flags)
    features_added = len(flags)

    # Risk Count (sum of risk factors)
    risk_vars = ["ALCOHOL_INVOLVED", "DRUG_INVOLVED", "SPEEDING",
                 "NO_RESTRAINT", "YOUNG_DRIVER"]
    available_risk_vars = [v for v in risk_vars if v in df.columns]

    if len(available_risk_vars) >= 2:
        df = df.with_columns(
            sum(pl.col(v) for v in available_risk_vars).alias("RISK_COUNT")
        )
        features_added += 1

    print(f"Created {features_added} risk factor features")
    return df


##### Environmental Features
- Features created:
    - ADVERSE_WEATHER: 1 if bad weather, else 0
    - DARK_CONDITIONS: 1 if dark/poorly lit, else 0

In [5]:
def create_environmental_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add weather and lighting flags.

    Features created (each only when its source column is present):
        ADVERSE_WEATHER: 1 if the weather code is in the adverse set, else 0.
            Read from WEATHER when available, otherwise from WEATHER1.
        DARK_CONDITIONS: 1 if LGT_COND is one of 2-6, else 0.

    Args:
        df: Input frame.

    Returns:
        The frame with the environmental feature columns added.
    """
    def as_flag(condition: pl.Expr, name: str) -> pl.Expr:
        # 1 where the condition holds, 0 otherwise.
        return pl.when(condition).then(1).otherwise(0).alias(name)

    new_columns = []

    # Adverse Weather: prefer WEATHER, fall back to WEATHER1.
    weather_source = next(
        (name for name in ("WEATHER", "WEATHER1") if name in df.columns),
        None,
    )
    if weather_source is not None:
        # Every code except 1 (Clear) and 9 within the 1-12 range.
        adverse_codes = [2, 3, 4, 5, 6, 7, 8, 10, 11, 12]
        new_columns.append(
            as_flag(pl.col(weather_source).is_in(adverse_codes), "ADVERSE_WEATHER")
        )

    # Dark Conditions
    if "LGT_COND" in df.columns:
        # Codes: 1=Daylight, 2=Dark-not lighted, 3=Dark-lighted, 4=Dawn,
        # 5=Dusk, 6=Dark unknown lighting. Everything but daylight counts.
        non_daylight_codes = [2, 3, 4, 5, 6]
        new_columns.append(
            as_flag(pl.col("LGT_COND").is_in(non_daylight_codes), "DARK_CONDITIONS")
        )

    if new_columns:
        df = df.with_columns(new_columns)

    print(f"Created {len(new_columns)} environmental features")
    return df

##### Vehicle Features
- Features created:
    - VEHICLE_AGE: YEAR - MOD_YEAR
    - OLD_VEHICLE: 1 if vehicle age > 15 years, else 0
    - PASSENGER_CAR: 1 if passenger car, else 0
    - LARGE_TRUCK: 1 if large truck, else 0
    - MOTORCYCLE: 1 if motorcycle, else 0

In [6]:
def create_vehicle_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add vehicle age and vehicle body-type flags.

    Features created (each only when its source column(s) are present):
        VEHICLE_AGE: YEAR - MOD_YEAR.
        OLD_VEHICLE: 1 if VEHICLE_AGE > 15, else 0.
        PASSENGER_CAR: 1 if BODY_TYP is 1-19, else 0.
        LARGE_TRUCK: 1 if BODY_TYP is 60-79, else 0.
        MOTORCYCLE: 1 if BODY_TYP is 80-89, else 0.

    Args:
        df: Input frame.

    Returns:
        The frame with the vehicle feature columns added.
    """
    created = 0
    columns = df.columns

    # Vehicle Age
    if "MOD_YEAR" in columns and "YEAR" in columns:
        vehicle_age = pl.col("YEAR") - pl.col("MOD_YEAR")
        df = df.with_columns(vehicle_age.alias("VEHICLE_AGE"))
        created += 1

    # Old Vehicle Flag. Checked against the current frame because it may
    # rely on the VEHICLE_AGE column added just above.
    if "VEHICLE_AGE" in df.columns:
        is_old = pl.col("VEHICLE_AGE") > 15
        df = df.with_columns(
            pl.when(is_old).then(1).otherwise(0).alias("OLD_VEHICLE")
        )
        created += 1

    # Separation of different vehicle types: (feature name, first code, last code)
    body_type_ranges = (
        ("PASSENGER_CAR", 1, 19),
        ("LARGE_TRUCK", 60, 79),
        ("MOTORCYCLE", 80, 89),
    )
    if "BODY_TYP" in df.columns:
        body_type = pl.col("BODY_TYP")
        df = df.with_columns([
            pl.when(body_type.is_between(first, last))
            .then(1)
            .otherwise(0)
            .alias(feature)
            for feature, first, last in body_type_ranges
        ])
        created += len(body_type_ranges)

    print(f"Created {created} vehicle features")
    return df

##### Geographic Features
- Features created:
    - RURAL: 1 if rural area, else 0
    - URBAN: 1 if urban area, else 0
    - INTERSTATE: 1 if interstate highway, else 0

In [7]:
def create_geographic_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add rural/urban and interstate flags.

    Features created (each only when its source column is present):
        RURAL: 1 if RUR_URB == 1, else 0.
        URBAN: 1 if RUR_URB == 2, else 0.
        INTERSTATE: 1 if ROUTE == 1, else 0.

    Args:
        df: Input frame.

    Returns:
        The frame with the geographic feature columns added.
    """
    # (source column, matching code, feature name), in output order.
    flag_specs = (
        ("RUR_URB", 1, "RURAL"),
        ("RUR_URB", 2, "URBAN"),
        ("ROUTE", 1, "INTERSTATE"),
    )

    present = set(df.columns)
    new_columns = [
        pl.when(pl.col(source) == code).then(1).otherwise(0).alias(feature)
        for source, code, feature in flag_specs
        if source in present
    ]

    if new_columns:
        df = df.with_columns(new_columns)

    print(f"Created {len(new_columns)} geographic features")
    return df

##### Crash Characteristics Features
- Features created:
    - INTERSECTION: 1 if at/near intersection, else 0
    - WORK_ZONE_CRASH: 1 if in work zone, else 0
    - ROLLOVER_CRASH: 1 if vehicle rolled over, else 0
    - FIRE: 1 if fire occurred, else 0

In [8]:
def create_crash_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add binary flags describing crash characteristics.

    Features created (each only when its source column is present):
        INTERSECTION: 1 if RELJCT2 is one of 2-8, else 0.
        WORK_ZONE_CRASH: 1 if WRK_ZONE is one of 1-4, else 0.
        ROLLOVER_CRASH: 1 if ROLLOVER is one of the rollover codes, else 0.
        FIRE: 1 if FIRE_EXP is 1 or 2, else 0.

    Args:
        df: Input frame.

    Returns:
        The frame with the crash characteristic feature columns added.
    """
    # NOTE(review): FARS ROLLOVER coding as recalled from the Analytical
    # User's Manual: 0 = no rollover, 1 = tripped, 2 = untripped,
    # 9 = rollover of unknown type. The previous test (ROLLOVER == 3) matches
    # none of these and would leave the flag at 0 for every row. Confirm the
    # codes against the manual for the data year(s) in use.
    rollover_codes = [1, 2, 9]

    def binary_flag(condition: pl.Expr, name: str) -> pl.Expr:
        # 1 where the condition is true, 0 where it is false or null.
        return pl.when(condition).then(1).otherwise(0).alias(name)

    flags = []

    # Intersection
    if "RELJCT2" in df.columns:
        flags.append(
            binary_flag(
                pl.col("RELJCT2").is_in([2, 3, 4, 5, 6, 7, 8]),
                "INTERSECTION",
            )
        )

    # Work Zone
    if "WRK_ZONE" in df.columns:
        flags.append(
            binary_flag(pl.col("WRK_ZONE").is_in([1, 2, 3, 4]), "WORK_ZONE_CRASH")
        )

    # Rollover
    if "ROLLOVER" in df.columns:
        flags.append(
            binary_flag(pl.col("ROLLOVER").is_in(rollover_codes), "ROLLOVER_CRASH")
        )

    # Fire (only 1 is needed in more recent versions, but the 2 can be kept)
    if "FIRE_EXP" in df.columns:
        flags.append(binary_flag(pl.col("FIRE_EXP").is_in([1, 2]), "FIRE"))

    if flags:
        df = df.with_columns(flags)

    print(f"Created {len(flags)} crash characteristic features")
    return df

##### Outcome Features
- Features created:
    - FATAL: 1 if fatal injury (INJ_SEV=4), else 0
    - SERIOUS_INJURY: 1 if serious injury (INJ_SEV=3), else 0
    - ANY_INJURY: 1 if any injury (INJ_SEV in 2,3,4), else 0

In [9]:
def create_outcome_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add injury-outcome flags derived from INJ_SEV.

    Features created (only when INJ_SEV is present):
        FATAL: 1 if INJ_SEV == 4, else 0.
        SERIOUS_INJURY: 1 if INJ_SEV == 3, else 0.
        ANY_INJURY: 1 if INJ_SEV is 2, 3 or 4, else 0.

    Args:
        df: Input frame.

    Returns:
        The frame with the outcome feature columns added.
    """
    outcome_flags = []

    if "INJ_SEV" in df.columns:
        severity = pl.col("INJ_SEV")
        # Feature name -> condition, in output order.
        conditions = {
            "FATAL": severity == 4,
            "SERIOUS_INJURY": severity == 3,
            # Any Injury (just consider the confirmed ones)
            "ANY_INJURY": severity.is_in([2, 3, 4]),
        }
        outcome_flags = [
            pl.when(condition).then(1).otherwise(0).alias(feature)
            for feature, condition in conditions.items()
        ]
        df = df.with_columns(outcome_flags)

    print(f"Created {len(outcome_flags)} outcome features")
    return df

### Pipeline

Run the complete feature engineering step

In [12]:
def run_feature_engineering_pipeline(
    df: pl.DataFrame,
    output_file: Optional[Path] = None,
    save_csv: bool = False,
) -> pl.DataFrame:
    """Run every feature-engineering step in order and optionally save results.

    The steps run in a fixed order (temporal, demographic, risk factor,
    environmental, vehicle, geographic, crash, outcome). The order matters:
    the risk-factor step counts YOUNG_DRIVER, which the demographic step
    creates.

    Args:
        df: Input frame.
        output_file: Destination parquet path. When given, its parent
            directory is created if needed, the engineered frame is written
            there, and a ``step2_feature_list.txt`` listing the new columns
            is written next to it. When ``None`` nothing is saved.
        save_csv: When True (and ``output_file`` is given), also write a CSV
            copy next to the parquet file. Off by default because the CSV is
            extremely heavy.

    Returns:
        The frame with all engineered features added.
    """
    # Store original columns to track new features (set for fast lookup).
    original_columns = list(df.columns)
    known_columns = set(original_columns)

    # Section title printed before each step, paired with the step function.
    steps = (
        ("TEMPORAL FEATURES", create_temporal_features),
        ("DEMOGRAPHIC FEATURES", create_demographic_features),
        ("RISK FACTOR FEATURES", create_risk_factor_features),
        ("ENVIRONMENTAL FEATURES", create_environmental_features),
        ("VEHICLE FEATURES", create_vehicle_features),
        ("GEOGRAPHIC FEATURES", create_geographic_features),
        ("CRASH CHARACTERISTIC FEATURES", create_crash_features),
        ("OUTCOME FEATURES", create_outcome_features),
    )
    for title, step in steps:
        print(f"\n{title}")
        df = step(df)

    # Calculate new features created
    new_features = [col for col in df.columns if col not in known_columns]

    # Save outputs
    if output_file is not None:
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Save parquet
        df.write_parquet(output_path)
        print(f"Saved parquet to: {output_path}")

        # Save csv (optional, just for check) !!! EXTREMELY HEAVY
        if save_csv:
            csv_path = output_path.with_suffix(".csv")
            df.write_csv(csv_path)
            print(f"Saved CSV to: {csv_path}")

        # Save feature list (optional, but useful). The encoding is explicit
        # so the file does not depend on the platform default.
        feature_list_path = output_path.parent / "step2_feature_list.txt"
        with open(feature_list_path, "w", encoding="utf-8") as f:
            f.write("Engineered Features\n")
            f.write("=" * 60 + "\n\n")
            f.write(f"Total new features: {len(new_features)}\n\n")
            f.writelines(f"  - {feat}\n" for feat in new_features)
        print(f"Saved feature list to: {feature_list_path}")

    # Print summary
    print(f"Final shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"Original features: {len(original_columns)}")
    print(f"New features: {len(new_features)}")
    print(f"Total features: {len(df.columns)}")

    return df

### Use

* Replace the desired paths

In [13]:
# Configuration: where Step 1 left its output and where Step 2 should save.
INPUT_FILE = Path("data/step1_preprocessed_final/person_level_integrated.parquet")
OUTPUT_FILE = Path("data/step2_engineered_final/person_level_engineered.parquet")

if INPUT_FILE.exists():
    # Load the Step 1 output and report its size.
    df = pl.read_parquet(INPUT_FILE)
    print(f"Loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")

    # Run the complete pipeline and keep the engineered frame for inspection.
    df_engineered = run_feature_engineering_pipeline(df, OUTPUT_FILE)
else:
    # Nothing to process until the previous step has produced its parquet.
    print(f"\nInput file not found: {INPUT_FILE}")
    print("Run Step 1 first")

Loaded: 92,400 rows × 337 columns

TEMPORAL FEATURES
Created 6 temporal features

DEMOGRAPHIC FEATURES
Created 4 demographic features

RISK FACTOR FEATURES
Created 5 risk factor features

ENVIRONMENTAL FEATURES
Created 2 environmental features

VEHICLE FEATURES
Created 5 vehicle features

GEOGRAPHIC FEATURES
Created 3 geographic features

CRASH CHARACTERISTIC FEATURES
Created 4 crash characteristic features

OUTCOME FEATURES
Created 3 outcome features
Saved parquet to: data\step2_engineered_final\person_level_engineered.parquet
Saved feature list to: data\step2_engineered_final\step2_feature_list.txt
Final shape: 92,400 rows × 369 columns
Original features: 337
New features: 32
Total features: 369
