In [1]:
# COMPLETE PREPROCESSING IN ONE CELL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')

print("=" * 70, "VEHICLE CRASH SEVERITY - COMPLETE PREPROCESSING PIPELINE", "=" * 70, sep="\n")

# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
print("\n[1/8] Loading dataset...")
file_path = '../data/raw/US_Accidents_March23.csv'  # Update if different

# The on-disk size decides whether the whole CSV is read or only a prefix.
import os
file_size_gb = os.path.getsize(file_path) / (1024**3)
print(f"File size: {file_size_gb:.2f} GB")

# Anything above 2 GB is capped at 500,000 rows. Note that `nrows` keeps the
# FIRST rows of the file - it is a head-of-file cut, not a random draw.
if file_size_gb <= 2:
    df = pd.read_csv(file_path)
else:
    print("Large file - sampling 500K rows...")
    df = pd.read_csv(file_path, nrows=500000)

print(f"Loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

# ============================================================================
# STEP 2: CREATE BINARY TARGET
# ============================================================================
print("\n[2/8] Creating binary severity target...")

# Rows without a Severity cannot be labelled. A per-row `0 if x <= 2 else 1`
# evaluates `NaN <= 2` as False and would tag such rows as class 1 (high),
# which also means a later dropna on 'Severity_Binary' could never remove
# them. Drop them here so the target only reflects an observed severity.
missing_severity = df['Severity'].isna()
if missing_severity.any():
    print(f"Dropping {int(missing_severity.sum()):,} rows with missing Severity")
    df = df.loc[~missing_severity].copy()

# Severity <= 2 -> 0 (low); anything above -> 1 (high). The comparison is
# vectorized rather than applied row by row.
df['Severity_Binary'] = (df['Severity'] > 2).astype('int64')
print(f"Target distribution:\n{df['Severity_Binary'].value_counts()}")

# ============================================================================
# STEP 3: SELECT RELEVANT FEATURES
# ============================================================================
print("\n[3/8] Selecting relevant features...")

# Requested columns, in the order they should appear in the trimmed frame.
features_to_keep = [
    'Severity_Binary',  # Target
    # Continuous weather measurements
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Precipitation(in)',
    # Weather label and crash timestamp
    'Weather_Condition',
    'Start_Time',
    # Road-layout flags
    'Junction',
    'Traffic_Signal',
    'Stop',
    'Crossing',
    # Daylight indicators
    'Sunrise_Sunset',
    'Civil_Twilight',
]

# A requested column that the loaded frame does not have is skipped rather
# than raising; the printed counts show how many were found.
present_columns = set(df.columns)
available = [name for name in features_to_keep if name in present_columns]
print(f"Selected {len(available)} features from {len(features_to_keep)} requested")

# Copy so later column assignments act on an independent frame.
df = df.loc[:, available].copy()
print(f"New shape: {df.shape}")

# ============================================================================
# STEP 4: HANDLE MISSING VALUES
# ============================================================================
print("\n[4/8] Handling missing values...")
print(f"Missing values before: {df.isnull().sum().sum():,}")

# Drop rows where target is missing
df = df.dropna(subset=['Severity_Binary'])
print(f"Rows after dropping missing target: {len(df):,}")

# Fill numeric columns with median
numeric_cols = ['Temperature(F)', 'Humidity(%)', 'Pressure(in)',
                'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']

for col in numeric_cols:
    if col in df.columns and df[col].isnull().any():
        median_val = df[col].median()
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`.
        # The chained in-place form acts on an intermediate object: pandas 2.x
        # flags it with a FutureWarning and under Copy-on-Write it never
        # updates `df`, leaving the NaNs in place.
        df[col] = df[col].fillna(median_val)

# Fill categorical with mode
cat_cols = ['Weather_Condition', 'Sunrise_Sunset', 'Civil_Twilight',
            'Junction', 'Traffic_Signal', 'Stop', 'Crossing']

for col in cat_cols:
    if col in df.columns and df[col].isnull().any():
        # mode() is empty only when the column holds no values at all; use a
        # placeholder label in that case.
        modes = df[col].mode()
        mode_val = modes.iloc[0] if not modes.empty else 'Unknown'
        df[col] = df[col].fillna(mode_val)

print(f"Missing values after imputation: {df.isnull().sum().sum()}")

# ============================================================================
# STEP 5: ENCODE CATEGORICAL VARIABLES
# ============================================================================
print("\n[5/8] Encoding categorical variables...")

# Binary encoding: Day -> 1, Night -> 0. Any other value maps to NaN and is
# then counted as 0 by the fillna.
binary_maps = {
    'Sunrise_Sunset': {'Day': 1, 'Night': 0},
    'Civil_Twilight': {'Day': 1, 'Night': 0}
}

for col, mapping in binary_maps.items():
    if col in df.columns:
        df[col] = df[col].map(mapping).fillna(0).astype(int)

# Boolean columns: accept real booleans as well as their string spellings;
# anything unrecognised becomes 0.
bool_cols = ['Junction', 'Traffic_Signal', 'Stop', 'Crossing']
bool_to_int = {True: 1, False: 0, 'True': 1, 'False': 0}
for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].map(bool_to_int).fillna(0).astype(int)

# Weather - keep top 10, make rest "Other", then one-hot encode
if 'Weather_Condition' in df.columns:
    top_weather = df['Weather_Condition'].value_counts().head(10).index.tolist()
    # Vectorized replacement of every label outside the top 10.
    df['Weather_Condition'] = df['Weather_Condition'].where(
        df['Weather_Condition'].isin(top_weather), 'Other'
    )
    # dtype=int: pandas >= 2.0 would otherwise emit bool indicator columns,
    # so the saved CSV would mix True/False with the 0/1 flags built above.
    df = pd.get_dummies(
        df,
        columns=['Weather_Condition'],
        prefix='Weather',
        drop_first=True,
        dtype=int,
    )
    print(f"Weather encoded. Shape now: {df.shape}")

# ============================================================================
# STEP 6: ENGINEER TIME FEATURES
# ============================================================================
print("\n[6/8] Engineering time features...")

if 'Start_Time' in df.columns:
    raw_start = df['Start_Time']
    start = pd.to_datetime(raw_start, errors='coerce')

    # pandas >= 2.0 infers a single format from the first value and, with
    # errors='coerce', turns every string in a different format into NaT.
    # Those rows would then vanish in the dropna below. Give the failures one
    # more pass on their own before treating them as unparseable.
    unparsed = start.isna() & raw_start.notna()
    if unparsed.any():
        retried = pd.to_datetime(raw_start[unparsed], errors='coerce')
        # Positional assignment: the mask selects the same rows, in the same
        # order, that `retried` was built from.
        start.loc[unparsed] = retried.to_numpy()
        still_unparsed = int((start.isna() & raw_start.notna()).sum())
        if still_unparsed:
            print(f"Warning: {still_unparsed:,} Start_Time values could not be parsed")

    df['Hour'] = start.dt.hour
    df['DayOfWeek'] = start.dt.dayofweek
    df['Month'] = start.dt.month
    # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

    # Hour categories
    def hour_cat(h):
        """Bucket an hour of day into a coarse period code.

        Returns 0 for a missing hour, 1 for 06-11, 2 for 12-17,
        3 for 18-21 and 4 for every remaining hour (22-05).
        """
        if pd.isna(h):
            return 0
        if 6 <= h < 12:
            return 1
        if 12 <= h < 18:
            return 2
        if 18 <= h < 22:
            return 3
        return 4

    df['Hour_Category'] = df['Hour'].apply(hour_cat)
    df = df.drop('Start_Time', axis=1)
    print("Time features created: Hour, DayOfWeek, Month, IsWeekend, Hour_Category")

# Drop any remaining NaN rows
rows_before_dropna = len(df)
df = df.dropna()
print(f"Rows after dropping all NaN: {len(df):,}")
# Report the loss so that discarded rows are visible in the log.
if len(df) != rows_before_dropna:
    print(f"  ({rows_before_dropna - len(df):,} rows removed for remaining NaN values)")

# ============================================================================
# STEP 7: BALANCE CLASSES
# ============================================================================
print("\n[7/8] Balancing classes...")

# Partition the rows by target label.
target = df['Severity_Binary']
df_0 = df.loc[target == 0]
df_1 = df.loc[target == 1]

print("Before balancing:")
print(f"  Low (0): {len(df_0):,}")
print(f"  High (1): {len(df_1):,}")

if len(df) <= 10000:
    # Small frames are passed through untouched.
    print("⚠️ Not enough data to balance")
    df_balanced = df
else:
    # Downsample the larger class, without replacement, to the size of the
    # smaller one. Class-0 rows always precede class-1 rows in the concat so
    # the seeded shuffle below starts from the same row order either way.
    if len(df_0) > len(df_1):
        pieces = [
            resample(df_0, replace=False, n_samples=len(df_1), random_state=42),
            df_1,
        ]
    else:
        pieces = [
            df_0,
            resample(df_1, replace=False, n_samples=len(df_0), random_state=42),
        ]

    # Seeded shuffle, then a fresh 0..n-1 index.
    df_balanced = (
        pd.concat(pieces)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    print("\nAfter balancing:")
    print(df_balanced['Severity_Binary'].value_counts())

# ============================================================================
# STEP 8: FINAL VALIDATION & SAVE
# ============================================================================
print("\n[8/8] Final validation...")

print("\n" + "="*70)
print("FINAL PREPROCESSED DATASET")
print("="*70)
print(f"Shape: {df_balanced.shape}")
print(f"Rows: {df_balanced.shape[0]:,}")
print(f"Features: {df_balanced.shape[1]}")
print(f"Missing values: {df_balanced.isnull().sum().sum()}")

print(f"\nTarget distribution:")
print(df_balanced['Severity_Binary'].value_counts())

# Verify all numeric: one line per dtype with the number of columns using it.
print(f"\nData types:")
print(df_balanced.dtypes.value_counts())

# Show features (every column except the target)
features = [col for col in df_balanced.columns if col != 'Severity_Binary']
print(f"\nFeatures ({len(features)} total):")
for i, col in enumerate(features, 1):
    print(f"  {i:2d}. {col}")

# Save
output_file = '../data/processed/clean_crash_data.csv'
# Create the target directory first: to_csv raises if it does not exist,
# which would throw away the whole run at the very last step.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_balanced.to_csv(output_file, index=False)

print("\n" + "="*70)
print("✅ PREPROCESSING COMPLETE!")
print("="*70)
print(f"Saved: {output_file}")
print("Ready for modeling!")

VEHICLE CRASH SEVERITY - COMPLETE PREPROCESSING PIPELINE

[1/8] Loading dataset...
File size: 2.85 GB
Large file - sampling 500K rows...
Loaded: 500,000 rows, 46 columns

[2/8] Creating binary severity target...
Target distribution:
Severity_Binary
0    312547
1    187453
Name: count, dtype: int64

[3/8] Selecting relevant features...
Selected 15 features from 15 requested
New shape: (500000, 15)

[4/8] Handling missing values...
Missing values before: 578,766
Rows after dropping missing target: 500,000
Missing values after imputation: 0

[5/8] Encoding categorical variables...
Weather encoded. Shape now: (500000, 24)

[6/8] Engineering time features...
Time features created: Hour, DayOfWeek, Month, IsWeekend, Hour_Category
Rows after dropping all NaN: 500,000

[7/8] Balancing classes...
Before balancing:
  Low (0): 312,547
  High (1): 187,453

After balancing:
Severity_Binary
0    187453
1    187453
Name: count, dtype: int64

[8/8] Final validation...

FINAL PREPROCESSED DATASET
Shape