# IEOR 242 Final Project
# GNN Model

---

## Load Packages

In [None]:
import os

import pandas as pd
import numpy as np

import category_encoders as ce

from sklearn.preprocessing import label_binarize, StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    accuracy_score
)

from sklearn.cluster import KMeans, DBSCAN

import torch
import torch.nn.functional as F
from torch.nn.functional import softmax
from torch_geometric.data import Data
from torch_geometric.utils import mask_select
from torch_geometric.nn import GCNConv, GATConv

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

---

## Load Data

In [None]:
# Folder holding the collision CSVs. Set the COLLISION_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original author's local path is used, so existing behavior is unchanged.
directory = os.environ.get(
    "COLLISION_DATA_DIR",
    "/Users/yunxianghan/Desktop/jw/study/berkeley/ieor_242/final_project/data",
)
train_file_name = "collision_train_data.csv"
test_file_name = "collision_test_data.csv"

train_path = os.path.join(directory, train_file_name)
test_path = os.path.join(directory, test_file_name)

# Read the two splits into separate frames.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,ACRS Report Type,Route Type,Road Name,Cross-Street Name,Weather,Surface Condition,Light,Driver Substance Abuse,Speed Limit,Vehicle Year,...,Driver At Fault,Vehicle Body Type,Vehicle Make,Vehicle Movement,Day of Week,Weekend,Time,Month,Year,Date in Month
0,Property Damage Crash,Maryland (State),GEORGIA AVE,ARCOLA AVE,CLEAR,DRY,DAWN,NONE DETECTED,-0.015398,2018,...,No,"CARGO VAN/LIGHT TRUCK 2 AXLES (OVER 10,000LBS ...",FREIGHTLINER,STOPPED IN TRAFFIC LANE,1,0,7,0,2020,21
1,Injury Crash,Maryland (State),WOODFIELD RD,WARFIELD RD,CLOUDY,DRY,DAYLIGHT,NONE DETECTED,-0.672483,2017,...,Yes,PASSENGER CAR,CHEV,MAKING LEFT TURN,4,0,13,11,2019,6
2,Property Damage Crash,Maryland (State),NORBECK RD,E GUDE DR,RAINING,WET,DAYLIGHT,NONE DETECTED,1.955855,2000,...,No,PASSENGER CAR,TOYOTA,MAKING LEFT TURN,2,0,15,2,2020,25
3,Injury Crash,County,SHADY GROVE RD,CRABBS BRANCH WAY,CLEAR,DRY,DARK LIGHTS ON,NONE DETECTED,0.641686,2015,...,No,"CARGO VAN/LIGHT TRUCK 2 AXLES (OVER 10,000LBS ...",INTL,MOVING CONSTANT SPEED,0,0,19,0,2017,9
4,Injury Crash,Maryland (State),GEORGIA AVE,ENT TO SHOPPING CENTER,RAINING,WET,DAYLIGHT,NONE DETECTED,-0.015398,2017,...,No,(SPORT) UTILITY VEHICLE,CHEVROLET,MOVING CONSTANT SPEED,3,0,11,3,2020,9


#### Check length

In [None]:
# Report how many rows each split contains.
print(f'Length of train df:{train_df.shape[0]}')
print(f'Length of test df:{test_df.shape[0]}')

Length of train df:74856
Length of test df:32082


#### Get full df to build the graph

In [None]:
# Stack the train rows followed by the test rows under a fresh 0..n-1 index.
df = pd.concat((train_df, test_df), axis=0, ignore_index=True)

In [None]:
# Summarize the combined frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106938 entries, 0 to 106937
Data columns (total 24 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ACRS Report Type        106938 non-null  object 
 1   Route Type              106938 non-null  object 
 2   Road Name               106938 non-null  object 
 3   Cross-Street Name       106938 non-null  object 
 4   Weather                 106938 non-null  object 
 5   Surface Condition       106938 non-null  object 
 6   Light                   106938 non-null  object 
 7   Driver Substance Abuse  106938 non-null  object 
 8   Speed Limit             106938 non-null  float64
 9   Vehicle Year            106938 non-null  int64  
 10  Latitude                106938 non-null  float64
 11  Longitude               106938 non-null  float64
 12  Traffic Control         106938 non-null  object 
 13  Collision Type          106938 non-null  object 
 14  Driver At Fault     

---

## Step I: Preprocess

#### Categorical Features

***'Vehicle Body Type'***

In [None]:
'''
Several levels are duplicated because the same value appears in both
upper case and title case (e.g. 'PASSENGER CAR' and 'Passenger Car').
'''
# Raw 'Vehicle Body Type' level frequencies, before consolidation.
df['Vehicle Body Type'].value_counts()

PASSENGER CAR                                                71947
(SPORT) UTILITY VEHICLE                                      10343
PICKUP TRUCK                                                  4104
Passenger Car                                                 3421
VAN                                                           3136
TRANSIT BUS                                                   2245
SCHOOL BUS                                                    1580
OTHER LIGHT TRUCKS (10,000LBS (4,536KG) OR LESS)              1114
CARGO VAN/LIGHT TRUCK 2 AXLES (OVER 10,000LBS (4,536 KG))     1058
MEDIUM/HEAVY TRUCKS 3 AXLES (OVER 10,000LBS (4,536KG))         908
POLICE VEHICLE/NON EMERGENCY                                   881
Sport Utility Vehicle                                          828
OTHER                                                          801
POLICE VEHICLE/EMERGENCY                                       670
MOTORCYCLE                                                    

In [None]:
# Collapse the raw 'Vehicle Body Type' levels (which mix upper-case and
# title-case spellings of the same value) into a small set of groups.
vehicle_type_mapping = {
    'PASSENGER CAR': 'Passenger Cars',
    'Passenger Car': 'Passenger Cars',
    'STATION WAGON': 'Passenger Cars',
    'Station Wagon': 'Passenger Cars',
    '(SPORT) UTILITY VEHICLE': 'SUVs',
    'Sport Utility Vehicle': 'SUVs',
    'PICKUP TRUCK': 'Pickup Trucks',
    'Pickup': 'Pickup Trucks',
    'VAN': 'Vans',
    # The key below was written with an HTML entity ('&lt;'); it is kept for
    # backward compatibility, and the literal '<' spelling is added so the
    # level is matched whichever way it is spelled in the data.
    'Van - Passenger (&lt;9 Seats)': 'Vans',
    'Van - Passenger (<9 Seats)': 'Vans',
    'Van - Cargo': 'Vans',
    'TRANSIT BUS': 'Buses',
    'SCHOOL BUS': 'Buses',
    'Bus - Transit': 'Buses',
    'Bus - School': 'Buses',
    'OTHER BUS': 'Buses',
    'Bus - Other Type': 'Buses',
    'CROSS COUNTRY BUS': 'Buses',
    'MOTORCYCLE': 'Motorcycles and Mopeds',
    'Motorcycle - 2 Wheeled': 'Motorcycles and Mopeds',
    'MOPED': 'Motorcycles and Mopeds',
    'Moped Or motorized bicycle': 'Motorcycles and Mopeds',
    'OTHER LIGHT TRUCKS (10,000LBS (4,536KG) OR LESS)': 'Trucks',
    'CARGO VAN/LIGHT TRUCK 2 AXLES (OVER 10,000LBS (4,536 KG))': 'Trucks',
    'MEDIUM/HEAVY TRUCKS 3 AXLES (OVER 10,000LBS (4,536KG))': 'Trucks',
    'TRUCK TRACTOR': 'Trucks',
    'Truck Tractor': 'Trucks',
    'Single-Unit Truck': 'Trucks',
    'Other Trucks': 'Trucks',
    'POLICE VEHICLE/NON EMERGENCY': 'Emergency Vehicles',
    'POLICE VEHICLE/EMERGENCY': 'Emergency Vehicles',
    'AMBULANCE/EMERGENCY': 'Emergency Vehicles',
    'AMBULANCE/NON EMERGENCY': 'Emergency Vehicles',
    'FIRE VEHICLE/EMERGENCY': 'Emergency Vehicles',
    'FIRE VEHICLE/NON EMERGENCY': 'Emergency Vehicles',
    'RECREATIONAL VEHICLE': 'Recreational Vehicles',
    'SNOWMOBILE': 'Recreational Vehicles',
    'Snowmobile': 'Recreational Vehicles',
    'Recreational Off-Highway Vehicles (ROV)': 'Recreational Vehicles',
    'ALL TERRAIN VEHICLE (ATV)': 'Recreational Vehicles',
    'All-Terrain Vehicle/All-Terrain Cycle (ATV/ATC)': 'Recreational Vehicles',
    'FARM VEHICLE': 'Specialty Vehicles',
    'AUTOCYCLE': 'Specialty Vehicles',
    'LOW SPEED VEHICLE': 'Specialty Vehicles',
    'LIMOUSINE': 'Specialty Vehicles',
    'Construction Equipment (backhoe, bulldozer, etc.)': 'Specialty Vehicles',
    'UNKNOWN': 'Unknown',
    'Unknown': 'Unknown',
    'OTHER': 'Other',
    'Other': 'Other',
}

# Levels that are not keys of the mapping come back from .map() as NaN;
# fillna sends them to the 'Other' group instead.
df['Vehicle Body Type'] = df['Vehicle Body Type'].map(vehicle_type_mapping).fillna('Other')

df['Vehicle Body Type'].value_counts()

Passenger Cars            75865
SUVs                      11171
Pickup Trucks              4362
Buses                      4242
Trucks                     3475
Vans                       3284
Emergency Vehicles         2165
Other                       834
Motorcycles and Mopeds      649
Unknown                     563
Recreational Vehicles       271
Specialty Vehicles           57
Name: Vehicle Body Type, dtype: int64

***'Route Type'***

In [None]:
# Raw 'Route Type' level frequencies, before consolidation.
df['Route Type'].value_counts()

Maryland (State)          51531
County                    36333
Municipality               5930
US (State)                 4865
County Route               2367
Interstate (State)         1873
Maryland (State) Route     1605
Other Public Roadway        831
Ramp                        528
Municipality Route          516
Government                  338
Local Route                 143
Government Route             33
Service Road                 28
Unknown                       8
Crossover                     6
Private Route                 3
Name: Route Type, dtype: int64

In [None]:
# Raw 'Route Type' levels, listed under the consolidated label each collapses to.
_route_groups = {
    'State Roads': [
        'Maryland (State)',
        'US (State)',
        'Interstate (State)',
        'Maryland (State) Route',
    ],
    'County Roads': ['County', 'County Route'],
    'Municipality Roads': ['Municipality', 'Municipality Route'],
    'Other Public Roadways': [
        'Other Public Roadway',
        'Local Route',
        'Ramp',
        'Service Road',
        'Crossover',
    ],
    'Government Roads': ['Government', 'Government Route'],
    'Private Roads': ['Private Route'],
    'Unknown': ['Unknown'],
}

# Invert the grouping into the flat raw-level -> label lookup used by .map().
location_type_mapping = {
    raw_level: label
    for label, raw_levels in _route_groups.items()
    for raw_level in raw_levels
}

route_type_grouped = df['Route Type'].map(location_type_mapping)
df['Route Type'] = route_type_grouped
df['Route Type'].value_counts()

State Roads              59874
County Roads             38700
Municipality Roads        6446
Other Public Roadways     1536
Government Roads           371
Unknown                      8
Private Roads                3
Name: Route Type, dtype: int64

***'Weather'***

In [None]:
# Raw 'Weather' level frequencies, before consolidation.
df['Weather'].value_counts()

CLEAR                                74144
RAINING                              13737
CLOUDY                               11740
Clear                                 3919
SNOW                                   882
Rain                                   598
Cloudy                                 467
FOGGY                                  455
WINTRY MIX                             233
UNKNOWN                                207
OTHER                                  193
SLEET                                  127
SEVERE WINDS                           100
BLOWING SNOW                            71
Snow                                    23
Blowing Snow                            16
Freezing Rain Or Freezing Drizzle        7
Fog, Smog, Smoke                         7
Severe Crosswinds                        6
BLOWING SAND, SOIL, DIRT                 5
Unknown                                  1
Name: Weather, dtype: int64

In [None]:
# Raw 'Weather' levels, listed under the consolidated label each collapses to.
_weather_groups = {
    'Clear': ['CLEAR', 'Clear'],
    'Rain': ['RAINING', 'Rain', 'Freezing Rain Or Freezing Drizzle'],
    'Cloudy': ['CLOUDY', 'Cloudy'],
    'Snow': ['SNOW', 'Snow', 'BLOWING SNOW', 'Blowing Snow'],
    'Foggy': ['FOGGY', 'Fog, Smog, Smoke'],
    'Wintry Mix': ['WINTRY MIX', 'SLEET'],
    'Severe Winds': ['SEVERE WINDS', 'Severe Crosswinds', 'BLOWING SAND, SOIL, DIRT'],
    'Unknown': ['UNKNOWN', 'Unknown'],
    'Other': ['OTHER'],
}

# Invert the grouping into the flat raw-level -> label lookup used by .map().
weather_condition_mapping = {
    raw_level: label
    for label, raw_levels in _weather_groups.items()
    for raw_level in raw_levels
}

weather_grouped = df['Weather'].map(weather_condition_mapping)
df['Weather'] = weather_grouped
df['Weather'].value_counts()

Clear           78063
Rain            14342
Cloudy          12207
Snow              992
Foggy             462
Wintry Mix        360
Unknown           208
Other             193
Severe Winds      111
Name: Weather, dtype: int64

***'Surface Condition'***

In [None]:
# Raw 'Surface Condition' level frequencies, before consolidation.
df['Surface Condition'].value_counts()

DRY                         80332
WET                         19832
Dry                          4210
Wet                           781
ICE                           615
SNOW                          596
UNKNOWN                       216
SLUSH                         122
OTHER                         100
MUD, DIRT, GRAVEL              30
WATER(STANDING/MOVING)         26
OIL                            21
Snow                           19
Ice/Frost                      12
Slush                          12
Other                           8
SAND                            4
Water (standing, moving)        2
Name: Surface Condition, dtype: int64

In [None]:
# Raw 'Surface Condition' levels, listed under the consolidated label each collapses to.
_surface_groups = {
    'Dry': ['DRY', 'Dry'],
    'Wet': ['WET', 'Wet'],
    'Snow': ['SNOW', 'Snow'],
    'Ice': ['ICE', 'Ice/Frost'],
    'Slush': ['SLUSH', 'Slush'],
    'Loose Material': ['MUD, DIRT, GRAVEL', 'SAND'],
    'Water': ['WATER(STANDING/MOVING)', 'Water (standing, moving)'],
    'Oil': ['OIL'],
    'Unknown': ['UNKNOWN'],
    'Other': ['OTHER', 'Other'],
}

# Invert the grouping into the flat raw-level -> label lookup used by .map().
surface_condition_mapping = {
    raw_level: label
    for label, raw_levels in _surface_groups.items()
    for raw_level in raw_levels
}

surface_grouped = df['Surface Condition'].map(surface_condition_mapping)
df['Surface Condition'] = surface_grouped
df['Surface Condition'].value_counts()

Dry               84542
Wet               20613
Ice                 627
Snow                615
Unknown             216
Slush               134
Other               108
Loose Material       34
Water                28
Oil                  21
Name: Surface Condition, dtype: int64

***'Light'***

In [None]:
# Raw 'Light' level frequencies, before consolidation.
df['Light'].value_counts()

DAYLIGHT                    70454
DARK LIGHTS ON              23116
Daylight                     3596
DARK NO LIGHTS               2913
DUSK                         2253
DAWN                         2081
Dark - Lighted               1175
DARK -- UNKNOWN LIGHTING      791
OTHER                         147
UNKNOWN                       139
Dark - Not Lighted            109
Dusk                           69
Dawn                           60
Dark - Unknown Lighting        19
Other                          14
Unknown                         2
Name: Light, dtype: int64

In [None]:
# Each consolidated 'Light' label absorbs an upper-case and a title-case
# spelling of the same raw level; build the lookup one label at a time.
light_mapping = {
    **dict.fromkeys(('DAYLIGHT', 'Daylight'), 'Daylight'),
    **dict.fromkeys(('DARK LIGHTS ON', 'Dark - Lighted'), 'Dark - Lighted'),
    **dict.fromkeys(('DARK NO LIGHTS', 'Dark - Not Lighted'), 'Dark - Not Lighted'),
    **dict.fromkeys(
        ('DARK -- UNKNOWN LIGHTING', 'Dark - Unknown Lighting'),
        'Dark - Unknown Lighting',
    ),
    **dict.fromkeys(('DUSK', 'Dusk'), 'Dusk'),
    **dict.fromkeys(('DAWN', 'Dawn'), 'Dawn'),
    **dict.fromkeys(('OTHER', 'Other'), 'Other'),
    **dict.fromkeys(('UNKNOWN', 'Unknown'), 'Unknown'),
}

light_grouped = df['Light'].map(light_mapping)
df['Light'] = light_grouped
df['Light'].value_counts()

Daylight                   74050
Dark - Lighted             24291
Dark - Not Lighted          3022
Dusk                        2322
Dawn                        2141
Dark - Unknown Lighting      810
Other                        161
Unknown                      141
Name: Light, dtype: int64

In [None]:
# Show the consolidated 'Light' level frequencies again (the preceding cell already ends with this call).
df['Light'].value_counts()

Daylight                   74050
Dark - Lighted             24291
Dark - Not Lighted          3022
Dusk                        2322
Dawn                        2141
Dark - Unknown Lighting      810
Other                        161
Unknown                      141
Name: Light, dtype: int64

***'Driver Substance Abuse'***

In [None]:
'''
Present = The drug was detected but its role or impact is not specified.
Contributed = The drug was involved in influencing or causing an event.
'''
# Raw 'Driver Substance Abuse' level frequencies, before consolidation.
df['Driver Substance Abuse'].value_counts()

NONE DETECTED                                          92454
UNKNOWN                                                 5375
Not Suspect of Alcohol Use, Not Suspect of Drug Use     4734
ALCOHOL PRESENT                                         2617
ALCOHOL CONTRIBUTED                                      977
Unknown, Unknown                                         186
ILLEGAL DRUG PRESENT                                     164
Suspect of Alcohol Use, Not Suspect of Drug Use           92
MEDICATION PRESENT                                        76
ILLEGAL DRUG CONTRIBUTED                                  75
COMBINED SUBSTANCE PRESENT                                57
MEDICATION CONTRIBUTED                                    41
COMBINATION CONTRIBUTED                                   30
OTHER                                                     28
Suspect of Alcohol Use, Unknown                            8
Not Suspect of Alcohol Use, Unknown                        7
Unknown, Not Suspect of 

In [None]:
# Collapse the raw 'Driver Substance Abuse' levels (legacy upper-case codes and
# the newer "<alcohol>, <drug>" style strings) into consolidated labels.
substance_abuse_mapping = {
    'NONE DETECTED': 'None Detected',
    'Not Suspect of Alcohol Use, Not Suspect of Drug Use': 'None Detected',
    'ALCOHOL PRESENT': 'Alcohol Present',
    'Suspect of Alcohol Use, Not Suspect of Drug Use': 'Alcohol Present',
    'Suspect of Alcohol Use, Unknown': 'Alcohol Present',
    'ALCOHOL CONTRIBUTED': 'Alcohol Contributed',
    'ILLEGAL DRUG PRESENT': 'Drug Present',
    'ILLEGAL DRUG CONTRIBUTED': 'Drug Contributed',
    'MEDICATION PRESENT': 'Medication Present',
    'MEDICATION CONTRIBUTED': 'Medication Contributed',
    'COMBINED SUBSTANCE PRESENT': 'Combined Substances Present',
    'COMBINATION CONTRIBUTED': 'Combined Substances Contributed',
    'UNKNOWN': 'Unknown',
    'Unknown, Unknown': 'Unknown',
    'Unknown, Not Suspect of Drug Use': 'Unknown',
    'Not Suspect of Alcohol Use, Unknown': 'Unknown',
    'OTHER': 'Other',
}

# .map() returns NaN for any level that is not a key above, and value_counts()
# then silently drops those rows. Route such levels to 'Other' instead, the
# same fallback the 'Vehicle Body Type' cell applies.
df['Driver Substance Abuse'] = (
    df['Driver Substance Abuse'].map(substance_abuse_mapping).fillna('Other')
)
df['Driver Substance Abuse'].value_counts()

None Detected                      97188
Unknown                             5574
Alcohol Present                     2717
Alcohol Contributed                  977
Drug Present                         164
Medication Present                    76
Drug Contributed                      75
Combined Substances Present           57
Medication Contributed                41
Combined Substances Contributed       30
Other                                 28
Name: Driver Substance Abuse, dtype: int64

***'Collision Type'***

In [None]:
# Raw 'Collision Type' level frequencies, before consolidation.
df['Collision Type'].value_counts()

SAME DIR REAR END                35917
STRAIGHT MOVEMENT ANGLE          20543
SAME DIRECTION SIDESWIPE          9444
HEAD ON LEFT TURN                 9066
SINGLE VEHICLE                    8017
OTHER                             6709
SAME DIRECTION RIGHT TURN         2266
SAME DIRECTION LEFT TURN          2236
HEAD ON                           2105
Angle                             1704
ANGLE MEETS LEFT TURN             1405
OPPOSITE DIRECTION SIDESWIPE      1365
Front to Rear                     1055
ANGLE MEETS RIGHT TURN             786
Sideswipe, Same Direction          481
Rear To Side                       465
SAME DIR REND RIGHT TURN           456
SAME DIR BOTH LEFT TURN            449
ANGLE MEETS LEFT HEAD ON           436
SAME DIR REND LEFT TURN            412
Other                              398
Front to Front                     394
Single Vehicle                     315
Sideswipe, Opposite Direction      189
OPPOSITE DIR BOTH LEFT TURN        178
UNKNOWN                  

In [None]:
# Collapse the raw 'Collision Type' levels into consolidated labels.
collision_mapping = {
    'SAME DIR REAR END': 'Same Direction Rear-End',
    'SAME DIR REND RIGHT TURN': 'Same Direction Rear-End',
    'SAME DIR REND LEFT TURN': 'Same Direction Rear-End',
    'STRAIGHT MOVEMENT ANGLE': 'Angle Collisions',
    'ANGLE MEETS LEFT TURN': 'Angle Collisions',
    'ANGLE MEETS RIGHT TURN': 'Angle Collisions',
    'Angle': 'Angle Collisions',
    'SAME DIRECTION SIDESWIPE': 'Same Direction Sideswipe',
    'Sideswipe, Same Direction': 'Same Direction Sideswipe',
    'HEAD ON LEFT TURN': 'Opposite Direction Collisions',
    'HEAD ON': 'Opposite Direction Collisions',
    'OPPOSITE DIRECTION SIDESWIPE': 'Opposite Direction Collisions',
    'Front to Front': 'Opposite Direction Collisions',
    'SINGLE VEHICLE': 'Single Vehicle',
    'Single Vehicle': 'Single Vehicle',
    'SAME DIRECTION RIGHT TURN': 'Turn Collisions',
    'SAME DIRECTION LEFT TURN': 'Turn Collisions',
    'SAME DIR BOTH LEFT TURN': 'Turn Collisions',
    'OPPOSITE DIR BOTH LEFT TURN': 'Turn Collisions',
    'OTHER': 'Other',
    'Other': 'Other',
    'UNKNOWN': 'Unknown',
    'Unknown': 'Unknown',
    'Rear To Rear': 'Other',  # unusual manner of collision; no dedicated group, kept under 'Other'
    # The four levels below appear in the raw value counts but were missing
    # from the mapping, so .map() turned their rows into NaN.
    'Front to Rear': 'Same Direction Rear-End',
    'ANGLE MEETS LEFT HEAD ON': 'Angle Collisions',  # grouped like the other 'ANGLE MEETS ...' levels
    'Sideswipe, Opposite Direction': 'Opposite Direction Collisions',  # same group as 'OPPOSITE DIRECTION SIDESWIPE'
    'Rear To Side': 'Other',  # treated like 'Rear To Rear'
}


# Any level still absent from the mapping falls back to 'Other' instead of NaN,
# the same fallback the 'Vehicle Body Type' cell applies.
df['Collision Type'] = df['Collision Type'].map(collision_mapping).fillna('Other')
df['Collision Type'].value_counts()

Same Direction Rear-End          36785
Angle Collisions                 24438
Opposite Direction Collisions    12930
Same Direction Sideswipe          9925
Single Vehicle                    8332
Other                             7128
Turn Collisions                   5129
Unknown                            126
Name: Collision Type, dtype: int64

***'Traffic Control'***

In [None]:
# Raw 'Traffic Control' level frequencies, before consolidation.
df['Traffic Control'].value_counts()

NO CONTROLS                                                                 44871
TRAFFIC SIGNAL                                                              43605
STOP SIGN                                                                    8868
Traffic Control Signal                                                       2852
FLASHING TRAFFIC SIGNAL                                                      1525
OTHER                                                                        1310
YIELD SIGN                                                                   1257
No Controls                                                                  1243
Stop Sign                                                                     598
PERSON                                                                        193
Flashing Traffic Control Signal                                               176
UNKNOWN                                                                       111
Lane Use Control

In [None]:
# Collapse the raw 'Traffic Control' levels into consolidated labels.
control_mapping = {
    'NO CONTROLS': 'No Controls',
    'No Controls': 'No Controls',
    'TRAFFIC SIGNAL': 'Traffic Signal',
    'Traffic Control Signal': 'Traffic Signal',
    'Flashing Traffic Control Signal': 'Traffic Signal',
    'FLASHING TRAFFIC SIGNAL': 'Traffic Signal',
    'Other Signal': 'Traffic Signal',
    'STOP SIGN': 'Stop Sign',
    'Stop Sign': 'Stop Sign',
    'YIELD SIGN': 'Yield Sign',
    'Yield Sign': 'Yield Sign',
    'PERSON': 'Pedestrian/Crossing Control',
    'Pedestrian Crossing Sign': 'Pedestrian/Crossing Control',
    'Pedestrian Crossing': 'Pedestrian/Crossing Control',
    'Person (including flagger, law enforcement, crossing guard, etc.)': 'Pedestrian/Crossing Control',
    'RAILWAY CROSSING DEVICE': 'Railway Crossing',
    'Flashing Railroad Crossing Signal (may include gates)': 'Railway Crossing',
    'WARNING SIGN': 'Warning Sign',
    'Intersection Ahead Warning Sign': 'Warning Sign',
    'Other Warning Sign': 'Warning Sign',
    'School Zone Sign': 'Warning Sign',
    'SCHOOL ZONE SIGN DEVICE': 'Warning Sign',
    'OTHER': 'Other',
    'Other': 'Other',
    'Other Pavement Marking (excluding edgelines, centerlines, or lane lines)': 'Other',
    'UNKNOWN': 'Unknown',
}

# .map() returns NaN for any level that is not a key above, and value_counts()
# then silently drops those rows. Route such levels to 'Other' instead, the
# same fallback the 'Vehicle Body Type' cell applies.
df['Traffic Control'] = df['Traffic Control'].map(control_mapping).fillna('Other')
df['Traffic Control'].value_counts()

Traffic Signal                 48164
No Controls                    46114
Stop Sign                       9466
Other                           1330
Yield Sign                      1288
Pedestrian/Crossing Control      245
Unknown                          111
Railway Crossing                  29
Name: Traffic Control, dtype: int64

***'Vehicle Make'***

In [None]:
'''
'Vehicle Make' has too many distinct levels (many are spelling variants of
the same make), so this feature is ignored.
'''
# Raw 'Vehicle Make' level frequencies.
df['Vehicle Make'].value_counts()

TOYOTA        15051
HONDA         12404
FORD          10330
NISSAN         5522
TOYT           5478
              ...  
MER-BENZ          1
BMV               1
HYUNDY            1
GILLLIG BU        1
VOLS              1
Name: Vehicle Make, Length: 1370, dtype: int64

***'Vehicle Movement'***

In [None]:
# Raw 'Vehicle Movement' level frequencies, before consolidation.
df['Vehicle Movement'].value_counts()

MOVING CONSTANT SPEED      41465
SLOWING OR STOPPING        15536
STOPPED IN TRAFFIC LANE    12091
MAKING LEFT TURN           11569
ACCELERATING                5353
MAKING RIGHT TURN           3339
STARTING FROM LANE          3116
CHANGING LANES              2942
Moving Constant Speed       2115
UNKNOWN                     1107
Turning Left                1078
BACKING                     1025
MAKING U TURN                791
ENTERING TRAFFIC LANE        512
PARKED                       500
PASSING                      492
STARTING FROM PARKED         480
SKIDDING                     479
Accelerating                 421
Slowing or Stopping          412
Stopped in Traffic           390
NEGOTIATING A CURVE          319
OTHER                        309
Turning Right                237
RIGHT TURN ON RED            188
PARKING                      144
LEAVING TRAFFIC LANE         126
Changing Lanes               114
Entering Traffic Lane        105
Making U-Turn                 53
Overtaking

In [None]:
# Collapse the raw 'Vehicle Movement' levels into consolidated labels.
movement_mapping = {
    'MOVING CONSTANT SPEED': 'Constant Speed',
    'Moving Constant Speed': 'Constant Speed',
    'SLOWING OR STOPPING': 'Slowing or Stopping',
    'Slowing or Stopping': 'Slowing or Stopping',
    'STOPPED IN TRAFFIC LANE': 'Slowing or Stopping',
    'Stopped in Traffic': 'Slowing or Stopping',
    'MAKING LEFT TURN': 'Making Left Turn',
    'Turning Left': 'Making Left Turn',
    'MAKING RIGHT TURN': 'Making Right Turn',
    'Turning Right': 'Making Right Turn',
    'RIGHT TURN ON RED': 'Making Right Turn',
    'MAKING U TURN': 'Making U-Turn',
    'Making U-Turn': 'Making U-Turn',
    'ACCELERATING': 'Accelerating',
    'Accelerating': 'Accelerating',
    'CHANGING LANES': 'Changing Lanes',
    'Changing Lanes': 'Changing Lanes',
    'ENTERING TRAFFIC LANE': 'Entering/Leaving Traffic Lane',
    'Entering Traffic Lane': 'Entering/Leaving Traffic Lane',
    'LEAVING TRAFFIC LANE': 'Entering/Leaving Traffic Lane',
    'Leaving Traffic Lane': 'Entering/Leaving Traffic Lane',
    'STARTING FROM LANE': 'Starting Vehicle',
    'STARTING FROM PARKED': 'Starting Vehicle',
    'PARKED': 'Parking or Backing',
    'PARKING': 'Parking or Backing',
    'BACKING': 'Parking or Backing',
    'Backing': 'Parking or Backing',
    'PASSING': 'Passing/Overtaking',
    'Overtaking/Passing': 'Passing/Overtaking',
    'SKIDDING': 'Skidding/Negotiating Curve',
    'NEGOTIATING A CURVE': 'Skidding/Negotiating Curve',
    'Negotiating a Curve': 'Skidding/Negotiating Curve',
    'DRIVERLESS MOVING VEH.': 'Driverless Vehicle',
    'UNKNOWN': 'Other/Unknown',
    'OTHER': 'Other/Unknown',
}

# .map() returns NaN for any level that is not a key above, and value_counts()
# then silently drops those rows. Route such levels to this column's own
# catch-all label instead.
df['Vehicle Movement'] = df['Vehicle Movement'].map(movement_mapping).fillna('Other/Unknown')
df['Vehicle Movement'].value_counts()

Constant Speed                   43580
Slowing or Stopping              28429
Making Left Turn                 12647
Accelerating                      5774
Making Right Turn                 3764
Starting Vehicle                  3596
Changing Lanes                    3056
Parking or Backing                1698
Other/Unknown                     1416
Making U-Turn                      844
Skidding/Negotiating Curve         816
Entering/Leaving Traffic Lane      752
Passing/Overtaking                 530
Driverless Vehicle                  11
Name: Vehicle Movement, dtype: int64

***'Driver At Fault'***

In [None]:
# Level frequencies of 'Driver At Fault'.
df['Driver At Fault'].value_counts()

Yes        54422
No         50044
Unknown     2472
Name: Driver At Fault, dtype: int64

***Lowercase all categorical features***

In [None]:
# Lowercase every object-dtype column in one pass.
cat_columns = df.select_dtypes(include=['object']).columns

df[cat_columns] = df[cat_columns].apply(lambda series: series.str.lower())

In [None]:
# Object-dtype (string) columns of the frame.
categorical_columns = df.select_dtypes(include='object').columns

# Lowercase each column, then print its level frequencies. Columns are
# independent, so handling them one at a time yields the same frame and output.
for col in categorical_columns:
    df[col] = df[col].str.lower()
    print(f"Value counts for column '{col}':")
    print(df[col].value_counts())
    print("\n")

Value counts for column 'ACRS Report Type':
property damage crash    64276
injury crash             42355
fatal crash                307
Name: ACRS Report Type, dtype: int64


Value counts for column 'Route Type':
state roads              59874
county roads             38700
municipality roads        6446
other public roadways     1536
government roads           371
unknown                      8
private roads                3
Name: Route Type, dtype: int64


Value counts for column 'Road Name':
georgia ave                                 7226
new hampshire ave                           4035
frederick rd                                3961
rockville pike                              3339
connecticut ave                             2988
                                            ... 
pepperwood la                                  1
milestone manor la                             1
fontaine st                                    1
ramp 3 fr shady grove rd eb to is 270 nb       1
ramp 1 fr

In [None]:
# Preview the first rows after the categorical clean-up.
df.head()

Unnamed: 0,ACRS Report Type,Route Type,Road Name,Cross-Street Name,Weather,Surface Condition,Light,Driver Substance Abuse,Speed Limit,Vehicle Year,...,Driver At Fault,Vehicle Body Type,Vehicle Make,Vehicle Movement,Day of Week,Weekend,Time,Month,Year,Date in Month
0,property damage crash,state roads,georgia ave,arcola ave,clear,dry,dawn,none detected,-0.015398,2018,...,no,trucks,freightliner,slowing or stopping,1,0,7,0,2020,21
1,injury crash,state roads,woodfield rd,warfield rd,cloudy,dry,daylight,none detected,-0.672483,2017,...,yes,passenger cars,chev,making left turn,4,0,13,11,2019,6
2,property damage crash,state roads,norbeck rd,e gude dr,rain,wet,daylight,none detected,1.955855,2000,...,no,passenger cars,toyota,making left turn,2,0,15,2,2020,25
3,injury crash,county roads,shady grove rd,crabbs branch way,clear,dry,dark - lighted,none detected,0.641686,2015,...,no,trucks,intl,constant speed,0,0,19,0,2017,9
4,injury crash,state roads,georgia ave,ent to shopping center,rain,wet,daylight,none detected,-0.015398,2017,...,no,suvs,chevrolet,constant speed,3,0,11,3,2020,9


---

---

## Step II: Build Graph

#### Part I: Check for 'Road Name' and 'Cross-Street Name' combination

In [None]:
# Display the cleaned frame before building the graph.
df

Unnamed: 0,ACRS Report Type,Route Type,Road Name,Cross-Street Name,Weather,Surface Condition,Light,Driver Substance Abuse,Speed Limit,Vehicle Year,...,Driver At Fault,Vehicle Body Type,Vehicle Make,Vehicle Movement,Day of Week,Weekend,Time,Month,Year,Date in Month
0,property damage crash,state roads,georgia ave,arcola ave,clear,dry,dawn,none detected,-0.015398,2018,...,no,trucks,freightliner,slowing or stopping,1,0,7,0,2020,21
1,injury crash,state roads,woodfield rd,warfield rd,cloudy,dry,daylight,none detected,-0.672483,2017,...,yes,passenger cars,chev,making left turn,4,0,13,11,2019,6
2,property damage crash,state roads,norbeck rd,e gude dr,rain,wet,daylight,none detected,1.955855,2000,...,no,passenger cars,toyota,making left turn,2,0,15,2,2020,25
3,injury crash,county roads,shady grove rd,crabbs branch way,clear,dry,dark - lighted,none detected,0.641686,2015,...,no,trucks,intl,constant speed,0,0,19,0,2017,9
4,injury crash,state roads,georgia ave,ent to shopping center,rain,wet,daylight,none detected,-0.015398,2017,...,no,suvs,chevrolet,constant speed,3,0,11,3,2020,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106933,property damage crash,county roads,quail valley blvd,bluebird terr,clear,dry,dark - not lighted,alcohol present,-1.329567,2008,...,yes,passenger cars,honda,constant speed,3,0,22,7,2018,9
106934,property damage crash,county roads,randolph rd,kemp mill rd,rain,wet,dark - lighted,medication present,0.641686,2009,...,no,passenger cars,honda,slowing or stopping,3,0,19,8,2021,16
106935,injury crash,state roads,old georgetown rd,tuckerman la,rain,wet,daylight,none detected,0.641686,2018,...,no,emergency vehicles,ford,other/unknown,5,1,13,11,2018,1
106936,injury crash,state roads,layhill rd,queensguard rd,clear,dry,dark - lighted,none detected,0.641686,2003,...,no,passenger cars,toyota,constant speed,3,0,18,9,2015,29


In [None]:
# Split by intersection rather than by row: every unique
# ('Road Name', 'Cross-Street Name') combination is assigned to exactly one of
# train/test, so the two sets never share a combination and every test
# combination is unseen during training.
df['road_combo'] = df['Road Name'] + '_' + df['Cross-Street Name']
unique_combos = df['road_combo'].unique()

train_combos, test_combos = train_test_split(unique_combos, test_size=0.3, random_state=42)

# Explicit copies: later column assignments (scaling, cluster ids) must modify
# only train_df/test_df and never be tied to a slice of df.
train_df = df[df['road_combo'].isin(train_combos)].drop(columns=['road_combo']).copy()
test_df = df[df['road_combo'].isin(test_combos)].drop(columns=['road_combo']).copy()

# The split must partition df: no row lost and none duplicated.
assert len(train_df) + len(test_df) == len(df), "train/test split does not partition df"

In [None]:
# Class balance of the target in the training split.
train_df['ACRS Report Type'].value_counts()

property damage crash    44878
injury crash             29328
fatal crash                211
Name: ACRS Report Type, dtype: int64

In [None]:
# Class balance of the target in the test split.
test_df['ACRS Report Type'].value_counts()

property damage crash    19398
injury crash             13027
fatal crash                 96
Name: ACRS Report Type, dtype: int64

In [None]:
# Report the number of rows in each split.
print(f'Length of train df:{len(train_df)}')
print(f'Length of test df:{len(test_df)}')

Length of train df:74417
Length of test df:32521


#### Part II: Standardize numerical features (StandardScaler)

In [None]:
# Standardise the numeric columns. The scaler is fitted on the training split
# only and then applied unchanged to both splits.
numerical_cols = [
    'Speed Limit', 'Latitude', 'Longitude', 'Vehicle Year', 'Time',
    'Day of Week', 'Date in Month', 'Month', 'Year',
]
scaler = StandardScaler().fit(train_df[numerical_cols])
train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

In [None]:
# Sanity check: scaling must leave the row counts unchanged.
print(f'Length of train df:{len(train_df)}')
print(f'Length of test df:{len(test_df)}')

Length of train df:74417
Length of test df:32521


#### Part III: One-hot encoding for categorical features

In [None]:
# Categorical columns that get one-hot encoded below.
categorical_cols = [
    'Weather',
    'Surface Condition',
    'Light',
    'Driver Substance Abuse',
    'Traffic Control',
    'Collision Type',
    'Driver At Fault',
    'Vehicle Body Type',
    'Vehicle Movement',
    'Route Type',
]

In [None]:
# Learn the one-hot categories from the training split only, then apply the same
# fitted encoder to both splits.
encoder = ce.OneHotEncoder(cols=categorical_cols, use_cat_names=True)
encoder.fit(train_df[categorical_cols])

train_encoded = encoder.transform(train_df[categorical_cols])
test_encoded = encoder.transform(test_df[categorical_cols])

***Drop original cat cols***

In [None]:
# Remove the raw categorical columns and renumber the rows 0..n-1.
train_df = train_df.drop(categorical_cols, axis=1).reset_index(drop=True)
test_df = test_df.drop(categorical_cols, axis=1).reset_index(drop=True)

***Drop the 'Vehicle Make' column***

In [None]:
# 'Vehicle Make' is not used as a feature; drop it from both splits.
train_df = train_df.drop('Vehicle Make', axis=1).reset_index(drop=True)
test_df = test_df.drop('Vehicle Make', axis=1).reset_index(drop=True)

In [None]:
# Append the one-hot columns side by side; both operands are re-indexed 0..n-1
# so rows are matched by position.
train_df = pd.concat((train_df, train_encoded.reset_index(drop=True)), axis='columns')
test_df = pd.concat((test_df, test_encoded.reset_index(drop=True)), axis='columns')

In [None]:
# Every training column must exist in test; fail with the offending names
# instead of an opaque KeyError during the reordering below.
missing_cols = set(train_df.columns) - set(test_df.columns)
assert not missing_cols, f"Columns missing from test_df: {sorted(missing_cols)}"

# Reorder test columns to the training order so feature positions match.
test_df = test_df[train_df.columns]

# Compare as ordered lists: a set comparison could not detect an ordering problem.
assert list(train_df.columns) == list(test_df.columns)

In [None]:
# Sanity check: encoding and concatenation must leave the row counts unchanged.
print(f'Length of train df:{len(train_df)}')
print(f'Length of test df:{len(test_df)}')

Length of train df:74417
Length of test df:32521


#### Part IV: Clustering feature engineering

In [None]:
# Coordinate matrices (one row per crash) used for the geographic clustering.
train_coords = train_df[['Latitude', 'Longitude']].to_numpy()
test_coords = test_df[['Latitude', 'Longitude']].to_numpy()

In [None]:
# DBSCAN has no predict(): calling fit_predict on the test coordinates would
# re-cluster the test set on its own, so its cluster ids would be unrelated to
# the training ids. Cluster the training points once, then give each test point
# the cluster of its nearest training core sample when that sample lies within
# eps; otherwise mark the test point as noise (-1).
from sklearn.neighbors import NearestNeighbors  # local import: only this cell needs it

dbscan = DBSCAN(eps=0.5, min_samples=5)
train_df['Geo Cluster'] = dbscan.fit_predict(train_coords)

if len(dbscan.core_sample_indices_) > 0:
    # Cluster id of every core sample, in the same order as dbscan.components_.
    core_labels = dbscan.labels_[dbscan.core_sample_indices_]
    nearest_core = NearestNeighbors(n_neighbors=1).fit(dbscan.components_)
    core_dist, core_idx = nearest_core.kneighbors(test_coords)
    test_df['Geo Cluster'] = np.where(
        core_dist[:, 0] <= dbscan.eps, core_labels[core_idx[:, 0]], -1
    )
else:
    # No dense region exists in train, so every test point is noise as well.
    test_df['Geo Cluster'] = -1

In [None]:
# Encode the target: fit on the training labels, reuse the same mapping for test.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['ACRS Report Type'])
train_labels = torch.tensor(label_encoder.transform(train_df['ACRS Report Type']), dtype=torch.long)
test_labels = torch.tensor(label_encoder.transform(test_df['ACRS Report Type']), dtype=torch.long)

# Integer code -> original class name.
acrs_report_type_mapping = dict(enumerate(label_encoder.classes_))
print("ACRS Report Type Mapping:")
print(acrs_report_type_mapping)

ACRS Report Type Mapping:
{0: 'fatal crash', 1: 'injury crash', 2: 'property damage crash'}


#### Part V: Build the graph: one node per crash row, with edges between rows that share a road name or a cross-street name

In [None]:
def build_graph_from_dataframe(df):
    """Build a crash graph from an encoded dataframe and fit its label encoder.

    Every row of ``df`` becomes one node. Two nodes are connected when their
    rows share the same 'Road Name' or the same 'Cross-Street Name'. A pair
    that shares both is connected twice (once per column).

    Args:
        df: DataFrame containing 'Road Name', 'Cross-Street Name' and
            'ACRS Report Type'. All remaining columns are used as node
            features and must be convertible to float.

    Returns:
        tuple: ``(graph_data, label_encoder)`` where ``graph_data`` is a
        ``Data`` object with ``x``, ``edge_index`` and ``y``, and
        ``label_encoder`` is the ``LabelEncoder`` fitted on this frame's
        'ACRS Report Type' values.
    """
    node_features = torch.tensor(
        df.drop(columns=['Road Name', 'Cross-Street Name', 'ACRS Report Type']).values,
        dtype=torch.float,
    )
    label_encoder = LabelEncoder()
    labels = torch.tensor(label_encoder.fit_transform(df['ACRS Report Type']), dtype=torch.long)

    # One (2, k) block of i<j pairs per group: road groups first, then cross streets.
    edge_blocks = []
    for column in ('Road Name', 'Cross-Street Name'):
        groups = {}
        # Node ids are row positions, not index labels.
        for idx, key in enumerate(df[column]):
            groups.setdefault(key, []).append(idx)

        for node_indices in groups.values():
            size = len(node_indices)
            if size < 2:
                continue
            members = torch.tensor(node_indices, dtype=torch.long)
            # triu_indices enumerates every i<j pair without a Python double loop.
            edge_blocks.append(members[torch.triu_indices(size, size, offset=1)])

    if edge_blocks:
        forward_edges = torch.cat(edge_blocks, dim=1)
        # Message passing follows edge direction, so an undirected relation
        # needs both (i, j) and (j, i).
        edge_index = torch.cat([forward_edges, forward_edges.flip(0)], dim=1).contiguous()
    else:
        # Keep the (2, num_edges) layout even when no two rows share a road.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    graph_data = Data(x=node_features, edge_index=edge_index, y=labels)

    return graph_data, label_encoder

#### Part VI: Build the graph data for test

In [None]:
def build_graph_with_features_test(test_df, train_df, label_encoder):
    """Build the test graph using the training column layout and label encoding.

    Every row of ``test_df`` becomes one node. Two nodes are connected when
    their rows share the same 'Road Name' or the same 'Cross-Street Name'.
    A pair that shares both is connected twice (once per column).

    Args:
        test_df: Encoded test DataFrame. It is not modified.
        train_df: Encoded training DataFrame. Only its column names and order
            are used.
        label_encoder: Fitted encoder whose ``transform`` maps
            'ACRS Report Type' values to integer codes.

    Returns:
        ``Data`` object with ``x``, ``edge_index`` and ``y`` for the test rows.
    """
    # reindex returns a new frame: columns absent from test are created and
    # filled with 0, columns follow the training order, and the caller's
    # DataFrame is left untouched.
    test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

    node_features = torch.tensor(
        test_df.drop(columns=['Road Name', 'Cross-Street Name', 'ACRS Report Type']).values,
        dtype=torch.float,
    )
    labels = torch.tensor(label_encoder.transform(test_df['ACRS Report Type']), dtype=torch.long)

    # One (2, k) block of i<j pairs per group: road groups first, then cross streets.
    edge_blocks = []
    for column in ('Road Name', 'Cross-Street Name'):
        groups = {}
        # Node ids are row positions, not index labels.
        for idx, key in enumerate(test_df[column]):
            groups.setdefault(key, []).append(idx)

        for node_indices in groups.values():
            size = len(node_indices)
            if size < 2:
                continue
            members = torch.tensor(node_indices, dtype=torch.long)
            # triu_indices enumerates every i<j pair without a Python double loop.
            edge_blocks.append(members[torch.triu_indices(size, size, offset=1)])

    if edge_blocks:
        forward_edges = torch.cat(edge_blocks, dim=1)
        # Message passing follows edge direction, so an undirected relation
        # needs both (i, j) and (j, i).
        edge_index = torch.cat([forward_edges, forward_edges.flip(0)], dim=1).contiguous()
    else:
        # Keep the (2, num_edges) layout even when no two rows share a road.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    graph_data = Data(x=node_features, edge_index=edge_index, y=labels)

    return graph_data


In [None]:
# Build the training graph; the returned encoder is reused to encode the test labels.
graph_data_train, label_encoder_train = build_graph_from_dataframe(train_df)

In [None]:
# Build the test graph as a separate graph, aligned to the training columns and label encoding.
graph_data_test = build_graph_with_features_test(test_df, train_df, label_encoder_train)

---

## Step III: Training

In [None]:
class GNN(torch.nn.Module):
    """Stack of GCNConv layers followed by a linear classification head.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of every graph convolution layer.
        output_dim: Number of output classes.
        num_layers: Number of graph convolution layers (at least one is built).
        dropout_rate: Dropout probability applied after each convolution.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_rate):
        super().__init__()
        # The first layer maps input_dim -> hidden_dim, the rest hidden_dim -> hidden_dim.
        layer_inputs = [input_dim] + [hidden_dim] * (num_layers - 1)
        self.convs = torch.nn.ModuleList(
            [GCNConv(in_dim, hidden_dim) for in_dim in layer_inputs]
        )
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        self.dropout_rate = dropout_rate

    def forward(self, x, edge_index):
        """Return per-node log-probabilities of shape (num_nodes, output_dim)."""
        hidden = x
        for conv in self.convs:
            hidden = F.relu(conv(hidden, edge_index))
            # Dropout is only active while the module is in training mode.
            hidden = F.dropout(hidden, p=self.dropout_rate, training=self.training)
        return F.log_softmax(self.fc(hidden), dim=1)

In [None]:
def objective(trial):
    """Optuna objective: train a GNN with sampled hyperparameters and score it.

    Reads the notebook-level names ``graph_data_train``, ``label_encoder``,
    ``train_mask`` and ``val_mask``.
    NOTE(review): ``train_mask`` / ``val_mask`` are not created in the cells
    next to this function - confirm they are defined before the study runs.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.

    Returns:
        Weighted F1-score on the validation nodes after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial early.
    """
    # Hyperparameters
    hidden_dim = trial.suggest_int("hidden_dim", 16, 128, step=16)
    num_layers = trial.suggest_int("num_layers", 2, 4)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3)

    # Init
    input_dim = graph_data_train.x.shape[1]
    output_dim = len(label_encoder.classes_)
    model = GNN(input_dim, hidden_dim, output_dim, num_layers, dropout_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = F.nll_loss

    def _validation_f1():
        """Weighted F1 on the validation nodes with dropout disabled."""
        model.eval()
        with torch.no_grad():
            val_out = model(graph_data_train.x, graph_data_train.edge_index)
            val_pred = val_out[val_mask].argmax(dim=1).cpu().numpy()
            val_true = graph_data_train.y[val_mask].cpu().numpy()
        return f1_score(val_true, val_pred, average="weighted")

    val_f1 = 0.0
    for epoch in range(50):  # Epochs
        model.train()
        optimizer.zero_grad()
        out = model(graph_data_train.x, graph_data_train.edge_index)
        loss = criterion(out[train_mask], graph_data_train.y[train_mask])
        loss.backward()
        optimizer.step()

        # Report the metric the study maximizes. Reporting the training loss made
        # the pruner favour trials with a HIGH loss, and a pruned trial inherited
        # its last loss as its value, mixing losses with F1-scores.
        val_f1 = _validation_f1()
        trial.report(val_f1, step=epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # The last reported value is the validation F1 of the fully trained model.
    return val_f1

In [None]:
# Seed both the TPE sampler (hyperparameter suggestions) and torch (weight
# initialisation, dropout) so that re-running the search repeats the same trials.
torch.manual_seed(42)
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

print("Best hyperparameters:", study.best_params)
print("Best F1-score:", study.best_value)

In [None]:
# `vis` is not bound by the notebook's import cell, so bind it here to keep this
# cell runnable on a fresh kernel.
import optuna.visualization as vis

fig = vis.plot_optimization_history(study)
fig.show()

fig = vis.plot_intermediate_values(study)
fig.show()

In [None]:
# Keep the 3 best trials. Only COMPLETE trials are ranked: failed trials have
# value None (which cannot be sorted) and pruned trials carry an intermediate
# value rather than a final validation score.
completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
top_trials = sorted(completed_trials, key=lambda t: t.value, reverse=True)[:3]
for i, trial in enumerate(top_trials):
    print(f"Top {i+1} Trial:")
    print(f"  Params: {trial.params}")
    print(f"  F1-Score: {trial.value}")

In [None]:
# Retrain each selected configuration on every training node, then evaluate it
# once on the separate test graph.
test_results = []

for i, trial in enumerate(top_trials):
    params = trial.params
    hidden_dim = params["hidden_dim"]
    num_layers = params["num_layers"]
    dropout_rate = params["dropout_rate"]
    learning_rate = params["learning_rate"]
    weight_decay = params["weight_decay"]

    model = GNN(graph_data_train.x.shape[1], hidden_dim, len(label_encoder.classes_), num_layers, dropout_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = F.nll_loss

    model.train()
    for epoch in range(50):
        optimizer.zero_grad()
        out = model(graph_data_train.x, graph_data_train.edge_index)
        # No mask here: the final models are fitted on all training nodes.
        loss = criterion(out, graph_data_train.y)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        test_out = model(graph_data_test.x, graph_data_test.edge_index)
        test_pred = test_out.argmax(dim=1).cpu().numpy()
        test_true = graph_data_test.y.cpu().numpy()

        test_acc = accuracy_score(test_true, test_pred)
        # zero_division=0 makes the score for a class that is never predicted
        # (or never present) an explicit 0 instead of a warning with the same value.
        test_precision = precision_score(test_true, test_pred, average="weighted", zero_division=0)
        test_recall = recall_score(test_true, test_pred, average="weighted", zero_division=0)
        test_f1 = f1_score(test_true, test_pred, average="weighted", zero_division=0)

        print(f"Model {i+1}: Test Metrics")
        print(f"  Accuracy: {test_acc:.6f}")
        print(f"  Precision: {test_precision:.6f}")
        print(f"  Recall: {test_recall:.6f}")
        print(f"  F1-Score: {test_f1:.6f}")

        test_results.append({
            "model": i + 1,
            "accuracy": test_acc,
            "precision": test_precision,
            "recall": test_recall,
            "f1_score": test_f1,
            "predictions": test_pred
        })

---