In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Loading data
accidents_df =  pd.read_csv(Path("../Resources/CRSS_data/accident/accident_2021.csv"))
person_df = pd.read_csv(Path("../Resources/CRSS_data/person/person_2021.csv"))
vehicle_df = pd.read_csv(Path("../Resources/CRSS_data/vehicle/vehicle_2021.csv"))

  vehicle_df = pd.read_csv(Path("../Resources/vehicle/vehicle_2021.csv"))


In [3]:
# Locating duplicate columns
a_columns = accidents_df.columns
p_columns = person_df.columns

dup_columns = []

for a_row in a_columns:
    for p_row in p_columns:
        if a_row == p_row:
            dup_columns.append(a_row)

# Removing CASENUM from list
dup_columns.remove(dup_columns[0])

In [4]:
# Dropping duplicate columns from person_df
person_df = person_df.drop(columns=dup_columns)

# Creating a list of columns where the content is dtypes objects
person_cat = person_df.dtypes[person_df.dtypes == "object"].index.tolist()

# Expand above list with additional irrelevant columns
person_cat.extend(["AGE_IM", "SEX_IM", "INJSEV_IM", "SEAT_IM", "REST_MIS", "HELM_MIS", "EJECT_IM", "PERALCH_IM", "HOSPITAL", "INJ_SEV", "MAK_MOD"])

# Drop unnecessary columns from dataframe
person_df = person_df.drop(columns=person_cat)

# Keep only rows where "PER_NO" is equal to one
person_df = person_df[(person_df["PER_NO"] == 1)]

# Preview dataframe
person_df.head()

Unnamed: 0,CASENUM,VEH_NO,PER_NO,MOD_YEAR,VPICMAKE,VPICMODEL,VPICBODYCLASS,MAKE,BODY_TYP,ICFINALBODY,...,ATST_TYP,ALC_RES,DRUGS,STR_VEH,LOCATION,SPEC_USE,EMER_USE,ROLLOVER,IMPACT1,FIRE_EXP
0,202102916823,1,1,2010.0,478.0,2302.0,13.0,35.0,4.0,0.0,...,0,996,0,0,0,0.0,0.0,0.0,12.0,0.0
1,202102918622,1,1,2006.0,483.0,1944.0,7.0,2.0,14.0,0.0,...,0,996,0,0,0,0.0,0.0,0.0,6.0,0.0
2,202102918654,1,1,2008.0,478.0,1907.0,13.0,35.0,4.0,0.0,...,0,996,0,0,0,0.0,0.0,0.0,12.0,0.0
3,202102918674,1,1,1999.0,448.0,2216.0,7.0,49.0,14.0,0.0,...,0,996,8,0,0,0.0,0.0,0.0,12.0,0.0
4,202102918674,2,1,2013.0,499.0,2299.0,13.0,63.0,4.0,0.0,...,95,995,8,0,0,0.0,0.0,0.0,3.0,0.0


In [5]:
# Creating a list of columns where the content is dtypes objects
accidents_cat = accidents_df.dtypes[accidents_df.dtypes == "object"].index.tolist()

# Expand above list with additional irrelevant columns
accidents_cat.extend(["PSU_VAR", "PSU", "PSUSTRAT", "VE_FORMS", "WKDY_IM", "YEARNAME", "HOUR_IM", "MINUTE_IM", "MINUTE_IMNAME", "EVENT1_IM", "MANCOL_IM", "RELJCT1_IM", "RELJCT2_IM", "LGTCON_IM", "WEATHR_IM", "MAXSEV_IM", "NO_INJ_IM", "ALCHL_IM", "NUM_INJ", "STRATUM", "WEIGHT"])

# Drop unnecessary columns from dataframe
accidents_df = accidents_df.drop(columns=accidents_cat)

# Preview dataframe
accidents_df.head()

Unnamed: 0,CASENUM,REGION,URBANICITY,PJ,PEDS,PERNOTMVIT,VE_TOTAL,PVH_INVL,PERMVIT,MONTH,...,RELJCT2,TYP_INT,REL_ROAD,WRK_ZONE,LGT_COND,WEATHER,SCH_BUS,INT_HWY,MAX_SEV,ALCOHOL
0,202102916823,3,1,1079,0,0,1,0,1,1,...,19,1,4,0,3,1,0,1,0,2
1,202102918622,4,2,4140,0,0,2,1,1,1,...,1,1,7,0,3,10,0,0,0,2
2,202102918654,3,2,4144,0,0,1,0,1,1,...,1,1,4,0,1,5,0,0,0,2
3,202102918674,3,1,4147,0,0,3,0,3,1,...,1,1,4,0,1,2,0,0,0,9
4,202102918705,1,2,4149,0,0,1,0,1,1,...,1,1,4,0,1,4,0,0,2,2


In [6]:
# Create new vehicle_df with relevant columns and preview dataframe
vehicle_df = vehicle_df[["CASENUM", "TRAV_SP", "HIT_RUN"]]
vehicle_df.head()

Unnamed: 0,CASENUM,TRAV_SP,HIT_RUN
0,202102916823,60,0
1,202102918622,5,0
2,202102918654,998,0
3,202102918674,998,1
4,202102918674,998,0


In [7]:
# Merge all three dataframes into one on "CASENUM"
data_df = accidents_df.merge(person_df, on="CASENUM", how="left")
data_df = data_df.merge(vehicle_df, on="CASENUM", how="left")

# Drop CASENUM column as it is only an identifier
data_df = data_df.drop(columns="CASENUM")

# Fill all null values with 0
data_df = data_df.fillna(0)

# Preview dataframe
data_df.head()

Unnamed: 0,REGION,URBANICITY,PJ,PEDS,PERNOTMVIT,VE_TOTAL,PVH_INVL,PERMVIT,MONTH,DAY_WEEK,...,DRUGS,STR_VEH,LOCATION,SPEC_USE,EMER_USE,ROLLOVER,IMPACT1,FIRE_EXP,TRAV_SP,HIT_RUN
0,3,1,1079,0,0,1,0,1,1,6,...,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,60,0
1,4,2,4140,0,0,2,1,1,1,6,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,5,0
2,3,2,4144,0,0,1,0,1,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,998,0
3,3,1,4147,0,0,3,0,3,1,6,...,8.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,998,1
4,3,1,4147,0,0,3,0,3,1,6,...,8.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,998,0


In [8]:
# Determining the label and features for Machine Learning
y = data_df["MAX_SEV"].values
X = data_df.drop(columns="MAX_SEV")

# Split data for testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
# Scale data
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Fit and predict the data to the Random Forest model
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())
predictions = rf_model.predict(X_test_scaled)

In [11]:
# Check for accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8678903563216052

In [12]:
# Get the feature importance array
importances = rf_model.feature_importances_

# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:20]

[(0.06092094577892292, 'MINUTE'),
 (0.05636371707913037, 'PJ'),
 (0.05402689116499128, 'AGE'),
 (0.05363445845451859, 'HOUR'),
 (0.04722036939760792, 'AIR_BAG'),
 (0.047152090459887726, 'MONTH'),
 (0.045938267120067315, 'MOD_YEAR'),
 (0.04543993143467547, 'VPICMODEL'),
 (0.039849535219129824, 'DAY_WEEK'),
 (0.03597275814222962, 'MAKE'),
 (0.035792357197183315, 'VPICMAKE'),
 (0.034576070304317655, 'PERMVIT'),
 (0.02837701238200171, 'BODY_TYP'),
 (0.02706594880971751, 'MAN_COLL'),
 (0.02592622558647701, 'IMPACT1'),
 (0.024767989739966724, 'TRAV_SP'),
 (0.024262181199871325, 'VPICBODYCLASS'),
 (0.02229165109939745, 'VE_TOTAL'),
 (0.019808919189532617, 'RELJCT2'),
 (0.01902509286402368, 'TYP_INT')]