# Creating the mean collision severity array 👍  

## "la_clean.csv" database preprocessing

In [177]:
import pandas as pd
import numpy as np

# Load the cleaned L.A. collision records (adjust the path to wherever the CSV file is saved)
df = pd.read_csv(filepath_or_buffer='la_clean.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950421 entries, 0 to 950420
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   case_id             950421 non-null  int64  
 1   primary_road        950421 non-null  object 
 2   weather_1           950421 non-null  object 
 3   collision_severity  950421 non-null  int64  
 4   latitude            950421 non-null  float64
 5   longitude           950421 non-null  float64
 6   hour                950421 non-null  int64  
 7   week_of_the_year    950421 non-null  int64  
 8   day_of_the_week     950421 non-null  int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 65.3+ MB


In [178]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,case_id,primary_road,weather_1,collision_severity,latitude,longitude,hour,week_of_the_year,day_of_the_week
0,3516974,RT 101,clear,1,34.17357,-118.54336,10,2,4
1,3522174,RT 405,clear,1,34.16803,-118.46901,6,3,3
2,3523649,RT 605,clear,1,33.77837,-118.0905,12,2,4
3,3524803,RT 101,cloudy,3,34.16938,-118.49952,6,1,3
4,3524807,RT 101,cloudy,2,34.16968,-118.49999,6,1,3


In [179]:
# week_of_the_year is dropped for now (filtering on it would leave too little data per query);
# case_id is not needed either
df = df.drop(columns=['week_of_the_year', 'case_id'])
df.head(n=5)

Unnamed: 0,primary_road,weather_1,collision_severity,latitude,longitude,hour,day_of_the_week
0,RT 101,clear,1,34.17357,-118.54336,10,4
1,RT 405,clear,1,34.16803,-118.46901,6,3
2,RT 605,clear,1,33.77837,-118.0905,12,4
3,RT 101,cloudy,3,34.16938,-118.49952,6,3
4,RT 101,cloudy,2,34.16968,-118.49999,6,3


In [180]:
# Number of records per collision severity level
df['collision_severity'].value_counts()

1    634393
2    208638
3     86422
4     16289
5      4677
0         2
Name: collision_severity, dtype: int64

In [181]:
# Number of records per weather label
df['weather_1'].value_counts()

clear      810429
cloudy     114406
raining     22494
unknown      1289
fog          1193
other         215
wind          203
snowing       192
Name: weather_1, dtype: int64

In [182]:
# Map each weather label to a (somewhat arbitrarily chosen) integer code
weather = {
    'clear': 0,
    'cloudy': 1,
    'raining': 2,
    'fog': 3,
    'wind': 2,
    'snowing': 3,
    'other': 0,
    'unknown': 0,
}


def impute_weather(x):
    """Look up the integer code for weather label ``x``; raises KeyError if the label is unmapped."""
    return weather[x]
# Replace the textual weather labels with their integer codes
df['weather_1'] = df['weather_1'].apply(impute_weather)
df.head(n=5)

Unnamed: 0,primary_road,weather_1,collision_severity,latitude,longitude,hour,day_of_the_week
0,RT 101,0,1,34.17357,-118.54336,10,4
1,RT 405,0,1,34.16803,-118.46901,6,3
2,RT 605,0,1,33.77837,-118.0905,12,4
3,RT 101,1,3,34.16938,-118.49952,6,3
4,RT 101,1,2,34.16968,-118.49999,6,3


In [183]:
# Report how many distinct road names appear among all observations
print(f'Out of {df.shape[0]} observations in L.A., there are {df.primary_road.nunique()} unique roads.')

Out of 950421 observations in L.A., there are 66108 unique roads.


####  preprocessing the road names

In [184]:
def name_proc(x):
    """Lower-case a road name and expand common street-type abbreviations.

    The name is split on single spaces, and every token after the first that is
    exactly one of the known abbreviations ('blvd', 'bl', 'rd', 'str', 'av',
    'rt') is replaced by its long form. Matching whole tokens avoids corrupting
    names that merely start with an abbreviation (e.g. 'avenida', 'street',
    'block'). Every abbreviation in the name is expanded, not just the first
    kind encountered, and no extra whitespace is inserted.

    The first token is deliberately left untouched: the previous substring rules
    all required a leading space, so names such as 'rt 405' were never rewritten.

    Parameters
    ----------
    x : any
        Raw road name; it is converted with ``str`` before processing.

    Returns
    -------
    str
        The lower-cased road name with abbreviations expanded.
    """
    expansions = {
        'blvd': 'boulevard',
        'bl': 'boulevard',
        'rd': 'road',
        'str': 'street',
        'av': 'avenue',
        'rt': 'route',
    }
    # Splitting on a literal ' ' (not arbitrary whitespace) keeps empty tokens,
    # so re-joining reproduces the original spacing exactly.
    head, *rest = str(x).lower().split(' ')
    return ' '.join([head] + [expansions.get(token, token) for token in rest])

# Normalise every road name in place
df['primary_road'] = df['primary_road'].apply(name_proc)


## Features (dimensions of the array) : weekday, dayhour, weather, road

#### starting with an example 🌝 

In [186]:
# Say a user of our Streamlit front-end wants to leave from 'Beverly boulevard' at 4 P.M. on a Thursday.
# Let's find this road, check whether we have records for that day and hour,
# then assess the average collision severity there.
def beverly(x):
    """Return ``x`` unchanged if it contains 'beverly' (case-insensitive), otherwise ''."""
    return x if 'beverly' in x.lower() else ''
# Work on a copy: blank out every road that does not mention 'beverly', then keep
# only the remaining rows recorded at hour 16 with day_of_the_week == 4.
# NOTE(review): this cell's intro calls day_of_the_week == 4 a Thursday, while a later
# cell uses 6 for Sunday (which would make 4 a Friday) — confirm the weekday encoding.
a = df.copy()
a['primary_road'] = a['primary_road'].apply(beverly)
a = a.loc[(a['primary_road'] != '') & (a['hour'] == 16) & (a['day_of_the_week'] == 4)]
a


Unnamed: 0,primary_road,weather_1,collision_severity,latitude,longitude,hour,day_of_the_week
14038,beverly boulevard,0,1,34.07612,-118.28675,16,4
36600,beverly boulevard,0,2,34.02124,-118.12599,16,4
325928,beverly boulevard,0,1,34.07457,-118.2869,16,4
454737,west beverly boulevard,0,1,34.01468,-118.1118,16,4
459176,west beverly boulevard,0,2,34.01705,-118.10499,16,4
470433,west beverly boulevard,0,2,34.02406,-118.13924,16,4
489493,beverly boulevard,0,2,34.0772,-118.38889,16,4
843335,beverly boulevard (5400 block),0,1,34.02896,-118.14845,16,4


In [187]:
# Refine the search above: mean collision severity for each matching road name
a.groupby('primary_road')[['collision_severity']].mean()

Unnamed: 0_level_0,collision_severity
primary_road,Unnamed: 1_level_1
beverly boulevard,1.5
beverly boulevard (5400 block),1.0
west beverly boulevard,1.666667


In [188]:
# Narrative note on the next step
print(
    'We would then select the right road and provide the mean severity. '
    'Let us create the full array now'
)

We would then select the right road and provide the mean severity. Let us create the full array now


In [189]:
# All records with day_of_the_week == 4 and hour == 2, across every road
df.loc[(df['day_of_the_week'] == 4) & (df['hour'] == 2)]

Unnamed: 0,primary_road,weather_1,collision_severity,latitude,longitude,hour,day_of_the_week
809,rt 405,0,1,34.23398,-118.47290,2,4
1181,rt 101,0,1,34.15607,-118.42605,2,4
1196,slauson avenue,0,1,33.96481,-118.06941,2,4
1326,rt 405,0,1,34.23530,-118.47286,2,4
1353,rt 91,0,2,33.95167,-117.38483,2,4
...,...,...,...,...,...,...,...
948955,beach boulevard,0,1,33.50580,-117.59500,2,4
949024,de forest cir,0,1,34.05055,-117.52177,2,4
949751,margarita road,0,2,33.53093,-117.15112,2,4
950044,tamarisk ln,0,3,34.02700,-117.65725,2,4


### (TBD for the enhanced version of the array: find most relevant features then add them as dimensions)

## Imputing missing values

### simple imputer: replace missing values with average collision severity on the same road, regardless of features (ie even if weather, hour etc. are different)

#### say I start my journey on margarita road on a sunday at 9am:

In [193]:
# Mean severity on 'margarita road' for day_of_the_week == 6 at hour 9
sevdf = (
    df.loc[
        lambda d: (d['day_of_the_week'] == 6)
        & (d['hour'] == 9)
        & (d['primary_road'] == 'margarita road')
    ]
    .groupby('primary_road')
    .agg({'collision_severity': 'mean'})
)
sevdf

Unnamed: 0_level_0,collision_severity
primary_road,Unnamed: 1_level_1


#### There is no data to retrieve because no accident matching these criteria was reported. Still, assuming zero collision severity in the event of a crash is clearly wrong, so we need to impute a value here. As a baseline, we will impute from the data available for the same road at different hours and days. 

In [194]:
# Correlation between collision severity and hour of the day
corr_severity_hour = df['collision_severity'].corr(df['hour'])
corr_severity_hour

-0.013458493203326451

In [195]:
# Correlation between collision severity and day of the week
corr_severity_day = df['collision_severity'].corr(df['day_of_the_week'])
corr_severity_day

0.04555548303346449

In [196]:
# the linear correlation coefficients above are very close to zero (though not exactly zero), so hour and weekday carry little linear signal for severity

In [199]:
# let us see if by any chance we have a record at the exact time or weekday on the same road

In [197]:
# Same road and same hour (9), any weekday
(
    df.loc[lambda d: (d['hour'] == 9) & (d['primary_road'] == 'margarita road')]
    .groupby('primary_road')
    .agg({'collision_severity': 'mean'})
)

Unnamed: 0_level_0,collision_severity
primary_road,Unnamed: 1_level_1


In [198]:
# Same road and same weekday code (6), any hour
(
    df.loc[lambda d: (d['day_of_the_week'] == 6) & (d['primary_road'] == 'margarita road')]
    .groupby('primary_road')
    .agg({'collision_severity': 'mean'})
)


Unnamed: 0_level_0,collision_severity
primary_road,Unnamed: 1_level_1


In [200]:
# we don't, so let's impute in a straightforward manner

In [201]:
# Fallback: mean severity over every record on the same road, whatever the hour or weekday
(
    df.loc[lambda d: d['primary_road'] == 'margarita road']
    .groupby('primary_road')
    .agg({'collision_severity': 'mean'})
)


Unnamed: 0_level_0,collision_severity
primary_road,Unnamed: 1_level_1
margarita road,1


In [202]:
# we will send 1 as proxy for collision severity here

### (TBD for the enhanced version of the array: KNN imputer)