In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Base DataFrame & collision scaling

In [3]:
# Load the cleaned collision records and discard the columns that the
# estimators defined below never read.
df = (
    pd.read_csv('../raw_data/clean.csv')
    .drop(columns=[
        'primary_road',
        'weather_1',
        'latitude',
        'longitude',
        'week_of_the_year',
    ])
)
df

Unnamed: 0,case_id,collision_severity,hour,day_of_the_week,routes
0,3516974,1,10,4,Ventura Freeway
1,3522174,1,6,3,San Diego Freeway
2,3524803,3,6,3,Ventura Freeway
3,3524807,2,6,3,Ventura Freeway
4,3524811,1,14,3,Ronald Reagan Freeway
...,...,...,...,...,...
718658,8078566,2,1,3,Foothill Freeway
718659,8165686,2,16,0,Hindry Avenue
718660,90326285,5,1,6,Chapman Avenue
718661,7205180,1,4,0,Pomona Freeway


### Scaling: collision severity x → 2**x − 1 (values between 1 and 31)

In [4]:
def scaling(x):
    """Return ``2**x - 1`` so that each severity level weighs about twice the previous one."""
    return pow(2, x) - 1
    
# `scaling` is plain arithmetic, so it is applied to the whole column at once
# rather than row by row through two successive `.apply` calls.
# The explicit int64 cast keeps the column integer-typed on every platform.
# Assumes severity codes are small non-negative integers -- TODO confirm.
# NOTE(review): the column is overwritten in place, so running this cell twice
# rescales twice; re-run the loading cell before re-running this one.
df['collision_severity'] = scaling(df['collision_severity']).astype('int64')
df.head()

Unnamed: 0,case_id,collision_severity,hour,day_of_the_week,routes
0,3516974,1,10,4,Ventura Freeway
1,3522174,1,6,3,San Diego Freeway
2,3524803,7,6,3,Ventura Freeway
3,3524807,3,6,3,Ventura Freeway
4,3524811,1,14,3,Ronald Reagan Freeway


# if (road,hour,day) in [existing_combinations]:

#### i.e. when we have data for this exact (road, hour, day) tuple

In [7]:
def groupby_with_road(dataframe, list_roads, day, hour):
    """
    Estimate collision severity for (road, day, hour) tuples present in the data.

    Args:
        dataframe: DataFrame with the columns 'routes', 'day_of_the_week',
            'hour' and 'collision_severity'.
        list_roads: iterable of road names to look up in 'routes'.
        day: value matched against 'day_of_the_week'.
        hour: value matched against 'hour'.

    Returns:
        A dictionary mapping each road to the mean 'collision_severity' of the
        rows matching that road, day and hour. A road with no matching row
        maps to NaN.
    """
    # The day/hour filter is the same for every road: build it once instead of
    # re-scanning the full frame on each iteration.
    slot = dataframe[(dataframe['day_of_the_week'] == day)
                     & (dataframe['hour'] == hour)]

    return {
        road: slot.loc[slot['routes'] == road, 'collision_severity'].mean()
        for road in list_roads
    }

In [8]:
# Example: a road / day / hour combination looked up directly in the data.
groupby_with_road(df, ['Ventura Freeway'], day=3, hour=17)

{'Ventura Freeway': 2.1012658227848102}

# if road not in [roads]

#### i.e. when the road returned by the API is unknown

In [9]:
def groupby_without_road(dataframe, list_roads, day, hour):
    """
    Fallback estimate for roads that are absent from the data.

    The road name is not used for filtering: every road in `list_roads`
    receives the mean 'collision_severity' of all rows whose
    'day_of_the_week' equals `day` and whose 'hour' equals `hour`.

    Returns a dictionary with roads as keys and that shared estimate as value
    (NaN when no row matches the day/hour pair).
    """
    same_slot = (dataframe['day_of_the_week'] == day) & (dataframe['hour'] == hour)
    fallback = dataframe.loc[same_slot, 'collision_severity'].mean()

    return dict.fromkeys(list_roads, fallback)

In [10]:
# Example: the road name is ignored, only the day/hour slot drives the estimate.
groupby_without_road(df, ['Dunnet Avenue'], day=3, hour=17)

{'Dunnet Avenue': 2.278575283324339}

# if road in [roads]


#### i.e. when we know the road, but have no data for this (hour, day) tuple

In [11]:
def groupby_road(dataframe, list_roads, day, hour):
    """
    Estimate collision severity for known roads at a (day, hour) slot that has
    no direct observation.

    A linear regression is fitted on the one-hot encoded 'hour',
    'day_of_the_week' and 'routes' columns of the rows whose road is in
    `list_roads`, then evaluated once per requested road at (`hour`, `day`).

    Args:
        dataframe: DataFrame with the columns 'routes', 'day_of_the_week',
            'hour' and 'collision_severity'. Any other column is ignored.
        list_roads: iterable of road names.
        day: value for the 'day_of_the_week' feature.
        hour: value for the 'hour' feature.

    Returns:
        A dictionary with roads as keys and predicted collision severity
        (float) as values. An empty `list_roads` yields an empty dictionary.
    """
    roads = list(list_roads)
    if not roads:
        return {}

    # Name the features explicitly so the model does not depend on which other
    # columns the caller's frame happens to carry, nor on their order.
    feature_cols = ['hour', 'day_of_the_week', 'routes']
    train = dataframe[dataframe['routes'].isin(roads)]

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old spelling; try the new keyword first and fall back for older releases.
    # handle_unknown='ignore': a day or hour never observed on the selected
    # roads is encoded as all zeros instead of raising at prediction time.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    pipe = make_pipeline(encoder, LinearRegression())
    pipe.fit(train[feature_cols], train['collision_severity'])

    # One prediction row per road, with the same column names as at fit time.
    X_pred = pd.DataFrame(
        {'hour': hour, 'day_of_the_week': day, 'routes': roads}
    )[feature_cols]
    predictions = pipe.predict(X_pred)

    return dict(zip(roads, map(float, predictions)))

In [12]:
list_roads = ['Hindry Avenue', 'Dunnet Avenue', 'Chapman Avenue']

# Keyword arguments make it explicit that 0 is the day and 7 is the hour.
groupby_road(df, list_roads, day=0, hour=7)

{'Hindry Avenue': 3.1875, 'Dunnet Avenue': 2.875, 'Chapman Avenue': 2.1875}