# Categorization (Coarse Classification)

From the previous two notebooks, we can see that the data contains numerous categorical variables, each with many different levels. Every level of a categorical variable creates a dummy variable in our model, so infrequent categories would introduce many redundant features. We would therefore like to merge levels into coarser categories, so that fewer parameters have to be estimated and a more robust model can likely be obtained. We are going to group levels that have a similar odds ratio, computed below as the number of class-0 collisions divided by the number of class-1 collisions for each level.

In [52]:
%matplotlib inline

# Filter warnings
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set the default font size for plots
plt.rcParams.update({'font.size': 15})

# Load data

In [53]:
# Read the binary-class collision data; the path is relative to the notebook.
df = pd.read_csv(filepath_or_buffer='../data/binary_class_data.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,x,y,environment,light,surface_condition,traffic_control,traffic_control_condition,collision_classification,impact_type,no_of_pedestrians,collision_year,collision_month,collision_day,collision_hour
0,357144.875,5020503.5,01 - Clear,07 - Dark,01 - Dry,10 - No control,,0,02 - Angle,0.0,2013.0,January,Friday,5
1,356860.1875,5013034.5,05 - Drifting Snow,01 - Daylight,06 - Ice,10 - No control,,0,01 - Approaching,0.0,2013.0,January,Sunday,5
2,368589.71875,5029516.5,03 - Snow,01 - Daylight,06 - Ice,02 - Stop sign,01 - Functioning,0,02 - Angle,0.0,2013.0,January,Thursday,5
3,370292.90625,5035187.0,01 - Clear,01 - Daylight,02 - Wet,02 - Stop sign,01 - Functioning,0,02 - Angle,0.0,2013.0,January,Saturday,5
4,372133.1875,5032130.5,02 - Rain,07 - Dark,02 - Wet,10 - No control,,0,06 - SMV unattended vehicle,0.0,2013.0,January,Friday,5


# 1. Environment

In [54]:
# Count rows (non-missing 'x' values) for every combination of
# collision_classification and environment level.
pvt = df.pivot_table('x', index = 'collision_classification', columns = 'environment', aggfunc = 'count')

# Per-level ratio: pivot row for class 0 divided by the row for class 1.
# Levels without any class-1 rows have a NaN count and hence a NaN ratio.
# .tolist() converts to plain Python floats so the printed list reads the
# same whichever scalar repr the installed NumPy version uses.
print("Odd ratio:", np.around(pvt.values[0] / pvt.values[1], 2).tolist())
pvt

Odd ratio: [151.0, 530.76, 966.7, 2144.4, nan, nan, nan, 145.0, 27.0]


environment,00 - Unknown,01 - Clear,02 - Rain,03 - Snow,04 - Freezing Rain,05 - Drifting Snow,06 - Strong wind,"07 - Fog, mist, smoke, dust",99 - Other
collision_classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,151.0,81206.0,9667.0,10722.0,1455.0,527.0,166.0,290.0,27.0
1,1.0,153.0,10.0,5.0,,,,2.0,1.0


In [55]:
# Mapping from the raw environment codes to the coarser groups.
coll_dict = {
    '00 - Unknown': 'Fog, mist, smoke, dust',
    '07 - Fog, mist, smoke, dust': 'Fog, mist, smoke, dust',
    '01 - Clear': 'Clear',
    '02 - Rain': 'Clear',
    '03 - Snow': 'Snow',
    '04 - Freezing Rain': 'Freezing rain',
    '05 - Drifting Snow': 'Freezing rain',
    '06 - Strong wind': 'Freezing rain',
    '99 - Other': 'Other',
}

# Assign the recoded Series back to the column. Calling
# replace(..., inplace=True) on df['environment'] is a chained in-place
# operation: with pandas Copy-on-Write it only changes a temporary Series
# and leaves df untouched.
df['environment'] = df['environment'].replace(coll_dict)

# Check the size of each resulting group.
df['environment'].value_counts()

Clear                     91036
Snow                      10727
Freezing rain              2148
Fog, mist, smoke, dust      444
Other                        28
Name: environment, dtype: int64

# 2. Light

In [56]:
# Count rows (non-missing 'x' values) for every combination of
# collision_classification and light level.
pvt = df.pivot_table('x', index = 'collision_classification', columns = 'light', aggfunc = 'count')

# Per-level ratio: pivot row for class 0 divided by the row for class 1.
# Levels without any class-1 rows have a NaN count and hence a NaN ratio.
# .tolist() converts to plain Python floats so the printed list reads the
# same whichever scalar repr the installed NumPy version uses.
print("Odd ratio:", np.around(pvt.values[0] / pvt.values[1], 2).tolist())
pvt

Odd ratio: [nan, 722.96, nan, 355.0, nan, 952.4, nan, 380.26, nan, nan]


light,00 - Unknown,01 - Daylight,"02 - Daylight, artificial",03 - Dawn,"04 - Dawn, artificial",05 - Dusk,"06 - Dusk, artificial",07 - Dark,"08 - Dark, artificial",99 - Other
collision_classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2443.0,70850.0,1.0,2485.0,6.0,4762.0,3.0,23576.0,73.0,13.0
1,,98.0,,7.0,,5.0,,62.0,,


In [57]:
# Mapping from the raw light codes to the coarser groups.
coll_dict = {
    '00 - Unknown': 'Daylight, artificial',
    '01 - Daylight': 'Daylight',
    '02 - Daylight, artificial': 'Daylight, artificial',
    '03 - Dawn': 'Dark',
    '04 - Dawn, artificial': 'Daylight, artificial',
    '05 - Dusk': 'Daylight',
    '06 - Dusk, artificial': 'Daylight, artificial',
    '07 - Dark': 'Dark',
    '08 - Dark, artificial': 'Daylight, artificial',
    '99 - Other': 'Daylight, artificial',
}

# Assign the recoded Series back to the column. Calling
# replace(..., inplace=True) on df['light'] is a chained in-place
# operation: with pandas Copy-on-Write it only changes a temporary Series
# and leaves df untouched.
df['light'] = df['light'].replace(coll_dict)

# Check the size of each resulting group.
df['light'].value_counts()

Daylight                75715
Dark                    26130
Daylight, artificial     2539
Name: light, dtype: int64

# 3. Surface condition

In [58]:
# Count rows (non-missing 'x' values) for every combination of
# collision_classification and surface_condition level.
pvt = df.pivot_table('x', index = 'collision_classification', columns = 'surface_condition', aggfunc = 'count')

# Per-level ratio: pivot row for class 0 divided by the row for class 1.
# Levels without any class-1 rows have a NaN count and hence a NaN ratio.
# .tolist() converts to plain Python floats so the printed list reads the
# same whichever scalar repr the installed NumPy version uses.
print("Odd ratio:", np.around(pvt.values[0] / pvt.values[1], 2).tolist())
pvt

Odd ratio: [57.5, 497.46, 683.41, 2294.0, nan, 1387.0, 2411.5, nan, nan, nan, nan]


surface_condition,00 - Unknown,01 - Dry,02 - Wet,03 - Loose snow,04 - Slush,05 - Packed snow,06 - Ice,07 - Mud,08 - Loose sand or gravel,09 - Spilled liquid,99 - Other
collision_classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,115.0,67654.0,18452.0,6882.0,3324.0,2774.0,4823.0,11.0,99.0,10.0,68.0
1,2.0,136.0,27.0,3.0,,2.0,2.0,,,,


In [59]:
# Mapping from the raw surface-condition codes to the coarser groups.
coll_dict = {
    '00 - Unknown': 'Unknown',
    '01 - Dry': 'Dry',
    '02 - Wet': 'Dry',
    '03 - Loose snow': 'Ice',
    '04 - Slush': 'Slush',
    '05 - Packed snow': 'Ice',
    '06 - Ice': 'Ice',
    '07 - Mud': 'Slush',
    '08 - Loose sand or gravel': 'Slush',
    '09 - Spilled liquid': 'Slush',
    '99 - Other': 'Slush',
}

# Assign the recoded Series back to the column. Calling
# replace(..., inplace=True) on df['surface_condition'] is a chained
# in-place operation: with pandas Copy-on-Write it only changes a temporary
# Series and leaves df untouched.
df['surface_condition'] = df['surface_condition'].replace(coll_dict)

# Check the size of each resulting group.
df['surface_condition'].value_counts()

Dry        86269
Ice        14486
Slush       3512
Unknown      117
Name: surface_condition, dtype: int64

# 4. Traffic control

In [60]:
# Count rows (non-missing 'x' values) for every combination of
# collision_classification and traffic_control level.
pvt = df.pivot_table('x', index = 'collision_classification', columns = 'traffic_control', aggfunc = 'count')

# Per-level ratio: pivot row for class 0 divided by the row for class 1.
# Levels without any class-1 rows have a NaN count and hence a NaN ratio.
# .tolist() converts to plain Python floats so the printed list reads the
# same whichever scalar repr the installed NumPy version uses.
print("Odd ratio:", np.around(pvt.values[0] / pvt.values[1], 2).tolist())
pvt

Odd ratio: [875.06, 524.38, nan, nan, nan, 31.0, nan, 488.49, nan, nan, nan, nan]


traffic_control,01 - Traffic signal,02 - Stop sign,03 - Yield sign,04 - Ped. crossover,07 - School bus,08 - Traffic gate,09 - Traffic controller,10 - No control,11 - Roundabout,12 - IPS,13 - MPS,99 - Other
collision_classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,41128.0,11012.0,703.0,10.0,7.0,31.0,23.0,50314.0,875.0,35.0,9.0,17.0
1,47.0,21.0,,,,1.0,,103.0,,,,


In [61]:
# Mapping from the raw traffic-control codes to the coarser groups.
# Only codes that occur in this column are listed; each group is labelled
# with the name of one of its member levels.
coll_dict = {
    '01 - Traffic signal': 'Traffic signal',
    '02 - Stop sign': 'Stop sign',
    '03 - Yield sign': 'Yield sign',
    '04 - Ped. crossover': 'Yield sign',
    '07 - School bus': 'Yield sign',
    '08 - Traffic gate': 'Traffic gate',
    '09 - Traffic controller': 'Yield sign',
    '10 - No control': 'Stop sign',
    '11 - Roundabout': 'Yield sign',
    '12 - IPS': 'Yield sign',
    '13 - MPS': 'Yield sign',
    '99 - Other': 'Yield sign',
}

# Assign the recoded Series back to the column. Calling
# replace(..., inplace=True) on df['traffic_control'] is a chained in-place
# operation: with pandas Copy-on-Write it only changes a temporary Series
# and leaves df untouched.
df['traffic_control'] = df['traffic_control'].replace(coll_dict)

# Check the size of each resulting group.
df['traffic_control'].value_counts()

Stop sign         61450
Traffic signal    41175
Yield sign         1679
Traffic gate         32
Name: traffic_control, dtype: int64

# 5. Traffic control condition

In [62]:
# Count rows (non-missing 'x' values) for every combination of
# collision_classification and traffic_control_condition level.
pvt = df.pivot_table('x', index = 'collision_classification', columns = 'traffic_control_condition',
                     aggfunc = 'count')

# Per-level ratio: pivot row for class 0 divided by the row for class 1.
# Levels without any class-1 rows have a NaN count and hence a NaN ratio.
# .tolist() converts to plain Python floats so the printed list reads the
# same whichever scalar repr the installed NumPy version uses.
print("Odd ratio:", np.around(pvt.values[0] / pvt.values[1], 2).tolist())
pvt

Odd ratio: [1668.0, 731.28, nan, nan, nan]


traffic_control_condition,00 - Unknown,01 - Functioning,02 - Not functioning,03 - Obscured,04 - Missing/Damaged
collision_classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3336.0,42414.0,231.0,31.0,12.0
1,2.0,58.0,,,


In [63]:
# Mapping from the raw traffic-control-condition codes to the coarser groups.
coll_dict = {
    '00 - Unknown': 'Unknown',
    '01 - Functioning': 'Functioning',
    '02 - Not functioning': 'Not functioning',
    '03 - Obscured': 'Not functioning',
    '04 - Missing/Damaged': 'Not functioning',
}

# Assign the recoded Series back to the column. Calling
# replace(..., inplace=True) on df['traffic_control_condition'] is a chained
# in-place operation: with pandas Copy-on-Write it only changes a temporary
# Series and leaves df untouched.
# Missing values have no entry in the mapping, so they stay missing.
df['traffic_control_condition'] = df['traffic_control_condition'].replace(coll_dict)

# Check the size of each resulting group (value_counts() skips missing values
# by default).
df['traffic_control_condition'].value_counts()

Functioning        42472
Unknown             3338
Not functioning      274
Name: traffic_control_condition, dtype: int64

# 6. Impact type

In [64]:
# Count rows (non-missing 'x' values) for every combination of
# collision_classification and impact_type level.
pvt = df.pivot_table('x', index = 'collision_classification', columns = 'impact_type', aggfunc = 'count')

# Per-level ratio: pivot row for class 0 divided by the row for class 1.
# .tolist() converts to plain Python floats so the printed list reads the
# same whichever scalar repr the installed NumPy version uses.
print("Odd ratio:", np.around(pvt.values[0] / pvt.values[1], 2).tolist())
pvt

Odd ratio: [85.39, 599.62, 4407.25, 2339.67, 482.43, 4223.0, 206.91, 275.44]


impact_type,01 - Approaching,02 - Angle,03 - Rear end,04 - Sideswipe,05 - Turning movement,06 - SMV unattended vehicle,07 - SMV other,99 - Other
collision_classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1537,14391,35258,14038,11096,8446,16967,2479
1,18,24,8,6,23,2,82,9


In [65]:
# Mapping from the raw impact-type codes to the coarser groups.
coll_dict = {
    '01 - Approaching': 'Approaching',
    '02 - Angle': 'Angle',
    '03 - Rear end': 'Rear end',
    '04 - Sideswipe': 'Sideswipe',
    '05 - Turning movement': 'Angle',
    '06 - SMV unattended vehicle': 'Rear end',
    '07 - SMV other': 'Other',
    '99 - Other': 'Other',
}

# Assign the recoded Series back to the column. Calling
# replace(..., inplace=True) on df['impact_type'] is a chained in-place
# operation: with pandas Copy-on-Write it only changes a temporary Series
# and leaves df untouched.
df['impact_type'] = df['impact_type'].replace(coll_dict)

# Check the size of each resulting group.
df['impact_type'].value_counts()

Rear end       43714
Angle          25534
Other          19537
Sideswipe      14044
Approaching     1555
Name: impact_type, dtype: int64

# Save data as csv

In [66]:
# Persist the recoded DataFrame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='../data/ml_data.csv', index=False)