# Categorization (Coarse Classification)

From 01-data_cleaning.ipynb we can see that the data contains numerous categorical variables with many different levels. Each level of the category would create a dummy variable in our model, therefore there will be many redundant features in the model due to infrequent categories. We would like to create categories of values such that fewer parameters will have to be estimated and probably a more robust model can be obtained. We are going to categorize the values based on similar odds ratio.

In [1]:
%matplotlib inline

# Filter warnings
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set font scale and style
plt.rcParams.update({'font.size': 15})

# Load data

In [8]:
# Load data
df = pd.read_csv('../data/clean_data.csv')
df.columns = [x.lower() for x in df.columns] # column names in lower case
df.head()

Unnamed: 0,c_year,c_mnth,c_wday,c_hour,fatal,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,January,Monday,9.0,0,2.0,Right turn,At an intersection,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1992.0,1.0,F,33.0,Driver,Injury,Safety device used,Motor Vehicle Driver,2890
1,1999,January,Monday,9.0,0,2.0,Right turn,At an intersection,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1992.0,1.0,F,70.0,Driver,No Injury,Safety device used,Motor Vehicle Driver,2890
2,1999,January,Monday,20.0,0,1.0,Ran off left shoulder,Intersection with parking lot entrance,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1988.0,1.0,F,38.0,Driver,Injury,Safety device used,Motor Vehicle Driver,4332
3,1999,January,Monday,5.0,0,2.0,Hit a moving object,At an intersection,Raining,Wet,...,Other trucks and vans,1995.0,1.0,M,34.0,Driver,No Injury,Safety device used,Motor Vehicle Driver,5053
4,1999,January,Monday,5.0,0,2.0,Hit a moving object,At an intersection,Raining,Wet,...,Other trucks and vans,1995.0,2.0,M,30.0,"Front row, right outboard",No Injury,Safety device used,Motor Vehicle Passenger,5053


## 1. Collision configuration
We will keep only four categories with the largest odds and aggregate others

In [9]:
pvt = df.pivot_table('c_case', index = 'c_conf', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4),
                    np.around(pvt.values[6][0]/pvt.values[6][1],4),
                    np.around(pvt.values[7][0]/pvt.values[7][1],4),
                    np.around(pvt.values[8][0]/pvt.values[8][1],4),
                    np.around(pvt.values[9][0]/pvt.values[9][1],4),
                    np.around(pvt.values[10][0]/pvt.values[10][1],4),
                    np.around(pvt.values[11][0]/pvt.values[11][1],4),
                    np.around(pvt.values[12][0]/pvt.values[12][1],4),
                    np.around(pvt.values[13][0]/pvt.values[13][1],4),
                    np.around(pvt.values[14][0]/pvt.values[14][1],4),
                    np.around(pvt.values[15][0]/pvt.values[15][1],4),
                    np.around(pvt.values[16][0]/pvt.values[16][1],4),
                    np.around(pvt.values[17][0]/pvt.values[17][1],4)
                   ]
pvt

fatal,0,1,odd ratio
c_conf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Any other single-vehicle,333464,8621,38.6804
Any other two-vehicle - different direction,401728,2545,157.8499
Any other two-vehicle - same direction,5838,55,106.1455
Approaching side-swipe,24730,1144,21.6171
Head-on collision,127288,16910,7.5274
Hit a moving object,34365,883,38.9185
Hit a parked motor vehicle,29731,524,56.7385
Hit a stationary object,82363,2163,38.0781
Left turn across opposing traffic,290097,2952,98.2713
Ran off left shoulder,96921,3804,25.4787


In [10]:
coll_dict = {'Hit a moving object':'other', 
             'Hit a stationary object':'other', 
             'Ran off left shoulder':'other',
            'Ran off right shoulder':'other', 
             'Rollover on roadway': 'Rollover on roadway', 
             'Any other single-vehicle ':'other',
            'Right turn':'other',
             'Head-on collision': 'Head-on collision',
             'Rear-end collision': 'Rear-end collision', 
            'left turn conflict':'other',  
             'Left turn across opposing traffic':'other',
            'right turn conflict':'other', 
             'Right angle collision':'other', 
             'Hit a parked motor vehicle':'other',
            'Approaching side-swipe':'other', 
            'Any other two-vehicle - different direction': 'Any other two-vehicle - different direction', 
             'Side swipe':'other', 
            'Any other two-vehicle - same direction':'other'}

df['c_conf'].replace(coll_dict, inplace = True)

df.c_conf.value_counts()

other                                          1932149
Rear-end collision                             1328343
Any other two-vehicle - different direction     404273
Head-on collision                               144198
Rollover on roadway                               8650
Name: c_conf, dtype: int64

## 2. Roadway configuration

In [11]:
pvt = df.pivot_table('c_case', index = 'c_rcfg', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4),
                    np.around(pvt.values[6][0]/pvt.values[6][1],4),
                    np.around(pvt.values[7][0]/pvt.values[7][1],4),
                    np.around(pvt.values[8][0]/pvt.values[8][1],4),
                    np.around(pvt.values[9][0]/pvt.values[9][1],4)
                   ]
pvt

fatal,0,1,odd ratio
c_rcfg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
At an intersection,1996193,18586,107.403
"Bridge, overpass, viaduct",34319,682,50.3211
Express lane of a freeway system,249,2,124.5
Intersection with parking lot entrance,217095,1822,119.152
Non-intersection,1483962,39738,37.3437
Passing or climbing lane,318,46,6.913
Railroad level crossing,11561,277,41.7365
Ramp,7160,25,286.4
Traffic circle,1292,7,184.5714
Tunnel or underpass,4155,124,33.5081


In [12]:
roadway_dict = {'Non-intersection': 'other', 
                'At an intersection':'At an intersection', 
                'Intersection with parking lot entrance': 'Intersection with parking lot entrance',
                'Railroad level crossing': 'other',
               'Bridge, overpass, viaduct': 'other', 
                'Tunnel or underpass': 'other', 
                'Passing or climbing lane': 'other',
               'Ramp': 'Express lane of a freeway system', 
                'Traffic circle': 'Express lane of a freeway system',
                'Express lane of a freeway system': 'Express lane of a freeway system'}

df['c_rcfg'].replace(roadway_dict, inplace = True)

df.c_rcfg.value_counts()

At an intersection                        2014779
other                                     1575182
Intersection with parking lot entrance     218917
Express lane of a freeway system             8735
Name: c_rcfg, dtype: int64

## 3. Weather condition

In [13]:
pvt = df.pivot_table('c_case', index = 'c_wthr', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4),
                    np.around(pvt.values[6][0]/pvt.values[6][1],4)
                   ]
pvt

fatal,0,1,odd ratio
c_wthr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clear and sunny,2688648,42254,63.6306
"Freezing rain, sleet, hail",19975,508,39.3209
"Overcast, cloudy but no precipitation",346496,6852,50.5686
Raining,416193,4998,83.2719
Snowing,222200,4175,53.2216
Strong wind,9410,296,31.7905
Visibility limitation,53382,2226,23.9811


In [14]:
wthr_dict = {'Clear and sunny': 'Clear and sunny',
             'Overcast, cloudy but no precipitation': 'Overcast, cloudy but no precipitation',
              'Raining': 'Raining',
            'Snowing' : 'Snowing', 
             'Freezing rain, sleet, hail' :'other',
              'Visibility limitation':'Visibility limitation',
              'Strong wind': 'other'}

df['c_wthr'].replace(wthr_dict, inplace = True)

df.c_wthr.value_counts()

Clear and sunny                          2730902
Raining                                   421191
Overcast, cloudy but no precipitation     353348
Snowing                                   226375
Visibility limitation                      55608
other                                      30189
Name: c_wthr, dtype: int64

## 4. Road surface

In [15]:
pvt = df.pivot_table('c_case', index = 'c_rsur', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4),
                    np.around(pvt.values[6][0]/pvt.values[6][1],4),
                    np.around(pvt.values[7][0]/pvt.values[7][1],4),
                    np.around(pvt.values[8][0]/pvt.values[8][1],4)
                   ]
pvt

fatal,0,1,odd ratio
c_rsur,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Dry, normal",2564900,42308,60.6245
Flooded,178,3,59.3333
Icy,204511,4137,49.4346
Muddy,2908,104,27.9615
Oil,695,3,231.6667
Sand/gravel/dirt,15694,730,21.4986
Slush,53899,1071,50.3259
Snow,167814,3221,52.1
Wet,745705,9732,76.624


In [16]:
roadsur_dict = {'Dry, normal':'Normal', 
                'Wet': 'Wet', 
                'Snow': 'Snow', 
                'Slush':'Snow', 
                'Icy': 'other', 
                'Sand/gravel/dirt':'other',
                'Muddy':'other', 
                 'Oil': 'Oil', 
                'Flooded': 'Normal'}

df['c_rsur'].replace(roadsur_dict, inplace = True)

df.c_rsur.value_counts()

Normal    2607389
Wet        755437
other      228084
Snow       226005
Oil           698
Name: c_rsur, dtype: int64

## 5. Road alignment

In [17]:
pvt = df.pivot_table('c_case', index = 'c_raln', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4)
                   ]
pvt

fatal,0,1,odd ratio
c_raln,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bottom of hill or gradient,16170,577,28.0243
Curved and level,235110,8949,26.2722
Curved with gradient,138931,5668,24.5115
Straight and level,2944003,37653,78.1877
Straight with gradient,398981,7703,51.7955
Top of hill or gradient,23109,759,30.4466


In [18]:
roadall_dist = {'Straight and level':'Straight and level',
                'Straight with gradient':'Straight with gradient', 
                'Curved and level': 'Curved',
               'Curved with gradient':'Curved', 
                'Top of hill or gradient': 'Top of hill or gradient',
                'Bottom of hill or gradient': 'Bottom of hill or gradient'}

df['c_raln'].replace(roadall_dist, inplace = True)

df.c_raln.value_counts()

Straight and level            2981656
Straight with gradient         406684
Curved                         388658
Top of hill or gradient         23868
Bottom of hill or gradient      16747
Name: c_raln, dtype: int64

## 6. Vehicle type

In [19]:
pvt = df.pivot_table('c_case', index = 'v_type', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4),
                    np.around(pvt.values[6][0]/pvt.values[6][1],4),
                    np.around(pvt.values[7][0]/pvt.values[7][1],4),
                    np.around(pvt.values[8][0]/pvt.values[8][1],4),
                    np.around(pvt.values[9][0]/pvt.values[9][1],4),
                    np.around(pvt.values[10][0]/pvt.values[10][1],4),
                    np.around(pvt.values[11][0]/pvt.values[11][1],4),
                    np.around(pvt.values[12][0]/pvt.values[12][1],4)
                   ]
pvt

fatal,0,1,odd ratio
v_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bicycle,3954,18,219.6667
Fire engine,628,15,41.8667
Light Duty Vehicle,3400890,49062,69.3182
Motorcycle and moped,61385,2527,24.2917
Other trucks and vans,109832,2444,44.9394
Panel/cargo van,45909,743,61.7887
Purpose-built motorhome,1572,102,15.4118
Road tractor,39039,3051,12.7955
School bus,14284,518,27.5753
Smaller school bus,739,7,105.5714


In [20]:
vehtype_dict ={'Light Duty Vehicle': 'Light Duty Vehicle', 
               'Other trucks and vans': 'Other trucks and vans', 
               'Urban and Intercity Bus': 'Urban and Intercity Bus',
              'Construction equipment': 'Construction equipment',
               'Bicycle': 'Bicycle', 
               'Unit trucks': 'other', 
               'Road tractor': 'Road tractor',
              'School bus': 'other', 
               'Snowmobile': 'Snowmobile',
               'Motorcycle and moped': 'other',
               'Street car': 'Street car',
              'Panel/cargo van': 'Light Duty Vehicle', 
               'Off road vehicles': 'Off road vehicles', 
               'Farm equipment': 'Farm equipment', 
               'Purpose-built motorhome': 'Road tractor',
              'Smaller school bus': 'Street car',
               'Fire engine':'Other trucks and vans'}

df['v_type'].replace(vehtype_dict, inplace = True)

df.v_type.value_counts()

Light Duty Vehicle         3496604
other                       125031
Other trucks and vans       112919
Road tractor                 43764
Urban and Intercity Bus      32819
Bicycle                       3972
Street car                    2504
Name: v_type, dtype: int64

## 7. Traffic control

In [21]:
pvt = df.pivot_table('c_case', index = 'c_traf', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4),
                    np.around(pvt.values[6][0]/pvt.values[6][1],4),
                    np.around(pvt.values[7][0]/pvt.values[7][1],4),
                    np.around(pvt.values[8][0]/pvt.values[8][1],4),
                    np.around(pvt.values[9][0]/pvt.values[9][1],4),
                    np.around(pvt.values[10][0]/pvt.values[10][1],4),
                    np.around(pvt.values[11][0]/pvt.values[11][1],4),
                    np.around(pvt.values[12][0]/pvt.values[12][1],4),
                    np.around(pvt.values[13][0]/pvt.values[13][1],4),
                    np.around(pvt.values[14][0]/pvt.values[14][1],4),
                    np.around(pvt.values[15][0]/pvt.values[15][1],4),
                    np.around(pvt.values[16][0]/pvt.values[16][1],4)
                   ]
pvt

fatal,0,1,odd ratio
c_traf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control device not specified,711.0,21.0,33.8571
Markings on the road,2356.0,118.0,19.9661
No control present,2000882.0,47500.0,42.1238
No passing zone sign,1512.0,63.0,24.0
Pedestrian crosswalk,17167.0,93.0,184.5914
Police officer,1504.0,11.0,136.7273
"Railway crossing with signals, or signals and gates",2159.0,106.0,20.3679
Railway crossing with signs only,488.0,54.0,9.037
Reduced speed zone,1247.0,123.0,10.1382
School bus stopped with school bus signal lights flashing,3627.0,16.0,226.6875


In [22]:
tracon_dict ={'Traffic signals fully operational': 'Traffic signals fully operational',
              'Traffic signals in flashing mode': 'No control present',
             'Stop sign': 'Stop sign',
              'Yield sign': 'Yield sign',
              'Warning sign':'other',
              'Pedestrian crosswalk':'Pedestrian crosswalk',
             'Police officer': 'Police officer', 
              'School guard, flagman': 'All School buses traffic control', 
              'School crossing': 'All School buses traffic control',
              'Reduced speed zone': 'Railway crossing with signs only', 
              'No passing zone sign': 'other', 
              'Markings on the road': 'other',
             'School bus stopped with school bus signal lights flashing': 'All School buses traffic control',
             'Railway crossing with signals, or signals and gates':'other',
             'Railway crossing with signs only': 'Railway crossing with signs only',
             'Control device not specifie': 'Control device not specified',
              'No control present': 'No control present'}

df['c_traf'].replace(tracon_dict, inplace = True)

df.c_traf.value_counts()

No control present                   2062473
Traffic signals fully operational    1184092
Stop sign                             474032
Yield sign                             45685
All School buses traffic control       21367
Pedestrian crosswalk                   17260
other                                   8545
Railway crossing with signs only        1912
Police officer                          1515
Control device not specified             732
Name: c_traf, dtype: int64

## 8. Person position

In [23]:
pvt = df.pivot_table('c_case', index = 'p_psn', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4),
                    np.around(pvt.values[6][0]/pvt.values[6][1],4),
                    np.around(pvt.values[7][0]/pvt.values[7][1],4),
                    np.around(pvt.values[8][0]/pvt.values[8][1],4),
                    np.around(pvt.values[9][0]/pvt.values[9][1],4),
                    np.around(pvt.values[10][0]/pvt.values[10][1],4),
                    np.around(pvt.values[11][0]/pvt.values[11][1],4)
                   ]
pvt

fatal,0,1,odd ratio
p_psn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Driver,2546515,37938,67.1231
"Front row, center",50332,1068,47.1273
"Front row, right outboard",675293,11985,56.3448
Outside passenger compartment,345,6,57.5
Position unknown,13807,917,15.0567
"Second row, center",66306,1440,46.0458
"Second row, left outboard",168619,3264,51.6602
"Second row, right outboard",207805,3532,58.8349
Sitting on someone’s lap,211,13,16.2308
"Third row, center",23001,999,23.024


In [24]:
perpos_dict = {'Driver': 'Driver', 
               'Front row, right outboard': 'Front row',
               'Pedestrian': 'Pedestrian',
               'Second row, right outboard': 'Second row',
              'Second row, right outboard': 'Outside passenger compartment', 
               'Second row, left outboard': 'Second row',
              'Second row, center': 'Second row', 
               'Front row, center': 'Front row',
               'Position unknown': 'Position unknown',
              'Third row, center': 'Third row', 
               'Third row, left outboard': 'Third row',
              'Third row, right outboard': 'Third row', 
               'Sitting on someone’s lap': 'Position unknown'}

df['p_psn'].replace(perpos_dict, inplace = True)

df.p_psn.value_counts()

Driver                           2584453
Front row                         738678
Second row                        239629
Outside passenger compartment     211688
Third row                          28217
Position unknown                   14948
Name: p_psn, dtype: int64

## 9. Safety device used

In [25]:
pvt = df.pivot_table('c_case', index = 'p_safe', columns = 'fatal', aggfunc = 'count')
pvt['odd ratio'] = [np.around(pvt.values[0][0]/pvt.values[0][1],4),
                    np.around(pvt.values[1][0]/pvt.values[1][1],4),
                    np.around(pvt.values[2][0]/pvt.values[2][1],4),
                    np.around(pvt.values[3][0]/pvt.values[3][1],4),
                    np.around(pvt.values[4][0]/pvt.values[4][1],4),
                    np.around(pvt.values[5][0]/pvt.values[5][1],4)
                   ]
pvt

fatal,0,1,odd ratio
p_safe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Helmet worn,55904,2336,23.9315
No safety device equipped,49429,1631,30.3059
No safety device used,93219,11174,8.3425
Other safety device used,23241,677,34.3294
Reflective clothing worn,21,1,21.0
Safety device used,3534490,45490,77.6982


In [26]:
safe_dict = {'Safety device used': 'Safety device used', 
             'No safety device used': 'No safety device used', 
             'No safety device equipped': 'No safety device equipped',
            'Other safety device used': 'Other safety device used',
             'Helmet worn': 'Helmet worn', 
            'Reflective clothing worn' : 'Helmet worn',
            'Both helmet and reflective clothing used': 'Helmet worn'}

df['p_safe'].replace(safe_dict, inplace = True)

df.p_safe.value_counts()

Safety device used           3579980
No safety device used         104393
Helmet worn                    58262
No safety device equipped      51060
Other safety device used       23918
Name: p_safe, dtype: int64

# Save data as CSV

In [27]:
df.to_csv('../data/ml_data.csv', index = False)