# Data Cleaning

In [1]:
# Silence warnings for the rest of the session.
# Note: filterwarnings("ignore") hides every warning category, not only deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
%matplotlib inline
import pandas as pd
import numpy as np
import time

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Load  data
In each attribute, the following letter codes have special meanings:

1. U, UU, UUU, UUUU represent unknown values
2. X, XX, XXX, XXXX represent jurisdiction does not provide this data element
3. Q, QQ, QQQ, QQQQ represent choice is other than the preceding values
4. N, NN, NNN, NNNN represent data element is not applicable.

In [2]:
# Read the raw NCDB extract, lower-case the column names, rename the target
# column c_sev to 'class', and discard fully duplicated rows.
# NOTE(review): later cells show columns mixing ints and strings (e.g. 1 and '1');
# presumably read_csv inferred dtypes chunk by chunk — confirm before changing parser options.
raw_path = '../data/NCDB_1999_to_2017.csv'
df = pd.read_csv(raw_path)
df.columns = df.columns.str.lower()
df = df.rename(columns={'c_sev': 'class'}).drop_duplicates()

print('Data size: {}'.format(df.shape))
df.head()

Data size: (6771768, 23)


Unnamed: 0,c_year,c_mnth,c_wday,c_hour,class,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,1,1,20,2,2,34,UU,1,5,...,06,1990,1,M,41,11,1,UU,1,752
1,1999,1,1,20,2,2,34,UU,1,5,...,01,1987,1,M,19,11,1,UU,1,752
2,1999,1,1,20,2,2,34,UU,1,5,...,01,1987,2,F,20,13,2,02,2,752
3,1999,1,1,8,2,1,1,UU,5,3,...,01,1986,1,M,46,11,1,UU,1,753
4,1999,1,1,8,2,1,1,UU,5,3,...,NN,NNNN,1,M,5,99,2,UU,3,753


# Data types and missing values

Only 3 of the 23 attributes are stored as numeric types. The data appear to have no missing values, but only because missing entries are encoded with the letter codes U, X, N, and Q described above rather than as NaN.

In [3]:
# Summarize the index, column dtypes and memory usage of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6771768 entries, 0 to 6772562
Data columns (total 23 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   c_year  int64 
 1   c_mnth  object
 2   c_wday  object
 3   c_hour  object
 4   class   int64 
 5   c_vehs  object
 6   c_conf  object
 7   c_rcfg  object
 8   c_wthr  object
 9   c_rsur  object
 10  c_raln  object
 11  c_traf  object
 12  v_id    object
 13  v_type  object
 14  v_year  object
 15  p_id    object
 16  p_sex   object
 17  p_age   object
 18  p_psn   object
 19  p_isev  object
 20  p_safe  object
 21  p_user  object
 22  c_case  int64 
dtypes: int64(3), object(20)
memory usage: 1.2+ GB


In [4]:
# Build the placeholder codes (U, UU, UUU, UUUU, then Q.., N.., X..) and
# turn every occurrence in the frame into NaN.
letter_list = [letter * width for letter in 'UQNX' for width in range(1, 5)]
df = df.replace(letter_list, np.nan)
df.head()

Unnamed: 0,c_year,c_mnth,c_wday,c_hour,class,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,1,1,20,2,2,34,,1,5,...,6.0,1990.0,1,M,41,11,1,,1,752
1,1999,1,1,20,2,2,34,,1,5,...,1.0,1987.0,1,M,19,11,1,,1,752
2,1999,1,1,20,2,2,34,,1,5,...,1.0,1987.0,2,F,20,13,2,2.0,2,752
3,1999,1,1,8,2,1,1,,5,3,...,1.0,1986.0,1,M,46,11,1,,1,753
4,1999,1,1,8,2,1,1,,5,3,...,,,1,M,5,99,2,,3,753


# Percentage of missing values

Some attributes have a large share of missing values. Since the dataset is large, we will drop rows with missing values during modeling.

In [5]:
# Share of null entries per column, expressed as a percentage of all rows
n_rows = len(df)
missing_values = (df.isnull().sum() / n_rows) * 100

# Two-column table (variable name, % missing), shown from most to least missing
missing_df = missing_values.rename_axis('Variable').reset_index(name='% missing values')
missing_df.sort_values(by='% missing values', ascending=False).reset_index(drop=True)

Unnamed: 0,Variable,% missing values
0,p_safe,21.18463
1,c_rcfg,10.663892
2,v_year,9.8895
3,c_conf,8.005487
4,c_raln,7.525612
5,p_age,6.751738
6,p_isev,6.403822
7,c_traf,5.312719
8,v_type,4.892061
9,p_sex,4.481134


#  Convert variables to the correct format
Based on the data information, many categorical variables are encoded as numbers. In this section, we will convert those codes to their descriptive labels to make the data easier to understand.

# 1. Day of week

In [6]:
# Show the raw day-of-week codes before recoding
wday_codes = df['c_wday'].unique()
print("Originally encoded as: \n\n", wday_codes)

Originally encoded as: 

 [1 2 3 4 5 6 7 '7' nan '1' '2' '3' '4' '5' '6']


In [7]:
# Day-of-week labels keyed by code. The column mixes ints and strings,
# so every code is registered both as an int and as a string.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']
week_dict = {}
for code, day in enumerate(day_names, start=1):
    week_dict[code] = day
    week_dict[str(code)] = day

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_wday'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_wday'] = df['c_wday'].replace(week_dict)

In [8]:
# Frequency of each day-of-week label
df['c_wday'].value_counts()

Friday       1153278
Thursday     1009002
Saturday      992996
Wednesday     957128
Tuesday       947852
Monday        898779
Sunday        811369
Name: c_wday, dtype: int64

# 2. Month of the year
The months are encoded as the numbers 1 to 12, stored as a mix of integers and zero-padded strings such as '01'.

In [9]:
# Show the raw month codes before recoding
mnth_codes = df['c_mnth'].unique()
print("Originally encoded as: \n\n", mnth_codes)

Originally encoded as: 

 [1 2 3 4 5 6 7 8 9 10 11 12 '12' nan '01' '02' '11']


In [10]:
# Month labels keyed by code. The column mixes ints and zero-padded strings, so
# each month is registered under both forms (1 and '01', ..., 12 and '12').
# Previously only '01', '02', '11' and '12' had string keys, so any other
# zero-padded string would have been left unconverted.
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
month_dict = {}
for number, name in enumerate(month_names, start=1):
    month_dict[number] = name
    month_dict['{:02d}'.format(number)] = name

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_mnth'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_mnth'] = df['c_mnth'].replace(month_dict)

In [11]:
# Frequency of each month label
df['c_mnth'].value_counts()

August       630095
July         622005
December     607222
June         600280
October      596261
September    593249
November     578431
January      575819
May          541550
February     492663
March        481073
April        452695
Name: c_mnth, dtype: int64

# 3. Collision hour

In [12]:
# Show the raw collision-hour codes before conversion
hour_codes = df['c_hour'].unique()
print("Originally encoded as: \n\n", hour_codes)

Originally encoded as: 

 ['20' '08' '17' '15' '14' '01' '11' '13' '19' '16' '09' '02' '18' '12'
 '10' '23' '00' '06' '07' '21' nan '05' '22' '03' '04']


In [13]:
# Parse the hour strings as numbers; anything unparseable becomes NaN
df['c_hour'] = df['c_hour'].pipe(pd.to_numeric, errors='coerce')

In [14]:
# Confirm the column is now numeric
df['c_hour'].dtype

dtype('float64')

# 4. Number of vehicles involved in collision
The number of vehicles involved in a collision is stored inconsistently: some values are zero-padded strings (e.g. '01', '02') and others are integers. We will normalize these values and convert the attribute to numeric.

In [15]:
# Show the raw vehicle-count values before conversion
vehs_codes = df['c_vehs'].unique()
print("Originally encoded as: \n\n", vehs_codes)

Originally encoded as: 

 ['02' '01' '03' '04' '06' '07' '09' nan '05' '13' '08' '12' '14' '10' '11'
 '16' 1 2 3 4 5 6 8 7 12 10 '26' '71' 19 25 9 '21' '27' '15' '35' 15 13 26
 '22' '41' 27 46 31 11 '18' 18 56 '23' 36 17 14 20 21 '19' 16 '29' '25' 77
 28 38 '38' '17' '32' 35 22 33 '54' 72 '40' 44 58 '30' '77' '20' '24' '34'
 24 39 '51' 57 43 37 47]


In [16]:
# Map the zero-padded strings '01'..'09' to the integers 1..9.
padded_to_int = {'{:02d}'.format(n): n for n in range(1, 10)}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_vehs'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_vehs'] = df['c_vehs'].replace(padded_to_int)

In [17]:
# Parse the vehicle counts as numbers; anything unparseable becomes NaN
df['c_vehs'] = df['c_vehs'].pipe(pd.to_numeric, errors='coerce')

In [18]:
# Confirm the column is now numeric
df['c_vehs'].dtype

dtype('float64')

# 5. Collision configuration

In [19]:
# Show the raw collision-configuration codes before recoding
conf_codes = df['c_conf'].unique()
print("Originally encoded as: \n\n", conf_codes)

Originally encoded as: 

 ['34' '01' nan '04' '31' '21' '23' '03' '02' '33' '24' '35' '41' '06' '32'
 '36' '05' '22' '25']


In [20]:
# Collision-configuration labels keyed by their two-character code.
# NOTE(review): the label for '06' carries a trailing space; it is kept unchanged
# in case downstream code matches on the exact string — confirm before trimming.
coll_dict = {
    '01': 'Hit a moving object',
    '02': 'Hit a stationary object',
    '03': 'Ran off left shoulder',
    '04': 'Ran off right shoulder',
    '05': 'Rollover on roadway',
    '06': 'Any other single-vehicle ',
    '21': 'Rear-end collision',
    '22': 'Side swipe',
    '23': 'left turn conflict',
    '24': 'right turn conflict',
    '25': 'Any other two-vehicle - same direction',
    '31': 'Head-on collision',
    '32': 'Approaching side-swipe',
    '33': 'Left turn across opposing traffic',
    '34': 'Right turn',
    '35': 'Right angle collision',
    '36': 'Any other two-vehicle - different direction',
    '41': 'Hit a parked motor vehicle',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_conf'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_conf'] = df['c_conf'].replace(coll_dict)

In [21]:
# Frequency of each collision-configuration label
df['c_conf'].value_counts()

Rear-end collision                             2048187
Right angle collision                           994356
Any other single-vehicle                        596787
Any other two-vehicle - different direction     513153
Left turn across opposing traffic               487981
Ran off right shoulder                          245421
Side swipe                                      226414
Hit a stationary object                         221779
Head-on collision                               221672
Ran off left shoulder                           187179
Hit a moving object                             100895
Hit a parked motor vehicle                       92825
left turn conflict                               79584
Right turn                                       67118
right turn conflict                              60437
Approaching side-swipe                           56616
Rollover on roadway                              15335
Any other two-vehicle - same direction           13916
Name: c_co

# 6. Roadway configuration

In [22]:
# Show the raw roadway-configuration codes before recoding
rcfg_codes = df['c_rcfg'].unique()
print("Originally encoded as:\n\n", rcfg_codes)

Originally encoded as:

 [nan '01' '02' '03' '05' '04' '06' '08' '07' '09' '10']


In [23]:
# Roadway-configuration labels keyed by their two-character code.
roadway_dict = {
    '01': 'Non-intersection',
    '02': 'At an intersection',
    '03': 'Intersection with parking lot entrance',
    '04': 'Railroad level crossing',
    '05': 'Bridge, overpass, viaduct',
    '06': 'Tunnel or underpass',
    '07': 'Passing or climbing lane',
    '08': 'Ramp',
    '09': 'Traffic circle',
    '10': 'Express lane of a freeway system',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_rcfg'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_rcfg'] = df['c_rcfg'].replace(roadway_dict)

In [24]:
# Frequency of each roadway-configuration label
df['c_rcfg'].value_counts()

At an intersection                        3164171
Non-intersection                          2450279
Intersection with parking lot entrance     323644
Bridge, overpass, viaduct                   59406
Railroad level crossing                     26325
Ramp                                        14372
Tunnel or underpass                          7019
Traffic circle                               2661
Passing or climbing lane                     1123
Express lane of a freeway system              634
Name: c_rcfg, dtype: int64

# 7. Weather condition

In [25]:
# Show the raw weather codes before recoding
wthr_codes = df['c_wthr'].unique()
print("Originally encoded as:\n\n", wthr_codes)

Originally encoded as:

 ['1' '5' '3' '4' '7' '2' nan '6']


In [26]:
# Weather-condition labels keyed by their one-character code.
wthr_dict = {
    '1': 'Clear and sunny',
    '2': 'Overcast, cloudy but no precipitation',
    '3': 'Raining',
    '4': 'Snowing',
    '5': 'Freezing rain, sleet, hail',
    '6': 'Visibility limitation',
    '7': 'Strong wind',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_wthr'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_wthr'] = df['c_wthr'].replace(wthr_dict)

In [27]:
# Frequency of each weather label
df['c_wthr'].value_counts()

Clear and sunny                          4733176
Overcast, cloudy but no precipitation     685980
Raining                                   682240
Snowing                                   406046
Visibility limitation                      95945
Freezing rain, sleet, hail                 34849
Strong wind                                17809
Name: c_wthr, dtype: int64

# 8. Road surface

In [28]:
# Show the raw road-surface codes before recoding
rsur_codes = df['c_rsur'].unique()
print("Originally encoded as:\n\n", rsur_codes)

Originally encoded as:

 ['5' '3' '2' '4' '1' '6' nan '7' '9' '8']


In [29]:
# Road-surface labels keyed by their one-character code.
roadsur_dict = {
    '1': 'Dry, normal',
    '2': 'Wet',
    '3': 'Snow',
    '4': 'Slush',
    '5': 'Icy',
    '6': 'Sand/gravel/dirt',
    '7': 'Muddy',
    '8': 'Oil',
    '9': 'Flooded',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_rsur'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_rsur'] = df['c_rsur'].replace(roadsur_dict)

In [30]:
# Frequency of each road-surface label
df['c_rsur'].value_counts()

Dry, normal         4472490
Wet                 1230478
Icy                  371459
Snow                 296579
Slush                 84905
Sand/gravel/dirt      29044
Muddy                  6623
Oil                    1372
Flooded                 432
Name: c_rsur, dtype: int64

# 9. Road alignment

In [31]:
# Show the raw road-alignment codes before recoding
raln_codes = df['c_raln'].unique()
print("Originally encoded as:\n\n", raln_codes)

Originally encoded as:

 ['3' '6' '1' nan '2' '5' '4']


In [32]:
# Road-alignment labels keyed by their one-character code.
roadall_dist = {
    '1': 'Straight and level',
    '2': 'Straight with gradient',
    '3': 'Curved and level',
    '4': 'Curved with gradient',
    '5': 'Top of hill or gradient',
    '6': 'Bottom of hill or gradient',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_raln'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_raln'] = df['c_raln'].replace(roadall_dist)

In [33]:
# Frequency of each road-alignment label
df['c_raln'].value_counts()

Straight and level            4846725
Straight with gradient         663241
Curved and level               415574
Curved with gradient           252579
Top of hill or gradient         48383
Bottom of hill or gradient      35649
Name: c_raln, dtype: int64

# 10. Traffic control

In [34]:
# Show the raw traffic-control codes before recoding
traf_codes = df['c_traf'].unique()
print("Originally encoded as:\n\n", traf_codes)

Originally encoded as:

 ['03' '18' '01' nan '06' '10' '05' '04' '11' '07' '08' '16' '17' '02' '13'
 '15' '09' '12']


In [35]:
# Traffic-control labels keyed by their two-character code.
tracon_dict = {
    '01': 'Traffic signals fully operational',
    '02': 'Traffic signals in flashing mode',
    '03': 'Stop sign',
    '04': 'Yield sign',
    '05': 'Warning sign',
    '06': 'Pedestrian crosswalk',
    '07': 'Police officer',
    '08': 'School guard, flagman',
    '09': 'School crossing',
    '10': 'Reduced speed zone',
    '11': 'No passing zone sign',
    '12': 'Markings on the road',
    '13': 'School bus stopped with school bus signal lights flashing',
    '15': 'Railway crossing with signals, or signals and gates',
    '16': 'Railway crossing with signs only',
    '17': 'Control device not specified',
    '18': 'No control present',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['c_traf'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['c_traf'] = df['c_traf'].replace(tracon_dict)

In [36]:
# Frequency of each traffic-control label
df['c_traf'].value_counts()

No control present                                           3517741
Traffic signals fully operational                            1896040
Stop sign                                                     755000
Yield sign                                                    105980
Pedestrian crosswalk                                           52645
School guard, flagman                                          28071
Traffic signals in flashing mode                               22208
School bus stopped with school bus signal lights flashing       4929
Control device not specified                                    4572
Railway crossing with signals, or signals and gates             4451
Markings on the road                                            4389
No passing zone sign                                            3862
Reduced speed zone                                              3160
Police officer                                                  2655
Railway crossing with signs only  

# 11. Vehicle sequence number
The vehicle sequence number is stored inconsistently: zero-padded strings such as '01' and '02' represent the same values as the integers 1 and 2, and so on.

In [37]:
# Show the raw vehicle sequence numbers before conversion
v_id_codes = df['v_id'].unique()
print("Originally encoded as: \n\n", v_id_codes)

Originally encoded as: 

 ['01' '02' '99' '03' '04' nan '05' '06' '07' '08' '09' '10' '11' '12' '13'
 '14' '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27'
 '28' '29' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41'
 '42' '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '55'
 '56' '57' '58' '59' '60' '61' '62' '63' '64' '65' '66' '67' '68' '69'
 '70' '71' 2 1 99 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 '72' '73' '74' '83'
 '85' '86' '75' '76' '77' 42 43 44 45 46 47 48 49 51 52 50 53 54 55 56 57]


In [38]:
# Map the zero-padded strings '01'..'09' to the integers 1..9.
padded_to_int = {'{:02d}'.format(n): n for n in range(1, 10)}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['v_id'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['v_id'] = df['v_id'].replace(padded_to_int)

In [39]:
# Parse the vehicle sequence numbers as numbers; anything unparseable becomes NaN
df['v_id'] = df['v_id'].pipe(pd.to_numeric, errors='coerce')

# 12. Vehicle type

In [40]:
# Show the raw vehicle-type codes before recoding
v_type_codes = df['v_type'].unique()
print("Originally encoded as: \n\n", v_type_codes)

Originally encoded as: 

 ['06' '01' nan '11' '20' '17' '07' '08' '09' '22' '14' '23' '05' '16' '19'
 '18' '10' '21']


In [41]:
# Vehicle-type labels keyed by their two-character code.
vehtype_dict = {
    '01': 'Light Duty Vehicle',
    '05': 'Panel/cargo van',
    '06': 'Other trucks and vans',
    '07': 'Unit trucks',
    '08': 'Road tractor',
    '09': 'School bus',
    '10': 'Smaller school bus',
    '11': 'Urban and Intercity Bus',
    '14': 'Motorcycle and moped',
    '16': 'Off road vehicles',
    '17': 'Bicycle',
    '18': 'Purpose-built motorhome',
    '19': 'Farm equipment',
    '20': 'Construction equipment',
    '21': 'Fire engine',
    '22': 'Snowmobile',
    '23': 'Street car',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['v_type'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['v_type'] = df['v_type'].replace(vehtype_dict)

In [42]:
# Frequency of each vehicle-type label
df['v_type'].value_counts()

Light Duty Vehicle         5581069
Other trucks and vans       193087
Motorcycle and moped        144140
Bicycle                     141776
Unit trucks                 101481
Road tractor                 81219
Panel/cargo van              75875
Urban and Intercity Bus      59947
School bus                   25546
Off road vehicles            11135
Construction equipment        8807
Snowmobile                    5057
Purpose-built motorhome       3540
Street car                    3451
Farm equipment                2323
Smaller school bus            1087
Fire engine                    949
Name: v_type, dtype: int64

# 13. Vehicle model year

In [43]:
# Show the raw vehicle model years before conversion
v_year_codes = df['v_year'].unique()
print("Originally encoded as: \n\n", v_year_codes)

Originally encoded as: 

 ['1990' '1987' '1986' nan '1984' '1991' '1992' '1997' '1993' '1985' '1988'
 '1994' '1995' '1998' '1989' '1996' '1983' '1999' '1965' '1977' '1978'
 '1968' '1981' '1979' '1976' '1972' '2000' '1982' '1975' '1973' '1974'
 '1980' '1967' '1970' '1971' '1962' '1969' '1966' '1945' '1963' '1960'
 '1950' '1964' '1959' '1955' '1958' '1903' '1909' '1949' '1923' '1961'
 '1914' '1908' '1953' '1906' '1939' '1925' '1948' '1938' '1907' '1952'
 '1904' '1917' '1912' '1944' '1956' '1930' '1931' '1951' '1946' '1947'
 '1957' '1954' '1943' '1901' '1937' '1905' '1935' '1926' '1941' '1932'
 '1920' '1933' '1919' '1915' '1929' '1928' '2001' '1913' '1940' '1927'
 '2002' '1916' '1942' '1918' '2003' '1924' '1922' '1934' '2004' '2005'
 '2006' '2007' '2008' '1911' '2009' '2010' '2011' '1936' '2012' '1910'
 '1921' '2013' '2014' '2015' '2016' '2017' '2018']


In [44]:
# Parse the model years as numbers; anything unparseable becomes NaN
df['v_year'] = df['v_year'].pipe(pd.to_numeric, errors='coerce')

In [45]:
# Five most frequent vehicle model years
df.v_year.value_counts().head(5)

2000.0    341453
2002.0    323149
2003.0    317580
1998.0    306083
2001.0    300347
Name: v_year, dtype: int64

# 14. Person sequence ID

In [46]:
# Show the raw person sequence IDs before conversion
p_id_codes = df['p_id'].unique()
print("Originally encoded as: \n\n", p_id_codes)

Originally encoded as: 

 ['01' '02' '03' '04' '05' '06' nan '07' '08' '09' '10' '11' '12' '13' '14'
 '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28'
 '29' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41' '42'
 '43' '44' '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '55' '56'
 '57' '58' '59' '60' '61' '62' '63' '64' '65' '66' '67' '68' '69' '70'
 '71' '72' '73' '74' '75' '76' '77' '78' '79' '80' '81' '82' '83' '84'
 '85' '86' '87' '88' '89' '90' '91' '92' '93' '94' '95' '99']


In [47]:
# Map the zero-padded strings '01'..'09' to the integers 1..9.
padded_to_int = {'{:02d}'.format(n): n for n in range(1, 10)}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['p_id'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['p_id'] = df['p_id'].replace(padded_to_int)

In [48]:
# Parse the person sequence IDs as numbers; anything unparseable becomes NaN
df['p_id'] = df['p_id'].pipe(pd.to_numeric, errors='coerce')

In [49]:
# Confirm the column is now numeric
df['p_id'].dtype

dtype('float64')

# 15. Gender
More males than females are involved in vehicle collisions.

In [50]:
# Show the distinct gender codes
p_sex_codes = df['p_sex'].unique()
print("Originally encoded as: \n\n", p_sex_codes)

Originally encoded as: 

 ['M' 'F' nan]


# 16. Person age

In [51]:
# Show the raw person-age values before conversion
p_age_codes = df['p_age'].unique()
print("Originally encoded as: \n\n", p_age_codes)

Originally encoded as: 

 ['41' '19' '20' '46' '05' '28' '21' nan '61' '56' '34' '22' '30' '49' '32'
 '31' '68' '08' '45' '17' '33' '82' '39' '37' '55' '38' '43' '35' '23'
 '25' '65' '44' '36' '70' '50' '40' '27' '26' '15' '53' '16' '13' '14'
 '12' '18' '77' '86' '42' '24' '47' '62' '06' '57' '83' '74' '67' '51'
 '29' '01' '02' '54' '71' '10' '79' '63' '58' '48' '60' '07' '64' '75'
 '52' '85' '93' '92' '69' '72' '11' '59' '09' '66' '76' '73' '04' '78'
 '80' '84' '03' '81' '89' '87' '88' '90' '91' '95' '97' '94' '99' '98'
 '96']


In [52]:
# Map the zero-padded strings '01'..'09' to the integers 1..9.
padded_to_int = {'{:02d}'.format(n): n for n in range(1, 10)}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['p_age'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['p_age'] = df['p_age'].replace(padded_to_int)

In [53]:
# Parse the ages as numbers; anything unparseable becomes NaN
df['p_age'] = df['p_age'].pipe(pd.to_numeric, errors='coerce')

In [54]:
# Confirm the column is now numeric
df['p_age'].dtype

dtype('float64')

# 17. Person position

In [55]:
# Show the raw person-position codes before recoding
p_psn_codes = df['p_psn'].unique()
print("Originally encoded as: \n\n", p_psn_codes)

Originally encoded as: 

 ['11' '13' '99' '23' '98' '21' '22' '12' nan '96' '32' '31' '33' '97']


In [56]:
# Person-position labels keyed by their two-character code.
perpos_dict = {
    '11': 'Driver',
    '12': 'Front row, center',
    '13': 'Front row, right outboard',
    '21': 'Second row, left outboard',
    '22': 'Second row, center',
    '23': 'Second row, right outboard',
    '31': 'Third row, left outboard',
    '32': 'Third row, center',
    '33': 'Third row, right outboard',
    '96': 'Position unknown',
    '97': 'Sitting on someone’s lap',
    '98': 'Outside passenger compartment',
    '99': 'Pedestrian',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['p_psn'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['p_psn'] = df['p_psn'].replace(perpos_dict)

In [57]:
# Frequency of each person-position label
df['p_psn'].value_counts()

Driver                           4544987
Front row, right outboard        1001423
Second row, right outboard        295297
Second row, left outboard         247300
Pedestrian                        244407
Second row, center                100112
Front row, center                  97032
Position unknown                   52810
Third row, center                  33461
Outside passenger compartment       7376
Third row, right outboard           7039
Third row, left outboard            5536
Sitting on someone’s lap             298
Name: p_psn, dtype: int64

# 18. Medical treatment required

In [58]:
# Show the raw medical-treatment codes before recoding
p_isev_codes = df['p_isev'].unique()
print("Originally encoded as: \n\n", p_isev_codes)

Originally encoded as: 

 ['1' '2' '3' nan]


In [59]:
# Medical-treatment labels keyed by their one-character code.
med_dict = {
    '1': 'No Injury',
    '2': 'Injury',
    '3': 'Fatality',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['p_isev'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['p_isev'] = df['p_isev'].replace(med_dict)

In [60]:
# Frequency of each medical-treatment label
df['p_isev'].value_counts()

Injury       3547554
No Injury    2744546
Fatality       46016
Name: p_isev, dtype: int64

# 19. Safety device used

In [61]:
# Show the raw safety-device codes before recoding
p_safe_codes = df['p_safe'].unique()
print("Originally encoded as: \n\n", p_safe_codes)

Originally encoded as: 

 [nan '02' '01' '13' '12' '09' '10' '11']


In [62]:
# Safety-device labels keyed by their two-character code.
safe_dict = {
    '01': 'No safety device used',
    '02': 'Safety device used',
    '09': 'Helmet worn',
    '10': 'Reflective clothing worn',
    '11': 'Both helmet and reflective clothing used',
    '12': 'Other safety device used',
    '13': 'No safety device equipped',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['p_safe'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['p_safe'] = df['p_safe'].replace(safe_dict)

In [63]:
# Frequency of each safety-device label
df['p_safe'].value_counts()

Safety device used                          4806072
No safety device used                        221621
No safety device equipped                    159316
Helmet worn                                  120101
Other safety device used                      29803
Reflective clothing worn                        267
Both helmet and reflective clothing used         14
Name: p_safe, dtype: int64

# 20. Road user class

In [64]:
# Show the raw road-user-class codes before recoding
p_user_codes = df['p_user'].unique()
print("Originally encoded as: \n\n", p_user_codes)

Originally encoded as: 

 ['1' '2' '3' nan '4' '5']


In [65]:
# Road-user-class labels keyed by their one-character code.
roaduser_dict = {
    '1': 'Motor Vehicle Driver',
    '2': 'Motor Vehicle Passenger',
    '3': 'Pedestrian',
    '4': 'Bicyclist',
    '5': 'Motorcyclist',
}

# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['p_user'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['p_user'] = df['p_user'].replace(roaduser_dict)

In [66]:
# Frequency of each road-user-class label
df['p_user'].value_counts()

Motor Vehicle Driver       4234272
Motor Vehicle Passenger    1772482
Pedestrian                  258189
Motorcyclist                144140
Bicyclist                   141776
Name: p_user, dtype: int64

# Class distribution
The class variable is encoded as integers 1 and 2, where 1 represents fatal accidents and 2 represents non-fatal accidents. It is useful to recode it as a binary variable by replacing 2 with 0, so that 1 marks the fatal class and 0 the non-fatal class.

In [67]:
# Distinct values of the class variable as originally encoded
pd.unique(df['class'])

array([2, 1])

In [68]:
# Recode the non-fatal class from 2 to 0 so the target is binary (1 stays as is).
# Assign the result back to the column: calling replace(..., inplace=True) on the
# df['class'] selection is chained assignment, which recent pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['class'] = df['class'].replace(2, 0)

# Percentage of the class distribution
- This is an extremely imbalanced class problem, where the class of interest (in this case fatal accidents) is extremely small compared to the other class 

- The majority (negative) class contains 98% of the class distribution, whereas the minority (positive) class is only 2% of the class distribution. 

In [69]:
# Share of rows in each class, in percent. The heading previously referred to
# males and females, which does not match what is printed.
print('Percentage of records in each collision severity class')
print(df['class'].value_counts(normalize=True) * 100)

Percentage of males and females involved in collions
0    98.33906
1     1.66094
Name: class, dtype: float64


# Save the cleaned dataframe as a CSV file

In [70]:
# Persist the cleaned frame without writing the row index
clean_path = '../data/clean_data.csv'
df.to_csv(clean_path, index=False)