# Pre-Processing and Cleaning of Data

In [147]:
import pandas as pd
from pathlib import Path

In [148]:
dir_path = Path().cwd().parent
raw_data_path = Path('data_files/raw_data')

In [149]:
crash_data = pd.read_csv(str(dir_path / raw_data_path /  Path("Motor_Vehicle_Collisions_-_Crashes.csv")), low_memory=False)
person_data = pd.read_csv(str(dir_path / raw_data_path / Path("Motor_Vehicle_Collisions_-_Person.csv")), low_memory=False)

In [150]:
print("Crash Data:")
display(crash_data.head())

Crash Data:


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,09/11/2021,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,...,,,,,4456314,Sedan,,,,
4,12/14/2021,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,...,,,,,4486609,,,,,


In [151]:
# display datatypes and range
print("\nCrash Data Info:")
crash_data.info()


Crash Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2125763 entries, 0 to 2125762
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH DATE                     object 
 1   CRASH TIME                     object 
 2   BOROUGH                        object 
 3   ZIP CODE                       object 
 4   LATITUDE                       float64
 5   LONGITUDE                      float64
 6   LOCATION                       object 
 7   ON STREET NAME                 object 
 8   CROSS STREET NAME              object 
 9   OFF STREET NAME                object 
 10  NUMBER OF PERSONS INJURED      float64
 11  NUMBER OF PERSONS KILLED       float64
 12  NUMBER OF PEDESTRIANS INJURED  int64  
 13  NUMBER OF PEDESTRIANS KILLED   int64  
 14  NUMBER OF CYCLIST INJURED      int64  
 15  NUMBER OF CYCLIST KILLED       int64  
 16  NUMBER OF MOTORIST INJURED     int64  
 17  NUMBER OF MOTORIST KILLED   

In [152]:
crash_data.columns = crash_data.columns.str.replace(" ", "_")
display(crash_data.columns)

Index(['CRASH_DATE', 'CRASH_TIME', 'BOROUGH', 'ZIP_CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON_STREET_NAME', 'CROSS_STREET_NAME',
       'OFF_STREET_NAME', 'NUMBER_OF_PERSONS_INJURED',
       'NUMBER_OF_PERSONS_KILLED', 'NUMBER_OF_PEDESTRIANS_INJURED',
       'NUMBER_OF_PEDESTRIANS_KILLED', 'NUMBER_OF_CYCLIST_INJURED',
       'NUMBER_OF_CYCLIST_KILLED', 'NUMBER_OF_MOTORIST_INJURED',
       'NUMBER_OF_MOTORIST_KILLED', 'CONTRIBUTING_FACTOR_VEHICLE_1',
       'CONTRIBUTING_FACTOR_VEHICLE_2', 'CONTRIBUTING_FACTOR_VEHICLE_3',
       'CONTRIBUTING_FACTOR_VEHICLE_4', 'CONTRIBUTING_FACTOR_VEHICLE_5',
       'COLLISION_ID', 'VEHICLE_TYPE_CODE_1', 'VEHICLE_TYPE_CODE_2',
       'VEHICLE_TYPE_CODE_3', 'VEHICLE_TYPE_CODE_4', 'VEHICLE_TYPE_CODE_5'],
      dtype='object')

### Cleaning the Location related Columns

In [153]:
# Sub-setting the crash DataFrame for only the columns I need
location_df = crash_data[['COLLISION_ID', 'CRASH_DATE', 'CRASH_TIME', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'NUMBER_OF_PERSONS_INJURED',
       'NUMBER_OF_PERSONS_KILLED', 'CONTRIBUTING_FACTOR_VEHICLE_1',
       'CONTRIBUTING_FACTOR_VEHICLE_2']]

In [154]:
display(location_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2125763 entries, 0 to 2125762
Data columns (total 10 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   COLLISION_ID                   int64  
 1   CRASH_DATE                     object 
 2   CRASH_TIME                     object 
 3   LATITUDE                       float64
 4   LONGITUDE                      float64
 5   LOCATION                       object 
 6   NUMBER_OF_PERSONS_INJURED      float64
 7   NUMBER_OF_PERSONS_KILLED       float64
 8   CONTRIBUTING_FACTOR_VEHICLE_1  object 
 9   CONTRIBUTING_FACTOR_VEHICLE_2  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 162.2+ MB


None

In [155]:
# Check for missing values 
print("Missing values in Location Data:")
print(location_df.isnull().sum())

Missing values in Location Data:
COLLISION_ID                          0
CRASH_DATE                            0
CRASH_TIME                            0
LATITUDE                         252597
LONGITUDE                        252597
LOCATION                         252597
NUMBER_OF_PERSONS_INJURED            18
NUMBER_OF_PERSONS_KILLED             31
CONTRIBUTING_FACTOR_VEHICLE_1      7144
CONTRIBUTING_FACTOR_VEHICLE_2    333073
dtype: int64


In [156]:
def create_accident_severity(row):
    if row['NUMBER_OF_PERSONS_KILLED'] > 0:
        return 'Fatal'
    elif row['NUMBER_OF_PERSONS_INJURED'] >= 3:
        return 'Major Injury'
    elif row['NUMBER_OF_PERSONS_INJURED'] > 0:
        return 'Minor Injury'
    else:
        return 'No Injury'

In [157]:
# Creating Encoding Data
location_df['ACCIDENT_SEVERITY'] = location_df.apply(create_accident_severity, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df['ACCIDENT_SEVERITY'] = location_df.apply(create_accident_severity, axis=1)


In [158]:
# Drop all the rows where location is missing
location_df.dropna(subset=['LOCATION'], how='all', inplace=True)
location_df = location_df.query('LOCATION != (0.0, 0.0)').copy()
location_df.drop(columns='LOCATION', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df.dropna(subset=['LOCATION'], how='all', inplace=True)


In [159]:
location_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_INJURED,NUMBER_OF_PERSONS_KILLED,CONTRIBUTING_FACTOR_VEHICLE_1,CONTRIBUTING_FACTOR_VEHICLE_2,ACCIDENT_SEVERITY
3,4456314,09/11/2021,9:35,40.667202,-73.8665,0.0,0.0,Unspecified,,No Injury
4,4486609,12/14/2021,8:13,40.683304,-73.917274,0.0,0.0,,,No Injury
6,4486555,12/14/2021,17:05,40.709183,-73.956825,0.0,0.0,Passing Too Closely,Unspecified,No Injury
7,4486660,12/14/2021,8:17,40.86816,-73.83148,2.0,0.0,Unspecified,Unspecified,Minor Injury
8,4487074,12/14/2021,21:10,40.67172,-73.8971,0.0,0.0,Driver Inexperience,Unspecified,No Injury


In [160]:
# Cleaning of Contributing Factors
location_df = location_df[location_df[['CONTRIBUTING_FACTOR_VEHICLE_1', 'CONTRIBUTING_FACTOR_VEHICLE_2']].notnull().all(axis=1)].copy()

location_df = location_df.query(
"(CONTRIBUTING_FACTOR_VEHICLE_1 != 'Unspecified' | CONTRIBUTING_FACTOR_VEHICLE_2 != 'Unspecified') & "
"(CONTRIBUTING_FACTOR_VEHICLE_1 != '80' & CONTRIBUTING_FACTOR_VEHICLE_2 != '80') & (CONTRIBUTING_FACTOR_VEHICLE_1 != '1' & CONTRIBUTING_FACTOR_VEHICLE_2 != '1')").copy()

rename_dict ={
    'Cell Phone (hand-held)': 'Cell Phone (Hand-Held)',
    'Cell Phone (hand-Held)': 'Cell Phone (Hand-Held)',
    'Cell Phone (hands-free)': 'Cell Phone (Hands-Free)',
    'Drugs (illegal)': 'Drugs (Illegal)', 
    'Illnes': 'Illness'
}
location_df['CONTRIBUTING_FACTOR_VEHICLE_1'] = location_df['CONTRIBUTING_FACTOR_VEHICLE_1'].replace(rename_dict).copy()
location_df['CONTRIBUTING_FACTOR_VEHICLE_2'] = location_df['CONTRIBUTING_FACTOR_VEHICLE_2'].replace(rename_dict).copy()

In [161]:
# fill nulls with 'UNKNOWN' or 0 
location_df.fillna({
    "NUMBER_OF_PERSONS_INJURED":  0.0,
    "NUMBER_OF_PERSONS_KILLED":  0.0,
}, inplace=True)

print("Location Data after cleaning:")
print(location_df.isnull().sum())

Location Data after cleaning:
COLLISION_ID                     0
CRASH_DATE                       0
CRASH_TIME                       0
LATITUDE                         0
LONGITUDE                        0
NUMBER_OF_PERSONS_INJURED        0
NUMBER_OF_PERSONS_KILLED         0
CONTRIBUTING_FACTOR_VEHICLE_1    0
CONTRIBUTING_FACTOR_VEHICLE_2    0
ACCIDENT_SEVERITY                0
dtype: int64


In [162]:
location_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_INJURED,NUMBER_OF_PERSONS_KILLED,CONTRIBUTING_FACTOR_VEHICLE_1,CONTRIBUTING_FACTOR_VEHICLE_2,ACCIDENT_SEVERITY
6,4486555,12/14/2021,17:05,40.709183,-73.956825,0.0,0.0,Passing Too Closely,Unspecified,No Injury
8,4487074,12/14/2021,21:10,40.67172,-73.8971,0.0,0.0,Driver Inexperience,Unspecified,No Injury
9,4486519,12/14/2021,14:58,40.75144,-73.97397,0.0,0.0,Passing Too Closely,Unspecified,No Injury
10,4486934,12/13/2021,0:34,40.701275,-73.88887,0.0,0.0,Passing or Lane Usage Improper,Unspecified,No Injury
11,4487127,12/14/2021,16:50,40.675884,-73.75577,0.0,0.0,Turning Improperly,Unspecified,No Injury


In [163]:
import datetime as dt
# Handle Types
location_df['CRASH_DATE'] = pd.to_datetime(location_df['CRASH_DATE'])
location_df['CRASH_TIME'] = pd.to_datetime(location_df['CRASH_TIME'], format='%H:%M').dt.time

location_df = location_df.astype({
        'NUMBER_OF_PERSONS_INJURED': 'int', 
        'NUMBER_OF_PERSONS_KILLED': 'int',
        })

In [164]:
location_df = location_df.query("(40.5774 <= LATITUDE <= 45.01585) & (-74.2591 <= LONGITUDE <= -73.7004)").copy()

In [165]:
location_df.reset_index(drop=True, inplace=True)
location_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_INJURED,NUMBER_OF_PERSONS_KILLED,CONTRIBUTING_FACTOR_VEHICLE_1,CONTRIBUTING_FACTOR_VEHICLE_2,ACCIDENT_SEVERITY
0,4486555,2021-12-14,17:05:00,40.709183,-73.956825,0,0,Passing Too Closely,Unspecified,No Injury
1,4487074,2021-12-14,21:10:00,40.67172,-73.8971,0,0,Driver Inexperience,Unspecified,No Injury
2,4486519,2021-12-14,14:58:00,40.75144,-73.97397,0,0,Passing Too Closely,Unspecified,No Injury
3,4486934,2021-12-13,00:34:00,40.701275,-73.88887,0,0,Passing or Lane Usage Improper,Unspecified,No Injury
4,4487127,2021-12-14,16:50:00,40.675884,-73.75577,0,0,Turning Improperly,Unspecified,No Injury


In [166]:
location_df.to_csv(str(dir_path / Path('data_files/clean_data/location_data.csv')), index=False)

In [167]:
person_data.head()

Unnamed: 0,UNIQUE_ID,COLLISION_ID,CRASH_DATE,CRASH_TIME,PERSON_ID,PERSON_TYPE,PERSON_INJURY,VEHICLE_ID,PERSON_AGE,EJECTION,...,BODILY_INJURY,POSITION_IN_VEHICLE,SAFETY_EQUIPMENT,PED_LOCATION,PED_ACTION,COMPLAINT,PED_ROLE,CONTRIBUTING_FACTOR_1,CONTRIBUTING_FACTOR_2,PERSON_SEX
0,10249006,4229554,10/26/2019,9:43,31aa2bc0-f545-444f-8cdb-f1cb5cf00b89,Occupant,Unspecified,19141108.0,,,...,,,,,,,Registrant,,,U
1,10255054,4230587,10/25/2019,15:15,4629e500-a73e-48dc-b8fb-53124d124b80,Occupant,Unspecified,19144075.0,33.0,Not Ejected,...,Does Not Apply,"Front passenger, if two or more persons, inclu...",Lap Belt & Harness,,,Does Not Apply,Passenger,,,F
2,10253177,4230550,10/26/2019,17:55,ae48c136-1383-45db-83f4-2a5eecfb7cff,Occupant,Unspecified,19143133.0,55.0,,...,,,,,,,Registrant,,,M
3,6650180,3565527,11/21/2016,13:05,2782525,Occupant,Unspecified,,,,...,,,,,,,Notified Person,,,
4,10255516,4231168,10/25/2019,11:16,e038e18f-40fb-4471-99cf-345eae36e064,Occupant,Unspecified,19144329.0,7.0,Not Ejected,...,Does Not Apply,Right rear passenger or motorcycle sidecar pas...,Lap Belt,,,Does Not Apply,Passenger,,,F


In [168]:
# Gather info about People involved in accident
person_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5495040 entries, 0 to 5495039
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   UNIQUE_ID              int64  
 1   COLLISION_ID           int64  
 2   CRASH_DATE             object 
 3   CRASH_TIME             object 
 4   PERSON_ID              object 
 5   PERSON_TYPE            object 
 6   PERSON_INJURY          object 
 7   VEHICLE_ID             float64
 8   PERSON_AGE             float64
 9   EJECTION               object 
 10  EMOTIONAL_STATUS       object 
 11  BODILY_INJURY          object 
 12  POSITION_IN_VEHICLE    object 
 13  SAFETY_EQUIPMENT       object 
 14  PED_LOCATION           object 
 15  PED_ACTION             object 
 16  COMPLAINT              object 
 17  PED_ROLE               object 
 18  CONTRIBUTING_FACTOR_1  object 
 19  CONTRIBUTING_FACTOR_2  object 
 20  PERSON_SEX             object 
dtypes: float64(2), int64(2), object(17)
memory usage: 880.

In [169]:
# Identify which columns are null
person_data.isnull().sum()

UNIQUE_ID                      0
COLLISION_ID                   0
CRASH_DATE                     0
CRASH_TIME                     0
PERSON_ID                     19
PERSON_TYPE                    0
PERSON_INJURY                  0
VEHICLE_ID                224115
PERSON_AGE                597119
EJECTION                 2673321
EMOTIONAL_STATUS         2587450
BODILY_INJURY            2587407
POSITION_IN_VEHICLE      2672929
SAFETY_EQUIPMENT         2855212
PED_LOCATION             5404189
PED_ACTION               5404290
COMPLAINT                2587400
PED_ROLE                  194889
CONTRIBUTING_FACTOR_1    5405540
CONTRIBUTING_FACTOR_2    5405657
PERSON_SEX                613524
dtype: int64

In [170]:
# Subsetting Data to focus on relevant columns
person_detail_df = person_data[['COLLISION_ID', 'CRASH_DATE', 'CRASH_TIME', 'PERSON_AGE', 'PERSON_TYPE', 'PERSON_SEX', 'PERSON_INJURY', 'EMOTIONAL_STATUS', 'BODILY_INJURY']]

person_detail_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,PERSON_AGE,PERSON_TYPE,PERSON_SEX,PERSON_INJURY,EMOTIONAL_STATUS,BODILY_INJURY
0,4229554,10/26/2019,9:43,,Occupant,U,Unspecified,,
1,4230587,10/25/2019,15:15,33.0,Occupant,F,Unspecified,Does Not Apply,Does Not Apply
2,4230550,10/26/2019,17:55,55.0,Occupant,M,Unspecified,,
3,3565527,11/21/2016,13:05,,Occupant,,Unspecified,,
4,4231168,10/25/2019,11:16,7.0,Occupant,F,Unspecified,Does Not Apply,Does Not Apply


In [171]:
# Drop the null Values
person_detail_df = person_detail_df.dropna(subset=['PERSON_AGE', 'PERSON_SEX'])
person_detail_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,PERSON_AGE,PERSON_TYPE,PERSON_SEX,PERSON_INJURY,EMOTIONAL_STATUS,BODILY_INJURY
1,4230587,10/25/2019,15:15,33.0,Occupant,F,Unspecified,Does Not Apply,Does Not Apply
2,4230550,10/26/2019,17:55,55.0,Occupant,M,Unspecified,,
4,4231168,10/25/2019,11:16,7.0,Occupant,F,Unspecified,Does Not Apply,Does Not Apply
5,4230743,10/24/2019,19:15,27.0,Occupant,M,Injured,Conscious,Back
6,4230047,10/26/2019,16:45,41.0,Occupant,F,Unspecified,,


In [172]:
# Correct the Data Types
print('Before the conversion: ')
display(person_detail_df.info())

person_detail_df['CRASH_DATE'] = pd.to_datetime(person_detail_df['CRASH_DATE'])
person_detail_df['CRASH_TIME'] = pd.to_datetime(person_detail_df['CRASH_TIME'], format='%H:%M').dt.time
person_detail_df['PERSON_AGE'] = person_detail_df['PERSON_AGE'].astype(int)

print('After the conversion: ')
display(person_detail_df.info())

person_detail_df.head()

Before the conversion: 
<class 'pandas.core.frame.DataFrame'>
Index: 4566749 entries, 1 to 5495039
Data columns (total 9 columns):
 #   Column            Dtype  
---  ------            -----  
 0   COLLISION_ID      int64  
 1   CRASH_DATE        object 
 2   CRASH_TIME        object 
 3   PERSON_AGE        float64
 4   PERSON_TYPE       object 
 5   PERSON_SEX        object 
 6   PERSON_INJURY     object 
 7   EMOTIONAL_STATUS  object 
 8   BODILY_INJURY     object 
dtypes: float64(1), int64(1), object(7)
memory usage: 348.4+ MB


None

After the conversion: 
<class 'pandas.core.frame.DataFrame'>
Index: 4566749 entries, 1 to 5495039
Data columns (total 9 columns):
 #   Column            Dtype         
---  ------            -----         
 0   COLLISION_ID      int64         
 1   CRASH_DATE        datetime64[ns]
 2   CRASH_TIME        object        
 3   PERSON_AGE        int64         
 4   PERSON_TYPE       object        
 5   PERSON_SEX        object        
 6   PERSON_INJURY     object        
 7   EMOTIONAL_STATUS  object        
 8   BODILY_INJURY     object        
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 348.4+ MB


None

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,PERSON_AGE,PERSON_TYPE,PERSON_SEX,PERSON_INJURY,EMOTIONAL_STATUS,BODILY_INJURY
1,4230587,2019-10-25,15:15:00,33,Occupant,F,Unspecified,Does Not Apply,Does Not Apply
2,4230550,2019-10-26,17:55:00,55,Occupant,M,Unspecified,,
4,4231168,2019-10-25,11:16:00,7,Occupant,F,Unspecified,Does Not Apply,Does Not Apply
5,4230743,2019-10-24,19:15:00,27,Occupant,M,Injured,Conscious,Back
6,4230047,2019-10-26,16:45:00,41,Occupant,F,Unspecified,,


In [173]:
display(person_detail_df['PERSON_INJURY'].value_counts())

person_detail_df = person_detail_df.query("PERSON_INJURY != 'Unspecified'")

display(person_detail_df)

PERSON_INJURY
Unspecified    4089653
Injured         474855
Killed            2241
Name: count, dtype: int64

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,PERSON_AGE,PERSON_TYPE,PERSON_SEX,PERSON_INJURY,EMOTIONAL_STATUS,BODILY_INJURY
5,4230743,2019-10-24,19:15:00,27,Occupant,M,Injured,Conscious,Back
7,4229547,2019-10-26,01:15:00,24,Pedestrian,F,Injured,Conscious,Shoulder - Upper Arm
18,4230715,2019-10-26,08:50:00,42,Bicyclist,M,Injured,Unknown,Knee-Lower Leg Foot
31,4230376,2019-10-26,19:40:00,36,Bicyclist,M,Injured,Conscious,Back
44,4229773,2019-10-26,16:50:00,50,Occupant,F,Injured,Conscious,Head
...,...,...,...,...,...,...,...,...,...
5495010,4762077,2024-10-08,12:18:00,18,Pedestrian,M,Injured,Conscious,Elbow-Lower-Arm-Hand
5495012,4762024,2024-10-08,08:20:00,52,Bicyclist,M,Injured,Conscious,Back
5495023,4762519,2024-10-08,21:15:00,52,Other Motorized,F,Injured,Conscious,Head
5495026,4762145,2024-10-08,07:35:00,42,Other Motorized,F,Injured,Conscious,Neck


In [174]:
# Fixing the Emotional Status Column

person_detail_df['EMOTIONAL_STATUS'] = person_detail_df['EMOTIONAL_STATUS'].replace({'Does Not Apply': 'Unknown'})

display(person_detail_df['EMOTIONAL_STATUS'].value_counts())


EMOTIONAL_STATUS
Conscious         439708
Unknown            14528
Shock              14018
Semiconscious       2794
Unconscious         2602
Incoherent          1805
Apparent Death      1558
Name: count, dtype: int64

In [175]:
# Change the Person Sex Column

person_detail_df['PERSON_SEX'] = person_detail_df['PERSON_SEX'].replace({
    'M': 'Male',
    'F': 'Female',
    'U': 'Unknown'
})

display(person_detail_df['PERSON_SEX'].value_counts())

PERSON_SEX
Male       275435
Female     200861
Unknown       800
Name: count, dtype: int64

In [176]:
# Drop the columns where the bodily injury is not described
person_detail_df = person_detail_df.query("BODILY_INJURY != 'Unknown'")

person_detail_df['BODILY_INJURY'].value_counts()

BODILY_INJURY
Back                    79882
Neck                    76761
Knee-Lower Leg Foot     73422
Head                    65906
Entire Body             39025
Elbow-Lower-Arm-Hand    32950
Shoulder - Upper Arm    32899
Chest                   17475
Hip-Upper Leg           16922
Face                    12831
Abdomen - Pelvis         8413
Eye                       908
Does Not Apply            636
Name: count, dtype: int64

In [177]:
person_detail_df.value_counts()

COLLISION_ID  CRASH_DATE  CRASH_TIME  PERSON_AGE  PERSON_TYPE  PERSON_SEX  PERSON_INJURY  EMOTIONAL_STATUS  BODILY_INJURY      
4148045       2019-06-07  09:50:00    7           Occupant     Female      Injured        Conscious         Head                   6
4689227       2023-12-21  11:00:00    10          Occupant     Female      Injured        Conscious         Head                   6
3678545       2017-05-26  10:00:00    11          Occupant     Female      Injured        Conscious         Entire Body            6
4148045       2019-06-07  09:50:00    8           Occupant     Female      Injured        Conscious         Head                   5
4552186       2022-07-21  05:56:00    55          Occupant     Female      Injured        Conscious         Entire Body            5
                                                                                                                                  ..
4016320       2018-11-07  18:14:00    41          Pedestrian   Female     

In [178]:
# Save to a csv file
person_detail_df.to_csv(str(dir_path / Path('data_files/clean_data/person_detail.csv')), index=False)