In [1]:
#Imports
import pandas as pd

In [None]:
# Read the People CSV (path relative to the notebook's working directory)
# into the DataFrame used by the following cells.
people_csv_path = '../data/People.csv'
people_df = pd.read_csv(people_csv_path)


In [None]:
# Report how many NaN entries each column holds. The per-column counts are
# computed once and reused to derive the list of affected columns.
print('Missing values per column:')
missing_counts = people_df.isna().sum()
print(missing_counts)

# Names of the columns with at least one NaN, in the DataFrame's column order.
columns_with_missing_values = missing_counts[missing_counts > 0].index.tolist()
print('\nColumns with missing values:')
print(columns_with_missing_values)

Missing values per column:
PERSON_ID                     0
PERSON_TYPE                   0
RD_NO                         0
VEHICLE_ID                10696
CRASH_DATE                    0
CITY                     143920
STATE                    141976
SEX                        7154
AGE                      160981
SAFETY_EQUIPMENT           1410
AIRBAG_DEPLOYED           10452
EJECTION                   6705
INJURY_CLASSIFICATION       296
DRIVER_ACTION            109766
DRIVER_VISION            109884
PHYSICAL_CONDITION       109501
BAC_RESULT               108833
DAMAGE_CATEGORY               0
DAMAGE                    74309
dtype: int64

Columns with missing values:
['VEHICLE_ID', 'CITY', 'STATE', 'SEX', 'AGE', 'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION', 'BAC_RESULT', 'DAMAGE']


In [4]:
# Load '../data/People_Processed.csv' and count the NaN entries it still has
# in the columns listed in columns_with_missing_values.
print('Missing values per column:')
people_df_processed = pd.read_csv('../data/People_Processed.csv')
remaining_missing_counts = people_df_processed[columns_with_missing_values].isna().sum()
print(remaining_missing_counts)

Missing values per column:
VEHICLE_ID                10372
CITY                          0
STATE                    134364
SEX                           0
AGE                      161028
SAFETY_EQUIPMENT              0
AIRBAG_DEPLOYED               0
EJECTION                      0
INJURY_CLASSIFICATION       282
DRIVER_ACTION            108833
DRIVER_VISION            108833
PHYSICAL_CONDITION            0
BAC_RESULT               108833
DAMAGE                    74309
dtype: int64


In [6]:
# Print the value counts of every column that has missing values.
# Iterate over the column names directly: looping over
# people_df[columns_with_missing_values] yields exactly the same labels but
# first builds an intermediate sub-DataFrame that is never used.
for col in columns_with_missing_values:
    print(people_df[col].value_counts())
    print('------')

VEHICLE_ID
332155.0    60
366311.0    44
162199.0    44
25920.0     43
412312.0    41
            ..
338497.0     1
338458.0     1
338463.0     1
338516.0     1
460661.0     1
Name: count, Length: 450064, dtype: int64
------
CITY
CHICAGO           295710
CICERO              3988
SKOKIE              3026
EVANSTON            2586
BERWYN              2430
                   ...  
CALUMETE CITY          1
NIAGARA FALLS          1
WHITNEY                1
MOINT PROSPECT         1
WYNNEWOOD              1
Name: count, Length: 6877, dtype: int64
------
STATE
IL    400648
IN      6318
XX      2534
WI      1887
MI      1635
FL       958
CA       765
TX       762
OH       757
IA       585
GA       491
MO       475
MN       454
NY       409
WA       316
TN       309
AZ       232
NC       226
CO       209
KY       199
PA       198
NJ       174
VA       172
MA       163
MD       152
AL       151
ID       131
MS       130
AR       111
NB       103
NV       103
OK       101
LA       101
KS       100


In [14]:
# Among the columns with missing values, find those that contain a value with
# the substring "unknown" (any letter case), i.e. columns that use an explicit
# placeholder when the information is not known, and print their value counts.
# The column names are iterated directly instead of looping over a
# sub-DataFrame built only to obtain the same labels.
for col in columns_with_missing_values:
    # astype(str) lets the .str accessor run on non-string columns as well.
    has_unknown_value = (
        people_df[col]
        .astype(str)
        .str.contains('unknown', case=False, na=False)
        .any()
    )
    if has_unknown_value:
        print(f'Column name: {col}')
        print('---')
        print(f'{col} value counts')
        print(people_df[col].value_counts())
        

Column name: CITY
---
CITY value counts
CITY
CHICAGO           295710
CICERO              3988
SKOKIE              3026
EVANSTON            2586
BERWYN              2430
                   ...  
CALUMETE CITY          1
NIAGARA FALLS          1
WHITNEY                1
MOINT PROSPECT         1
WYNNEWOOD              1
Name: count, Length: 6877, dtype: int64
Column name: SAFETY_EQUIPMENT
---
SAFETY_EQUIPMENT value counts
SAFETY_EQUIPMENT
SAFETY BELT USED                   299040
USAGE UNKNOWN                      234298
NONE PRESENT                        17022
CHILD RESTRAINT USED                 6605
SAFETY BELT NOT USED                 3418
HELMET USED                          1284
HELMET NOT USED                      1100
CHILD RESTRAINT NOT USED              348
CHILD RESTRAINT USED IMPROPERLY        40
Name: count, dtype: int64
Column name: AIRBAG_DEPLOYED
---
AIRBAG_DEPLOYED value counts
AIRBAG_DEPLOYED
DID NOT DEPLOY                            374039
DEPLOYMENT UNKNOWN          

'PEOPLE' columns that provide an 'UNKNOWN' value
From the columns with missing values:

1. CITY
2. SAFETY_EQUIPMENT
3. AIRBAG_DEPLOYED
4. EJECTION
5. DRIVER_ACTION
6. DRIVER_VISION
7. PHYSICAL_CONDITION
8. BAC_RESULT

## Queries to explore pairings in columns with missing values

In [8]:
# Check if there are observations where city is known but state is NaN.
# The filter uses explicit notna()/isna() masks (instead of the
# "X == X" / "X != X" NaN self-comparison trick) and is evaluated once; the
# result is reused for both the count and the preview.
print('Count of observations that have the "CITY" column filled but not the "STATE" column:')
rows_without_states = people_df.loc[
    people_df['CITY'].notna() & people_df['STATE'].isna(),
    ['CITY', 'STATE'],
]
count_rows_without_states = len(rows_without_states)
print(count_rows_without_states)
if count_rows_without_states > 0:
    print('First 5 rows that have the "CITY" column filled but not the "STATE" column:')
    print(rows_without_states.head(5))

Count of observations that have the "CITY" column filled but not the "STATE" column:
9005
First 5 rows that have the "CITY" column filled but not the "STATE" column:
                 CITY STATE
727           CHICAGO   NaN
755           CHICAGO   NaN
757   LAURIER-STATION   NaN
996           CHICAGO   NaN
1003          CHICAGO   NaN


In [9]:
# Check if there are observations where state is known but city is NaN.
# The filter uses explicit isna()/notna() masks (instead of the
# "X != X" / "X == X" NaN self-comparison trick) and is evaluated once; the
# result is reused for both the count and the preview.
print('Count of observations that have the "STATE" column filled but not the "CITY" column:')
rows_without_cities = people_df.loc[
    people_df['CITY'].isna() & people_df['STATE'].notna(),
    ['CITY', 'STATE'],
]
count_rows_without_cities = len(rows_without_cities)
print(count_rows_without_cities)
if count_rows_without_cities > 0:
    print('First 5 rows that have the "STATE" column filled but not the "CITY" column:')
    print(rows_without_cities.head(5))

Count of observations that have the "STATE" column filled but not the "CITY" column:
10949
First 5 rows that have the "STATE" column filled but not the "CITY" column:
    CITY STATE
39   NaN    IL
61   NaN    IL
132  NaN    XX
199  NaN    IL
200  NaN    IL


In [10]:
# Check if there are observations where the type is "PASSENGER" and
# "DRIVER_ACTION" or "DRIVER_VISION" are filled.
# The printed label names DRIVER_VISION, the column that is actually tested
# (it previously said "DRIVER_VERSION"). The filter is evaluated once and
# reused for both the count and the preview.
print('Count of observations that have the "PERSON_TYPE" as "PASSENGER" and the columns  "DRIVER_ACTION" or "DRIVER_VISION" filled:')
passengers_with_driver_info = people_df[
    people_df['PERSON_TYPE'].eq('PASSENGER')
    & (people_df['DRIVER_ACTION'].notna() | people_df['DRIVER_VISION'].notna())
]
count_passenger_with_driver_info = len(passengers_with_driver_info)
print(count_passenger_with_driver_info)
if count_passenger_with_driver_info > 0:
    print(passengers_with_driver_info.head(5))

Count of observations that have the "PERSON_TYPE" as "PASSENGER" and the columns  "DRIVER_ACTION" or "DRIVER_VERSION" filled:
0


In [11]:
# Check if there are observations where "AGE" is below the threshold and
# "PERSON_TYPE" is "DRIVER" (anomalies).
# The threshold is defined once so the printed label and the filter cannot
# drift apart, and the filter is evaluated once for both the count and the
# listing. Rows with a NaN AGE never satisfy the "<" comparison.
DRIVER_AGE_THRESHOLD = 10
print(f'Count of observations that have the "AGE" < {DRIVER_AGE_THRESHOLD} and "PERSON_TYPE" as "DRIVER":')
underage_drivers = people_df.loc[
    people_df['AGE'].lt(DRIVER_AGE_THRESHOLD) & people_df['PERSON_TYPE'].eq('DRIVER'),
    ['AGE', 'PERSON_TYPE'],
]
# Name kept as-is in case other cells refer to it; it counts every driver
# below the threshold, not only those with AGE == 0.
count_age_0 = len(underage_drivers)
print(count_age_0)
if count_age_0 > 0:
    print(underage_drivers)

Count of observations that have the "AGE" < 10 and "PERSON_TYPE" as "DRIVER":
3720
        AGE PERSON_TYPE
334     0.0      DRIVER
369     0.0      DRIVER
409     0.0      DRIVER
447     0.0      DRIVER
1020    0.0      DRIVER
...     ...         ...
562405  0.0      DRIVER
562922  0.0      DRIVER
563039  0.0      DRIVER
563147  0.0      DRIVER
563321  0.0      DRIVER

[3720 rows x 2 columns]
