In [38]:
#Imports
import pandas as pd
import numpy as np

In [39]:
# Load the csv file
people_df = pd.read_csv('../LDS24 - Data/People.csv')

In [40]:
# Exploration of "People" dataframe

# Print information
print('Info:')
people_df.info()

#Print columns
print('\nColumns:')
print(people_df.columns)

#Print shape
print('\nShape:')
print(people_df.shape)


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564565 entries, 0 to 564564
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   PERSON_ID              564565 non-null  object 
 1   PERSON_TYPE            564565 non-null  object 
 2   RD_NO                  564565 non-null  object 
 3   VEHICLE_ID             553869 non-null  float64
 4   CRASH_DATE             564565 non-null  object 
 5   CITY                   420645 non-null  object 
 6   STATE                  422589 non-null  object 
 7   SEX                    557411 non-null  object 
 8   AGE                    403584 non-null  float64
 9   SAFETY_EQUIPMENT       563155 non-null  object 
 10  AIRBAG_DEPLOYED        554113 non-null  object 
 11  EJECTION               557860 non-null  object 
 12  INJURY_CLASSIFICATION  564269 non-null  object 
 13  DRIVER_ACTION          454799 non-null  object 
 14  DRIVER_VISION          454681 

In [41]:
#Get unique values per non-numeric columns
for column in people_df.select_dtypes(exclude=[np.number]):
    print(f"Unique values in {column}:")
    print(people_df[column].unique())
    print("\n")


Unique values in PERSON_ID:
['O561555' 'O561563' 'O561564' ... 'O24496' 'O481321' 'P108071']


Unique values in PERSON_TYPE:
['DRIVER' 'PASSENGER' 'PEDESTRIAN' 'BICYCLE' 'NON-MOTOR VEHICLE'
 'NON-CONTACT VEHICLE']


Unique values in RD_NO:
['JC113649' 'JC113627' 'JC113637' ... 'HZ164689' 'HZ122950' 'JB442550']


Unique values in CRASH_DATE:
['01/12/2019 12:01:00 AM' '01/11/2019 11:36:00 PM'
 '01/11/2019 11:31:00 PM' ... '02/24/2014 07:45:00 PM'
 '01/21/2014 07:40:00 AM' '01/18/2014 06:14:00 PM']


Unique values in CITY:
[nan 'CHICAGO' 'BERWYN' ... 'MORTAN GROVE' 'FORST' 'WYNNEWOOD']


Unique values in STATE:
[nan 'IL' 'IN' 'XX' 'MI' 'TX' 'CO' 'GA' 'FL' 'NJ' 'IA' 'MO' 'NY' 'MA' 'CA'
 'AR' 'MN' 'WI' 'WA' 'LA' 'MD' 'ME' 'OH' 'UT' 'NC' 'NV' 'TN' 'DE' 'VA'
 'AZ' 'MS' 'KS' 'PA' 'SC' 'ID' 'WY' 'KY' 'DC' 'NB' 'OK' 'NM' 'AL' 'MT'
 'HI' 'OR' 'RI' 'ND' 'WV' 'NH' 'CT' 'SD' 'AK' 'VT']


Unique values in SEX:
['X' 'M' 'F' nan 'U']


Unique values in SAFETY_EQUIPMENT:
['USAGE UNKNOWN' 'SAFETY BELT US

In [42]:
#Check missing values
print('Missing values per column:')
print(people_df.isna().sum())

columns_with_missing_values = people_df.columns[people_df.isnull().any()].tolist()
print('\nColumns with missing values:')
print(columns_with_missing_values)

Missing values per column:
PERSON_ID                     0
PERSON_TYPE                   0
RD_NO                         0
VEHICLE_ID                10696
CRASH_DATE                    0
CITY                     143920
STATE                    141976
SEX                        7154
AGE                      160981
SAFETY_EQUIPMENT           1410
AIRBAG_DEPLOYED           10452
EJECTION                   6705
INJURY_CLASSIFICATION       296
DRIVER_ACTION            109766
DRIVER_VISION            109884
PHYSICAL_CONDITION       109501
BAC_RESULT               108833
DAMAGE_CATEGORY               0
DAMAGE                    74309
dtype: int64

Columns with missing values:
['VEHICLE_ID', 'CITY', 'STATE', 'SEX', 'AGE', 'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION', 'BAC_RESULT', 'DAMAGE']


In [43]:
# Check if there are observations where city is known but state is NaN
print('Count of observations that have the \"CITY\" column filled but not the \"STATE\" column:')
count_rows_without_states= people_df[columns_with_missing_values].query('CITY == CITY and STATE != STATE')[['CITY', 'STATE']].shape[0]
print(count_rows_without_states)
if count_rows_without_states > 0:
    print('First 5 rows that have the \"CITY\" column filled but not the \"STATE\" column:')
    print(people_df[columns_with_missing_values].query('CITY == CITY and STATE != STATE')[['CITY', 'STATE']].head(5))

# Check if there are observations where state is known but city is NaN
print('Count of observations that have the \"STATE\" column filled but not the \"CITY\" column:')
count_rows_without_cities = people_df[columns_with_missing_values].query('CITY != CITY and STATE == STATE')[['CITY', 'STATE']].shape[0]
print(count_rows_without_cities)
if count_rows_without_cities > 0:
    print('First 5 rows that have the \"STATE\" column filled but not the \"CITY\" column:')
    print(people_df[columns_with_missing_values].query('CITY != CITY and STATE == STATE')[['CITY', 'STATE']].head(5))
    

# Check if there are observations where the type is "PASSENGER" and ""DRIVER_ACTION" or "DRIVER_VERSION" are filled
print('Count of observations that have the \"PERSON_TYPE\" as "PASSENGER" and the columns  \"DRIVER_ACTION\" or \"DRIVER_VERSION\"  filled')
count_passenger_with_driver_info = people_df.query('PERSON_TYPE == "PASSENGER" and (DRIVER_ACTION == DRIVER_ACTION or DRIVER_VISION == DRIVER_VISION)').shape[0]
print(count_passenger_with_driver_info)
if count_rows_without_cities > 0:
    print(people_df.query('PERSON_TYPE == "PASSENGER" and (DRIVER_ACTION == DRIVER_ACTION or DRIVER_VISION == DRIVER_VISION)').head(5))

Count of observations that have the "CITY" column filled but not the "STATE" column:
9005
First 5 rows that have the "CITY" column filled but not the "STATE" column:
                 CITY STATE
727           CHICAGO   NaN
755           CHICAGO   NaN
757   LAURIER-STATION   NaN
996           CHICAGO   NaN
1003          CHICAGO   NaN
Count of observations that have the "STATE" column filled but not the "CITY" column:
10949
First 5 rows that have the "STATE" column filled but not the "CITY" column:
    CITY STATE
39   NaN    IL
61   NaN    IL
132  NaN    XX
199  NaN    IL
200  NaN    IL
Count of observations that have the "PERSON_TYPE" as "PASSENGER" and the columns  "DRIVER_ACTION" or "DRIVER_VERSION"  filled
0
Empty DataFrame
Columns: [PERSON_ID, PERSON_TYPE, RD_NO, VEHICLE_ID, CRASH_DATE, CITY, STATE, SEX, AGE, SAFETY_EQUIPMENT, AIRBAG_DEPLOYED, EJECTION, INJURY_CLASSIFICATION, DRIVER_ACTION, DRIVER_VISION, PHYSICAL_CONDITION, BAC_RESULT, DAMAGE_CATEGORY, DAMAGE]
Index: []
