In [1]:
#Imports
import pandas as pd
import numpy as np
import plotly.express as px

## PEOPLE DATASET EXPLORATION

In [2]:
# Read the People dataset from its CSV file (path is relative to this notebook)
people_csv_path = '../../LDS24 - Data/People.csv'
people_df = pd.read_csv(people_csv_path)

In [11]:
# Overview of the frame: dtypes and non-null counts first
print('Info:')
people_df.info()

# Column labels (label and value are printed on separate lines via sep)
print('\nColumns:', people_df.columns, sep='\n')

# Dimensions as (rows, columns)
print('\nShape:', people_df.shape, sep='\n')


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564565 entries, 0 to 564564
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   PERSON_ID              564565 non-null  object 
 1   PERSON_TYPE            564565 non-null  object 
 2   RD_NO                  564565 non-null  object 
 3   VEHICLE_ID             553869 non-null  float64
 4   CRASH_DATE             564565 non-null  object 
 5   CITY                   420645 non-null  object 
 6   STATE                  422589 non-null  object 
 7   SEX                    557411 non-null  object 
 8   AGE                    403584 non-null  float64
 9   SAFETY_EQUIPMENT       563155 non-null  object 
 10  AIRBAG_DEPLOYED        554113 non-null  object 
 11  EJECTION               557860 non-null  object 
 12  INJURY_CLASSIFICATION  564269 non-null  object 
 13  DRIVER_ACTION          454799 non-null  object 
 14  DRIVER_VISION          454681 

In [None]:
# Show the distinct values of every non-numeric column
non_numeric_df = people_df.select_dtypes(exclude=[np.number])
for col_name in non_numeric_df.columns:
    print(f"Unique values in {col_name}:")
    print(non_numeric_df[col_name].unique())
    print("\n")

In [None]:
# Count fully duplicated rows (a row is flagged when an identical row appeared earlier)
duplicate_mask = people_df.duplicated()
duplicate_mask.sum()

In [None]:
# Number of missing values in every column
missing_counts = people_df.isna().sum()
print('Missing values per column:')
print(missing_counts)

# Names of the columns that contain at least one missing value (original column order)
columns_with_missing_values = missing_counts[missing_counts > 0].index.tolist()
print('\nColumns with missing values:', columns_with_missing_values, sep='\n')

In [None]:
# Frequency table of each column that has missing values
# (relies on `columns_with_missing_values` from the missing-values cell)
for col_name in columns_with_missing_values:
    frequencies = people_df[col_name].value_counts()
    print(frequencies)
    print('------')

In [None]:
# Among the columns with missing values, report those that already contain a value with the
# substring "unknown" (case-insensitive), i.e. that have their own marker for unknown information.
for col_name in columns_with_missing_values:
    # Cast to text so the substring search also works on non-string columns
    values_as_text = people_df[col_name].astype(str)
    has_unknown_value = values_as_text.str.contains('unknown', case=False, na=False).any()
    if has_unknown_value:
        print(f'Column name: {col_name}')
        print('---')

'PEOPLE' columns that provide an 'UNKNOWN' value
From the columns with missing values:

1. CITY
2. SAFETY_EQUIPMENT
3. AIRBAG_DEPLOYED
4. EJECTION
5. DRIVER_ACTION
6. DRIVER_VISION
7. PHYSICAL_CONDITION
8. BAC_RESULT

## Queries to explore potential value pairings

In [None]:
# Look for observations where CITY is known but STATE is missing.
# The query uses the NaN != NaN idiom: `CITY == CITY` keeps non-missing cities,
# `STATE != STATE` keeps missing states.
print('Count of observations that have the \"CITY\" column filled but not the \"STATE\" column:')
city_without_state = people_df[columns_with_missing_values].query('CITY == CITY and STATE != STATE')[['CITY', 'STATE']]
count_rows_without_states = len(city_without_state)
print(count_rows_without_states)
if count_rows_without_states > 0:
    print('First 5 rows that have the \"CITY\" column filled but not the \"STATE\" column:')
    print(city_without_state.head(5))

In [None]:
# Look for observations where STATE is known but CITY is missing.
# The query uses the NaN != NaN idiom: `CITY != CITY` keeps missing cities,
# `STATE == STATE` keeps non-missing states.
print('Count of observations that have the \"STATE\" column filled but not the \"CITY\" column:')
state_without_city = people_df[columns_with_missing_values].query('CITY != CITY and STATE == STATE')[['CITY', 'STATE']]
count_rows_without_cities = len(state_without_city)
print(count_rows_without_cities)
if count_rows_without_cities > 0:
    print('First 5 rows that have the \"STATE\" column filled but not the \"CITY\" column:')
    print(state_without_city.head(5))

In [None]:
# Check if there are observations where the type is "PASSENGER" and "DRIVER_ACTION" or "DRIVER_VISION" are filled.
# The query uses the NaN != NaN idiom: `COL == COL` is true only where COL is not missing.
print('Count of observations that have the \"PERSON_TYPE\" as "PASSENGER" and the columns \"DRIVER_ACTION\" or \"DRIVER_VISION\" filled:')
# Evaluate the query once and reuse the result for both the count and the preview
passengers_with_driver_info = people_df.query('PERSON_TYPE == "PASSENGER" and (DRIVER_ACTION == DRIVER_ACTION or DRIVER_VISION == DRIVER_VISION)')
count_passenger_with_driver_info = passengers_with_driver_info.shape[0]
print(count_passenger_with_driver_info)
if count_passenger_with_driver_info > 0:
    print(passengers_with_driver_info.head(5))

Count of observations that have the "PERSON_TYPE" as "PASSENGER" and the columns  "DRIVER_ACTION" or "DRIVER_VERSION"  filled
0


In [None]:
# Anomaly check: observations recorded as "DRIVER" whose "AGE" is below 10
print('Count of observations that have the "AGE" < 10 and \"PERSON_TYPE\" as "DRIVER":')
underage_drivers = people_df.query('AGE < 10 and PERSON_TYPE == "DRIVER"')
count_age_0 = len(underage_drivers)
print(count_age_0)
if count_age_0 > 0:
    print(underage_drivers[['AGE', 'PERSON_TYPE']])

Count of observations that have the "AGE" < 10 and "PERSON_TYPE" as "DRIVER":
3720
        AGE PERSON_TYPE
334     0.0      DRIVER
369     0.0      DRIVER
409     0.0      DRIVER
447     0.0      DRIVER
1020    0.0      DRIVER
...     ...         ...
562405  0.0      DRIVER
562922  0.0      DRIVER
563039  0.0      DRIVER
563147  0.0      DRIVER
563321  0.0      DRIVER

[3720 rows x 2 columns]


## Plots

In [None]:
# Categorical columns whose value distribution is shown as a bar chart
columns_to_plot = [
    'PERSON_TYPE',
    'STATE',
    'SEX',
    'SAFETY_EQUIPMENT',
    'AIRBAG_DEPLOYED',
    'EJECTION',
    'DRIVER_ACTION',
    'DRIVER_VISION',
    'PHYSICAL_CONDITION',
    'BAC_RESULT',
    'DAMAGE_CATEGORY',
]

# Plot one bar chart per column
for column in columns_to_plot:
    # Aggregate before plotting. Passing the raw frame with only `x` makes Plotly Express draw
    # one stacked unit-height segment per row, so the hover value is 1 instead of the category
    # total and every row becomes its own mark. Missing values are excluded (value_counts default).
    frequency_df = (
        people_df[column]
        .value_counts()
        .rename_axis(column)
        .reset_index(name='Frequency')
    )
    fig = px.bar(frequency_df,
                 x=column,
                 y='Frequency',
                 title=f'Distribution of {column}')
    # Customize hover information: %{y} is now the aggregated count of the hovered category
    fig.update_traces(hovertemplate='Frequency: %{y}<extra></extra>')
    fig.update_xaxes(tickangle=45)
    fig.show()

In [None]:
# Histogram of the AGE column, grouped into at most 20 bins
age_histogram_options = dict(
    x='AGE',
    title='Age Distribution',
    labels={'AGE': 'Age', 'count': 'Frequency'},
    nbins=20,
)
fig = px.histogram(people_df, **age_histogram_options)
# Show only the bar height in the hover box
fig.update_traces(hovertemplate='Frequency: %{y}<extra></extra>')
fig.show()

## Potential changes

After the data exploration on "People" dataframe, I believe we should:
1. Integrate spatial data based on "CITY" column to determine "STATE" column when the latter is empty. 
2. Apply the following mapping to 'SEX' values: "U" & NaN values -> "U" (unknown)
3. For all the columns that already provide an "unknown" possible value, fill NaN observations with this value.
4. Convert "VEHICLE_ID" & "AGE" to an integer type (a nullable one, e.g. pandas `Int64`, as long as these columns still contain missing values).
5. Add a new value 'N/A' ('NON APPLICABLE') for observations where the person_type is passenger and the columns that describe the driver ('DRIVER_VISION' & 'DRIVER_ACTION') have missing values.
6. Split the 'CRASH_DATE' into 'DAY', 'MONTH', 'YEAR', 'TIME' columns
7. Replace 'AGE' with either a sentinel value or 'NaN' for observations where "AGE" < 10 and "PERSON_TYPE" is "DRIVER".
