In [None]:
#Imports
import pandas as pd
import numpy as np
import plotly.express as px

## PEOPLE Data Exploration

In [None]:
# Read the People records from the shared data folder into a DataFrame
people_csv_path = '../data/People.csv'
people_df = pd.read_csv(people_csv_path)

In [None]:
# Read the US cities lookup table that sits next to the People data
us_cities_csv_path = '../data/us_cities_info.csv'
us_cities_info = pd.read_csv(us_cities_csv_path)

In [None]:
# Structural overview of the People frame: the DataFrame.info() summary
# first, then the column labels and the (rows, columns) shape.
print('Info:')
people_df.info()

# Each remaining section is a heading line followed by the value itself
for heading, value in (('\nColumns:', people_df.columns),
                       ('\nShape:', people_df.shape)):
    print(heading)
    print(value)


In [None]:
# List the distinct values of every column whose dtype is not numeric
non_numeric_df = people_df.select_dtypes(exclude=[np.number])
for column in non_numeric_df.columns:
    distinct_values = non_numeric_df[column].unique()
    print(f"Unique values in {column}:")
    print(distinct_values)
    print("\n")

In [None]:
# Count the rows that exactly repeat an earlier row (all columns equal)
duplicate_mask = people_df.duplicated()
duplicate_mask.sum()

In [None]:
# Count the rows whose 'CITY' entry parses as a number.
# Values that cannot be parsed are coerced to NaN, so the non-null entries
# of the helper column are exactly the numeric-looking city values.
city_as_number = pd.to_numeric(people_df['CITY'], errors='coerce')
people_df['CITY_numeric'] = city_as_number
numeric_count = people_df['CITY_numeric'].count()
print(numeric_count)

## Plots

In [None]:
# Categorical columns whose value distributions we want to inspect
columns_to_plot = [
    'PERSON_TYPE',
    'STATE',
    'SEX',
    'SAFETY_EQUIPMENT',
    'AIRBAG_DEPLOYED',
    'EJECTION',
    'DRIVER_ACTION',
    'DRIVER_VISION',
    'PHYSICAL_CONDITION',
    'BAC_RESULT',
    'DAMAGE_CATEGORY',
]

# Plot one bar chart per column.
# px.bar does not aggregate: given only a categorical `x`, it draws one
# stacked unit-height segment per row, so the hover text read "Frequency: 1"
# and the figure carried one mark per observation. Count the categories
# first and plot the totals so each bar (and its hover) is the real frequency.
for column in columns_to_plot:
    category_counts = (
        people_df[column]
        .value_counts()                  # NaN excluded; most frequent first
        .rename_axis(column)             # same column name on every pandas version
        .reset_index(name='Frequency')
    )
    fig = px.bar(category_counts,
                 x=column,
                 y='Frequency',
                 title=f'Distribution of {column}')
    # Customize hover information
    fig.update_traces(hovertemplate='Frequency: %{y}<extra></extra>')
    fig.update_xaxes(tickangle=45)
    fig.show()

In [None]:
# Histogram of the 'AGE' column, requesting 20 bins
fig = px.histogram(people_df, x='AGE',
                   title='Age Distribution',
                   labels={'AGE': 'Age'},
                   nbins=20)
# `labels` only renames data columns; the aggregated axis of a histogram is
# titled with the literal text "count", so a 'count' key in `labels` has no
# effect. Set the y-axis title explicitly instead.
fig.update_yaxes(title_text='Frequency')
# Customize hover information
fig.update_traces(hovertemplate='Frequency: %{y}<extra></extra>')
fig.show()

## Changes

After exploring the "People" dataframe, I believe we should:
1. Use "CITY" column to determine "STATE" column when the latter is empty. 
2. Apply the following mapping to the 'SEX' values: "U" and NaN values -> "U" (unknown).
3. For every column that already provides an "unknown" value, fill NaN observations with that value.
4. Convert "VEHICLE_ID" and "AGE" to integers.
5. Add a new value 'N/A' ('NON APPLICABLE') for observations where the person type is passenger and the driver-related columns ('DRIVER_VISION' & 'DRIVER_ACTION') have missing values.
6. Split the 'CRASH_DATE' into 'DAY', 'MONTH', 'YEAR', 'TIME' columns
7. Set 'AGE' to NaN for observations where "AGE" < 10 and "PERSON_TYPE" is "DRIVER".
8. Set the 'STATE' column to 'Unknown' when 'CITY' is 'UNKNOWN' or 'STATE' is 'XX'.
9. Set the 'CITY' column to 'Unknown' when 'CITY' has a numeric value, has a length < 2, or starts with 'UNK'.