In [2]:
#Imports
import pandas as pd
import numpy as np 

### General

In [3]:
# Load the crashes table from its CSV file (path is relative to the notebook's
# working directory) into the DataFrame used by every cell below
crashes_df = pd.read_csv('../data/Crashes.csv')

In [None]:
# Dimensions of the crashes table as a (rows, columns) tuple
crashes_df.shape

In [None]:
# Names of all the columns in the crashes table
crashes_df.columns

In [None]:
# Map each column that has at least one missing value to its number of NaNs.
# The per-column NaN counts are computed once and then filtered in the same chain.
missing_values_dict = (
    crashes_df
    .isna()
    .sum()
    .loc[lambda nan_counts: nan_counts > 0]
    .to_dict()
)
missing_values_dict

In [None]:
# Count the rows that `duplicated()` flags as repeats of another row in the crashes table
crashes_df.duplicated().sum()

### Value counts on missing values columns

Check the values of each column using `value_counts()` and look for columns that use a placeholder string to signal that the data is not known (e.g. the `UNKNOWN` category of WEATHER_CONDITION just below).

In [4]:
# Number of rows per WEATHER_CONDITION category (computed with groupby/size,
# so the result is indexed by category name rather than ordered by frequency)
crashes_df.groupby('WEATHER_CONDITION').size()

WEATHER_CONDITION
CLEAR                     205435
CLOUDY/OVERCAST             7573
FOG/SMOKE/HAZE               549
OTHER                        775
RAIN                       23677
SEVERE CROSS WIND GATE        49
SLEET/HAIL                   342
SNOW                        8276
UNKNOWN                    11249
dtype: int64

In [None]:
# Print the frequency table of every column, each followed by a ruler line
for column_name, column_values in crashes_df.items():
    print(column_name)
    print(column_values.value_counts())
    print("_______________________________________________________")

The columns that use a replacement string (e.g. `UNKNOWN`) in place of null values are:
- TRAFFIC_CONTROL_DEVICE
- DEVICE_CONDITION
- WEATHER_CONDITION
- LIGHTING_CONDITION
- TRAFFICWAY_TYPE
- ROADWAY_SURFACE_COND
- ROAD_DEFECT

In [None]:
# Columns where a placeholder string (rather than NaN) marks an unknown value
columns_with_strings_to_signal_missing_values = [
    'TRAFFIC_CONTROL_DEVICE',
    'DEVICE_CONDITION',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'TRAFFICWAY_TYPE',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
]

# Show the frequency table of each of those columns so the placeholder
# categories (and how common they are) can be read off directly
for placeholder_col in columns_with_strings_to_signal_missing_values:
    category_counts = crashes_df[placeholder_col].value_counts()
    print(f"Value counts for {placeholder_col}:")
    print(category_counts)
    print("_______________________________________________________")


### Columns to plot

- CRASH_HOUR
- CRASH_DAY_OF_WEEK
- CRASH_MONTH
- POSTED_SPEED_LIMIT
- TRAFFIC_CONTROL_DEVICE
- WEATHER_CONDITION 
- LIGHTING_CONDITION
- FIRST_CRASH_TYPE
- TRAFFICWAY_TYPE
- ALIGNMENT
- ROADWAY_SURFACE_COND
- ROAD_DEFECT
- MOST_SEVERE_INJURY
- PRIM_CONTRIBUTORY_CAUSE
- SEC_CONTRIBUTORY_CAUSE

In [None]:
import matplotlib.pyplot as plt

# Columns whose value distributions are inspected visually
columns_to_plot = [
    'CRASH_HOUR',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
    #'CRASH_DATE', #too many values and is already integrated in the other 3 columns above
    'POSTED_SPEED_LIMIT',
    'TRAFFIC_CONTROL_DEVICE',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'FIRST_CRASH_TYPE',
    'TRAFFICWAY_TYPE',
    'ALIGNMENT',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
    'MOST_SEVERE_INJURY',
    'PRIM_CONTRIBUTORY_CAUSE',
    'SEC_CONTRIBUTORY_CAUSE',
]

# One bar chart per column; bars are ordered by frequency (ascending),
# not by the natural order of the category values
for column in columns_to_plot:
    frequencies = crashes_df[column].value_counts().sort_values()
    fig, ax = plt.subplots(figsize=(10, 6))
    frequencies.plot(kind='bar', ax=ax, rot=45)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

The conclusions drawn from the plots are:
- most crashes happen between 7 AM and 7 PM, with a peak in the afternoon (3 PM to 5 PM); 7 AM and 7 PM are roughly the hours when people travel to and from work, so heavier traffic (and more crashes) in that window makes sense.

- the days of the week do not impact much on the number of crashes

- there are more crashes during autumn/winter

- the number of crashes peaks very noticeably at specific speed limits, most of all at the 30 limit; almost all of the relevant data lies between 0 and 45

- other than when there are no controls the traffic signal and a very similar alternative such as the stop sign/flasher are the traffic control devices where most of the crashes happen

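- the weather conditions seem to have an almost counter-intuitive impact on the number of crashes: clear weather has the most crashes. One possible explanation is that people drive more attentively when the weather is not optimal; note, however, that these are raw counts, and clear weather is also by far the most common condition, so the counts are not normalised by how often each condition occurs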

- the same thing that is valid for the weather condition is also valid for the lighting conditions, there are more crashes in the daylight than in the darkness/lighted road and all the other conditions combined

- there is a peak of crashes when the road trafficway type is not divided

- most of the accidents happen when the alignment of the road is straight and level (basically only that), but it is important to remember that the roads of Chicago don't change much in altitude (https://chatgpt.com/share/672beffe-678c-8003-9e93-e42a98c877dd (sources included)); maybe this column should be removed

- as previously noted, drivers seem to make the most mistakes when the conditions are best: most crashes happen when the roadway surface is dry, rather than wet, snowy, icy, sandy or other, possibly because of a lack of attention to the road. Can the features that have this similar distribution be aggregated?

- same as conditions of the surface also road defect get the same results.

### features that can be aggregated

- INJURIES_FATAL
- INJURIES_INCAPACITATING
- INJURIES_NON_INCAPACITATING
- INJURIES_REPORTED_NOT_EVIDENT
- INJURIES_NO_INDICATION
- INJURIES_UNKNOWN
After aggregating those, INJURIES_TOTAL would be removed.

The values in all those features are distributed in irregular ways, so I would propose to aggregate them: collapse each injury type into a single value and place those values on a scale, so that the order of severity is preserved.

so maybe we would have
- INJURIES_UNKNOWN 0
- INJURIES_NO_INDICATION 1
- INJURIES_REPORTED_NOT_EVIDENT 2
- INJURIES_NON_INCAPACITATING 3
- INJURIES_INCAPACITATING 4
- INJURIES_FATAL 5

-------------------------

'CRASH_DATE' is an aggregation of: 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH'?

-------------------------
search for other

### filling the values

Need to find the street direction, since the similarly named column in the vehicles dataset didn't yield any good results (REMOVING FROM THE DATASET INSTEAD)

Might be able to get LATITUDE, LONGITUDE and LOCATION using the information in STREET_NAME (and vice versa for the single missing street name value) (REMOVING BECAUSE BEAT_OF_OCCURRENCE INTEGRATES THEM ALL)

Remember to check whether the street names contain a placeholder for null values (just check the frequency of the street names: they should all be different(?), so if one keeps repeating it is probably a mock value)

In [None]:
# Frequency of every street name; an implausibly frequent name would hint at a
# placeholder value standing in for a missing street
print(crashes_df["STREET_NAME"].value_counts())

In [None]:
# Frequency of each STREET_NO among the crashes recorded on 'WESTERN AVE'
is_western_ave = crashes_df['STREET_NAME'] == 'WESTERN AVE'
western_ave_street_no_counts = crashes_df.loc[is_western_ave, 'STREET_NO'].value_counts()

# Show the counts
print(western_ave_street_no_counts)

everything seems regular

### correlation matrix

In [None]:
import seaborn as sns

# Correlation is only defined for numeric data, so restrict the frame to its
# numeric columns first. Calling DataFrame.corr() directly on a frame that also
# holds string columns raises on pandas >= 2.0, where `numeric_only` defaults
# to False instead of silently dropping the non-numeric columns.
numeric_crashes_df = crashes_df.select_dtypes(include='number')
correlation_matrix = numeric_crashes_df.corr()

# Set up the matplotlib figure (`plt` is imported by the plotting cell earlier
# in the notebook)
plt.figure(figsize=(12, 10))

# Draw the heatmap, annotating each cell with the coefficient to 2 decimals
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})

# Set the title
plt.title('Correlation Matrix')
plt.show()

so we find that:
- most of the injuries are non incapacitating
- there is a strong correlation between BEAT_OF_OCCURRENCE and LATITUDE and LONGITUDE
- for the rest of the features there is not much correlation; even so, if the datasets were to be unified, it would be interesting to check the correlation matrix of the whole dataset together

BEAT_OF_OCCURRENCE is described as (in https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if/about_data):
	
Chicago Police Department Beat ID. Boundaries available at https://data.cityofchicago.org/d/aerh-rz74 

as described by gpt:

In Chicago, "police beats" are specific geographic areas within each police district. Each beat is assigned a dedicated police team responsible for routine patrols and responding to incidents. This organization allows officers to become familiar with the communities within their beat, enhancing local policing efforts and accountability. The City of Chicago’s data portal provides boundaries and GIS data for these police beats.

does this mean that we can drop LONGITUDE, LATITUDE and LOCATION? (REMEMBER THAT THERE ARE 4 MISSING VALUES FOR BEAT_OF_OCCURRENCE)

### data type check

In [None]:
# For every column, report its dtype and the distinct values it contains
for column_name, column_values in crashes_df.items():
    print(f"Column: {column_name}")
    print(f"Data Type: {column_values.dtype}")
    print(f"Unique Values: {column_values.unique()}")
    print("_______________________________________________________")


need to check if there are digits in the RD_NO column

In [None]:
# Display the RD_NO column to eyeball the format of its values
crashes_df["RD_NO"]

the values seem to have a pattern, let's check if there is any value that doesn't match this pattern

In [None]:
# RD_NO values that are NOT exactly two upper-case letters followed by six digits
rd_no_has_expected_format = crashes_df['RD_NO'].str.match(r'^[A-Z]{2}\d{6}$')
crashes_df.loc[~rd_no_has_expected_format, "RD_NO"]

The only 2 RD_NO values that do not exactly match the format are the 2 that contain lower-case letters.

_____________________

need to check any weird value (does not match the formatting of the others) in LOCATION

In [None]:
# LOCATION value of the row labelled 0, as a sample of the column's format
crashes_df.loc[0, "LOCATION"]

since all the values, at a first look, in the location column (except for the nan ones) are a string with this format
- 'POINT (-87.716439109795 41.894718028422)'

let's check if there are any different values

In [None]:
# Among the rows that have a LOCATION, list the values that are not of the
# form 'POINT (<decimal> <decimal>)'
not_nan_locations = crashes_df.dropna(subset=['LOCATION'])
location_has_expected_format = not_nan_locations['LOCATION'].str.match(
    r"^POINT \(-?\d+\.\d+ -?\d+\.\d+\)$"
)
not_nan_locations.loc[~location_has_expected_format, "LOCATION"]

There is no value that doesn't match the expected LOCATION format.

_______________



Need to check whether there are any non-date values in
- CRASH_DATE,
- DATE_POLICE_NOTIFIED

In [None]:
# Both date columns are expected to look like 'MM/DD/YYYY H:MM:SS AM|PM'
# (one- or two-digit hour), so the same regular expression is used for each.
crash_date_pattern = r"^\d{2}/\d{2}/\d{4} \d{1,2}:\d{2}:\d{2} [AP]M$"
date_police_notified_pattern = r"^\d{2}/\d{2}/\d{4} \d{1,2}:\d{2}:\d{2} [AP]M$"

# Per-row flags: does the value match the expected timestamp layout?
crash_date_check = crashes_df['CRASH_DATE'].str.match(crash_date_pattern)
date_police_notified_check = crashes_df['DATE_POLICE_NOTIFIED'].str.match(
    date_police_notified_pattern
)

# Report, for each column, whether every value matched
all_crash_dates_match = crash_date_check.all()
all_notified_dates_match = date_police_notified_check.all()
print("CRASH_DATE matches pattern:", all_crash_dates_match)
print("DATE_POLICE_NOTIFIED matches pattern:", all_notified_dates_match)


(All the other types listed in the output above were checked manually and everything matches for sure (they have few values), other than the report one above, which we are going to check now.)

Columns that are float64 but might as well be int64

In [45]:
# Names of all the float64 columns, as a plain Python list
float64_frame = crashes_df.select_dtypes(include=['float64'])
float_columns = float64_frame.columns.tolist()

Now we check which float columns contain only whole numbers and could therefore be converted to int (the cell below only reports the candidates; it does not perform the conversion).

In [None]:
# Report which float columns hold only whole numbers. Nothing is converted
# here: the loop only prints what could (or could not) be converted.
for float_col in float_columns:
    float_values = crashes_df[float_col]

    # NaN and +/-inf cannot be represented as int, so such columns are reported and skipped
    if float_values.isnull().any() or np.isinf(float_values).any():
        print(f"Column {float_col} contains NaN or infinite values, should skip conversion.")
        continue

    # A column is integer-valued when casting to int changes none of its values
    if float_values.eq(float_values.astype('int')).all():
        print(f"Column {float_col} can be converted to int64.")
        

### weird values

In [None]:
# List the column names again as a reference for the checks in this section
crashes_df.columns

In [None]:
# Select numeric columns
numeric_columns = crashes_df.select_dtypes(include=[np.number]).columns

# Iterate over each numeric column and sort values from max to min
for col in numeric_columns:
    print(f"Sorted values for {col} (max to min):")
    sorted_values = crashes_df[col].sort_values(ascending=False)
    print(sorted_values)
    print("_______________________________________________________")

no weird values in numeric columns

In [None]:
# Every column that is not in `numeric_columns`, in the table's original column order
all_columns = crashes_df.columns
non_numeric_columns = all_columns[~all_columns.isin(numeric_columns)].tolist()
non_numeric_columns

In [None]:
# Frequency table of every non-numeric column, to spot odd categories by eye
for text_col in non_numeric_columns:
    text_col_counts = crashes_df[text_col].value_counts()
    print(f"Unique values in {text_col}:\n")
    print(text_col_counts)
    print("_______________________________________________________")

no weird values in the other columns, still need to check the date ones

In [None]:
# The two columns that hold timestamps stored as strings
# NOTE(review): this list is not referenced by the cells visible here — confirm it is still needed
dates_columns = ["CRASH_DATE", "DATE_POLICE_NOTIFIED"]

In [None]:
# Parse the 'DATE_POLICE_NOTIFIED' strings (12-hour clock with AM/PM) into a
# new datetime column; a value that does not fit the format makes to_datetime raise
crashes_df['DATE_POLICE_NOTIFIED_datetime'] = pd.to_datetime(crashes_df['DATE_POLICE_NOTIFIED'], format='%m/%d/%Y %I:%M:%S %p')

# Sort the dates from max to min
df_sorted2 = crashes_df.sort_values(by='DATE_POLICE_NOTIFIED_datetime', ascending=False)

# Display the column that was just parsed and sorted. (The previous version
# displayed 'CRASH_DATE_datetime', which does not exist yet at this point in a
# top-to-bottom run and is not the column being checked in this cell.)
df_sorted2["DATE_POLICE_NOTIFIED_datetime"]

In [None]:
# Parse the 'CRASH_DATE' strings (12-hour clock with AM/PM) into a new datetime column
parsed_crash_dates = pd.to_datetime(crashes_df['CRASH_DATE'], format='%m/%d/%Y %I:%M:%S %p')
crashes_df['CRASH_DATE_datetime'] = parsed_crash_dates

# Order the table from the most recent crash to the oldest one
df_sorted = crashes_df.sort_values('CRASH_DATE_datetime', ascending=False)

# Show the sorted dates so the extremes can be inspected
df_sorted.loc[:, "CRASH_DATE_datetime"]

No weird dates found