In [3]:
import pandas
import matplotlib.pyplot as plt

In [4]:
# Location of the raw crash spreadsheet, relative to the notebook's working
# directory. Forward slashes are used because they resolve on Windows, macOS
# and Linux alike, whereas a backslash is a path separator only on Windows.
data_filepath = "../dataset/new dataset.xlsx"
df = pandas.read_excel(data_filepath)
# Preview the first rows to confirm the sheet loaded.
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\dataset\\new dataset.xlsx'

# Data Cleaning
**Feature Selection**
The provided columns are 'Year', 'Month', 'Day', 'Weekend?', 'Hour', 'Collision Type', 'Injury Type', 'Primary Factor', 'Reported_Location', 'Latitude', 'Longitude'.

Considering we only care about when/where accidents happen, we can remove all features that don't give us insight into these two factors. That means we can remove the 'Collision Type', 'Injury Type', and 'Primary Factor' features. We can also remove 'Year' since we want this model to generalize to any year. In future versions of this model, one could use the year, month, and date to determine the weather at the time of the crash and factor this feature into the model.

**Removing Rows with Empty Values**
We will also drop any rows with empty values in the selected features.

In [None]:
# Discard the features that say nothing about when/where a crash happened,
# then remove every row that still has a missing value in a remaining column.
df = (
    df
    .drop(columns=['Collision Type', 'Injury Type', 'Primary Factor', 'Year'])
    .dropna()
)

# Encode the weekend flag numerically (Weekday -> 0, Weekend -> 1); values
# that are not keys of the mapping are left as they are by replace().
weekend_mapping = dict(Weekday=0, Weekend=1)
df['Weekend?'] = df['Weekend?'].replace(weekend_mapping)
print(df.head())

: 

# Normalizing Data
**Normalizing Hours**

Disregarding the decimal point, Hour values are currently 3-4 characters long, with the two least significant digits being the minutes (always 00 in this dataset) and the remaining leading digits denoting the hour. We will drop the decimal point and the minute digits so that the only remaining number is the hour of the day (from 0 to 23).

In [None]:
# Hour is stored as an HHMM-style number (e.g. 1300.0 for 1 pm). Dropping the
# fractional part and integer-dividing by 100 discards the two minute digits,
# leaving only the hour of day (0-23) as an integer, i.e. which of the 24
# buckets the crash falls into.
# NOTE: this cell is not idempotent - running it a second time on the already
# converted column would collapse every hour to 0.
df['Hour'] = df['Hour'].astype(int) // 100

: 

In [None]:
# Persist the cleaned frame as CSV without the row index. Forward slashes keep
# the relative path valid on every OS; the modified_data directory must already
# exist, because to_csv does not create missing parent directories.
df.to_csv('./modified_data/cleaned_data.csv', index=False)

: 

In [None]:
# Calculate Probabilities

def _category_probabilities(column, categories=None):
    """Return the share of rows of ``column`` that fall in each category.

    Parameters
    ----------
    column : pandas.Series
        Column of the crash table to summarise.
    categories : iterable, optional
        Values to report, in the order they should appear in the result.
        Defaults to the distinct values of ``column`` in order of first
        appearance.

    Returns
    -------
    dict
        Maps each category to (rows equal to it) / (total rows in ``column``).
        A category that never occurs in the data gets a zero count instead of
        raising ``KeyError``.
    """
    if categories is None:
        categories = column.unique()
    # Count once, then align to the requested categories; reindex fills the
    # categories that are absent from the data with a zero count.
    counts = column.value_counts().reindex(list(categories), fill_value=0)
    return (counts / len(column)).to_dict()


total_crashes = df.shape[0]
total_prob = []

# probability for each hour of the day (0-23)
hour_prob = _category_probabilities(df['Hour'], range(24))

# probability for each value of the Day column (1-7)
day_prob = _category_probabilities(df['Day'], range(1, 8))

# probability for each month that appears in the data
months = df['Month'].unique()
month_prob = _category_probabilities(df['Month'], months)

# probability for each value of the Weekend? flag (0 and 1)
weekend_prob = _category_probabilities(df['Weekend?'], range(2))

# TODO: Calculate total probability for each crash