In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
# Load the raw crash records from the Excel workbook.
# The path is written with forward slashes: they are accepted as separators on
# Windows as well as on Linux/macOS, whereas the backslash form only resolves
# on Windows.
data_filepath = "../dataset/new dataset.xlsx"
df = pd.read_excel(data_filepath)
print(df.head())

   Year  Month  Day Weekend?    Hour Collision Type         Injury Type  \
0  2015      1    5  Weekday     0.0          2-Car   No injury/unknown   
1  2015      1    6  Weekday  1500.0          2-Car   No injury/unknown   
2  2015      1    6  Weekend  2300.0          2-Car  Non-incapacitating   
3  2015      1    7  Weekend   900.0          2-Car  Non-incapacitating   
4  2015      1    7  Weekend  1100.0          2-Car   No injury/unknown   

                          Primary Factor      Reported_Location   Latitude  \
0  OTHER (DRIVER) - EXPLAIN IN NARRATIVE             1ST & FESS  39.159207   
1                  FOLLOWING TOO CLOSELY          2ND & COLLEGE  39.161440   
2              DISREGARD SIGNAL/REG SIGN  BASSWOOD & BLOOMFIELD  39.149780   
3          FAILURE TO YIELD RIGHT OF WAY         GATES & JACOBS  39.165655   
4          FAILURE TO YIELD RIGHT OF WAY                  W 3RD  39.164848   

   Longitude  
0 -86.525874  
1 -86.534848  
2 -86.568890  
3 -86.575956  
4 -86

# Data Cleaning
**Feature Selection**
Provided columns are 'Year', 'Month', 'Day', 'Weekend?', 'Hour', 'Collision Type', 'Injury Type','Primary Factor', 'Reported_Location', 'Latitude', 'Longitude'.

Considering we only care about when/where accidents happen, we can remove all features that don't give us insight into these two factors. That means we can remove the 'Collision Type', 'Injury Type', and 'Primary Factor' features. We can also remove 'Year' since we want this model to generalize to any year. In future versions of this model, one could use the year, month, and day to determine the weather at the time of the crash and factor this feature into the model.

**Removing Rows with Empty Values**
We will also drop any rows with empty values in the selected features.

In [20]:
# Keep only the when/where features: discard the four unused columns, then
# discard every row that still contains a missing value.
df = (
    df
    .drop(['Collision Type', 'Injury Type', 'Primary Factor', 'Year'], axis='columns')
    .dropna()
)
print(df.head())

   Month  Day Weekend?    Hour      Reported_Location   Latitude  Longitude
0      1    5  Weekday     0.0             1ST & FESS  39.159207 -86.525874
1      1    6  Weekday  1500.0          2ND & COLLEGE  39.161440 -86.534848
2      1    6  Weekend  2300.0  BASSWOOD & BLOOMFIELD  39.149780 -86.568890
3      1    7  Weekend   900.0         GATES & JACOBS  39.165655 -86.575956
4      1    7  Weekend  1100.0                  W 3RD  39.164848 -86.579625


# Normalizing Data
**Normalizing Hours**

Disregarding the decimal point, Hour values are currently up to 4 digits long (e.g. 0, 900, 1500), with the two least significant digits being the minutes (always 00 in this dataset) and the remaining digits denoting the hour. We will drop the decimal part and the minute digits so that the only remaining number is the hour of the day (from 0 to 23).

In [21]:
# 'Hour' is stored as a float in HHMM form (e.g. 1500.0 means 15:00, 900.0 means 09:00).
# Casting to int drops the decimal part, and integer division by 100 strips the
# two minute digits, leaving just the hour bucket the crash falls into.
# This replaces a float -> int -> str -> zero-pad -> slice -> int round trip
# with a single vectorized arithmetic step.
df['Hour'] = df['Hour'].astype(int) // 100

**Encoding Weekend**

Encode 'Weekend' as 1 and 'Weekday' as 0 so we can use this feature in our neural network.

In [22]:
# Integer code for each day-type label found in the 'Weekend?' column.
weekend_mapping = {'Weekday': 0, 'Weekend': 1}
# Series.map performs the label lookup without relying on the implicit
# object -> int downcast that Series.replace performs (deprecated in recent
# pandas releases). A label missing from the mapping becomes NaN, so
# astype(int) raises on it instead of letting an unencoded string slip through
# to the cells that follow.
df['Weekend?'] = df['Weekend?'].map(weekend_mapping).astype(int)
print(df.head())

   Month  Day  Weekend?  Hour      Reported_Location   Latitude  Longitude
0      1    5         0     0             1ST & FESS  39.159207 -86.525874
1      1    6         0    15          2ND & COLLEGE  39.161440 -86.534848
2      1    6         1    23  BASSWOOD & BLOOMFIELD  39.149780 -86.568890
3      1    7         1     9         GATES & JACOBS  39.165655 -86.575956
4      1    7         1    11                  W 3RD  39.164848 -86.579625


In [25]:
# Calculate Probabilities
#
# Build one empirical probability table per time feature: the share of all
# crash rows that fall on each hour, day, month, and weekend flag.
total_crashes = df.shape[0]
total_prob = []  # NOTE(review): never used in this cell; kept in case another cell reads it

# Count each column once up front instead of re-scanning it on every loop pass.
hour_counts = df['Hour'].value_counts()
day_counts = df['Day'].value_counts()
month_counts = df['Month'].value_counts()
weekend_counts = df['Weekend?'].value_counts()

# probability for each hour
# .get(key, 0) gives a bucket with no recorded crashes a probability of 0
# instead of raising KeyError.
hour_prob = {hour: hour_counts.get(hour, 0) / total_crashes for hour in range(24)}

# probability for each day code (1 through 7)
day_prob = {day: day_counts.get(day, 0) / total_crashes for day in range(1, 8)}

# probability for each month that actually occurs in the data
months = df['Month'].unique()
month_prob = {month: month_counts[month] / total_crashes for month in months}

# probability for each value of the encoded weekend flag (0 and 1)
weekend_prob = {weekend: weekend_counts.get(weekend, 0) / total_crashes for weekend in range(2)}
    
# Calculate total probability for each crash
def CalculateTotalProb(row):
    """Score one crash by the negated sum of its per-feature log-probabilities.

    The row's hour, day, month, and weekend probabilities are looked up in the
    module-level tables ``hour_prob``, ``day_prob``, ``month_prob`` and
    ``weekend_prob``. Multiplying them directly yields extremely small
    numbers, so their logarithms are summed instead; because the scores are
    only compared with one another, this should not affect their usefulness.

    row: record indexable by 'Hour', 'Day', 'Month' and 'Weekend?'.
    Returns the sum of the natural logs of the four probabilities, times -1.
    Raises KeyError when a value has no entry in its table.
    """
    lookups = (
        (hour_prob, 'Hour'),
        (day_prob, 'Day'),
        (month_prob, 'Month'),
        (weekend_prob, 'Weekend?'),
    )
    factor_probs = [table[row[column]] for table, column in lookups]
    return -np.log(factor_probs).sum()

# Score every crash row by row and store the result in a new 'total_prob' column.
df['total_prob'] = df.apply(CalculateTotalProb, axis='columns')

print(df)

       Month  Day  Weekend?  Hour      Reported_Location   Latitude  \
0          1    5         0     0             1ST & FESS  39.159207   
1          1    6         0    15          2ND & COLLEGE  39.161440   
2          1    6         1    23  BASSWOOD & BLOOMFIELD  39.149780   
3          1    7         1     9         GATES & JACOBS  39.165655   
4          1    7         1    11                  W 3RD  39.164848   
...      ...  ...       ...   ...                    ...        ...   
53938     10    6         0    17  DUNN & WHITE LOT WEST   0.000000   
53939     11    3         0     8        RED OAK & SR446   0.000000   
53940     12    5         0    12        2ND ST & WALNUT   0.000000   
53941     12    1         1     7         NINETH & NORTH   0.000000   
53942     12    7         1    17      MONROW & THIRD ST   0.000000   

       Longitude  total_prob  
0     -86.525874    8.503427  
1     -86.534848    6.979846  
2     -86.568890    9.381804  
3     -86.575956    8.9

# Save Modified Data

We will save the modified data to a CSV file for use by our model.

In [26]:
# Write the cleaned frame to CSV without the index column.
# The path uses forward slashes so it resolves on every platform; with
# backslashes, non-Windows systems treat the whole string as a single file name.
# NOTE(review): assumes the modified_data directory already exists — confirm.
df.to_csv('./modified_data/cleaned_data.csv', index=False)