In [1]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
mpl.style.use(['ggplot'])

# Data Understanding

## Label Selection

In [None]:
# Download the collisions dataset (roughly 70 MB) and save it as DataCollision.csv.
# NOTE: -O writes to that fixed filename, so every run of this cell fetches the
# file again and overwrites any existing copy.
!wget -O DataCollision.csv https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv

--2020-09-10 18:22:40--  https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv
Resolving s3.us.cloud-object-storage.appdomain.cloud (s3.us.cloud-object-storage.appdomain.cloud)... 67.228.254.196
Connecting to s3.us.cloud-object-storage.appdomain.cloud (s3.us.cloud-object-storage.appdomain.cloud)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73917638 (70M) [text/csv]
Saving to: ‘DataCollision.csv’

DataCollision.csv    21%[===>                ]  14.96M   504KB/s    eta 2m 22s 

In [None]:
# Load the downloaded CSV into the working DataFrame used by the rest of the notebook
df = pd.read_csv('DataCollision.csv')

# Report the (rows, columns) dimensions, then preview the first two records
frame_dimensions = df.shape
print(frame_dimensions)
df.head(2)

In [None]:
# Cross-tabulate each severity description against its numeric severity code
severity_codes_by_desc = df.groupby(['SEVERITYDESC'])['SEVERITYCODE']
severity_codes_by_desc.value_counts()

In [None]:
# Bar chart of how many collisions fall under each severity code
severity_counts = df.SEVERITYCODE.value_counts()
ax = severity_counts.plot.bar(color='r')
ax.set_title('Collisions Severity')
ax.set_xlabel('Severity Code')
ax.set_ylabel('Number of Collisions')

## Feature Selection

In [None]:
# Remove identifier/key columns together with the severity description, the
# duplicated severity code and the exception-reason columns, none of which are
# kept as features.
identifier_columns = [
    "SEVERITYDESC", "SEVERITYCODE.1", "EXCEPTRSNCODE", "EXCEPTRSNDESC",
    "INCKEY", "COLDETKEY", "INTKEY", "SEGLANEKEY", "CROSSWALKKEY",
    "REPORTNO", "OBJECTID",
]
df = df.drop(identifier_columns, axis=1)

df.head(2)

In [None]:
# Discard the remaining columns that this analysis treats as irrelevant features.
irrelevant_features = [
    "ADDRTYPE", "LOCATION", "PEDROWNOTGRNT", "HITPARKEDCAR",
    "SDOT_COLCODE", "SDOT_COLDESC", "SDOTCOLNUM", "ST_COLDESC",
    "PERSONCOUNT", "PEDCOUNT", "PEDCYLCOUNT", "COLLISIONTYPE",
    "VEHCOUNT", "STATUS", "INCDATE",
]
df = df.drop(irrelevant_features, axis=1)

df.head(2)

## Data Cleaning

In [None]:
# Count the missing entries in each coordinate column (X = longitude, Y = latitude)
missing_longitude = df["X"].isna().sum()
missing_latitude = df["Y"].isna().sum()
print(f"Longitude has {missing_longitude} missing values.")
print(f"Latitude has {missing_latitude} missing values.")

In [None]:
# Report the mean of each coordinate column (X = longitude, Y = latitude) to five
# decimal places. The period is part of the format string so that no stray space
# is printed before it.
print("Longitude has a mean of %.5f." % df["X"].mean())
print("Latitude has a mean of %.5f." % df["Y"].mean())

In [None]:
# Replace missing coordinates with the column means.
avg_X = df["X"].astype("float").mean()
avg_Y = df["Y"].astype("float").mean()

# Fill through the DataFrame and re-assign the result. The chained form
# `df["X"].fillna(..., inplace=True)` acts on a temporary Series and does not
# update `df` when pandas Copy-on-Write is active.
df = df.fillna({"X": avg_X, "Y": avg_Y})

# Sanity check: distribution of longitude values and the remaining number of gaps
print(df["X"].value_counts())
print(df["X"].isna().sum())

## Exploratory Data Analysis

### Relationship between location and collision severity

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

# define the world map centered around Seattle with a high zoom level
seattle_map = folium.Map(location=[47.6062, -122.3321], zoom_start=12)

df["XY"] = df["X"].astype(str) + ", " + df["Y"].astype(str)

#This counts the number of times a collision has occured in each location with coordinates XY and assigns
#the "hotspots" where there has been more than 100 collisions.
v = df.XY.value_counts()
collisions = df[df.XY.isin(v.index[v.gt(100)])]

# loop through collisions and add each to the map
numCollisions = len(collisions.index)
numCollisionsAddedToMap = 0
for lat, lng in zip(collisions["Y"], collisions["X"]):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5, # define how big you want the circle markers to be
        color='yellow',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(seattle_map)
    
    numCollisionsAddedToMap = numCollisionsAddedToMap + 1

    print("Loading: " + str(round(((numCollisionsAddedToMap / numCollisions) * 100), 2)) + "%", end="\r")


#obtaining the collisions hotspots comprising of class 2 collisions.    
class2Collisions = collisions[collisions.SEVERITYCODE == 2]

#loop through all class 2 collisions and add each to the map in red.
numCollisions = len(class2Collisions.index)
numCollisionsAddedToMap = 0
for lat, lng in zip(class2Collisions["Y"], class2Collisions["X"]):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5, # define how big you want the circle markers to be
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.6
    ).add_to(seattle_map)
    
    numCollisionsAddedToMap = numCollisionsAddedToMap + 1

    print("Loading: " + str(round(((numCollisionsAddedToMap / numCollisions) * 100), 2)) + "%", end="\r")

#Drop the XY column as this is no longer needed
df = df.drop(columns=["XY"])

In [None]:
# Display the folium map (seattle_map) as this cell's output
seattle_map

### Relationship between junction type and collision severity

In [None]:
!conda install seaborn --yes
import seaborn as sb

order = df["JUNCTIONTYPE"].value_counts().to_frame()
ax = sb.countplot(y="JUNCTIONTYPE", hue="SEVERITYCODE", order=order.index, data=df)
ax.set(xlabel='Number of Collisions', ylabel="Type of Junction")

print(df.groupby(['JUNCTIONTYPE'])['SEVERITYCODE'].value_counts())

### Relationship between the weather and collision severity

In [None]:
# Weather conditions ranked by how often they occur; used to sort the bars
order = df["WEATHER"].value_counts().to_frame()

# Horizontal count plot of collisions per weather condition, split by severity code
ax = sb.countplot(data=df, y="WEATHER", hue="SEVERITYCODE", order=order.index)
ax.set_xlabel('Number of Collisions')
ax.set_ylabel("Weather Condition")

# Same breakdown as a table: weather counts within each severity code
weather_by_severity = df.groupby(['SEVERITYCODE'])['WEATHER'].value_counts()
print(weather_by_severity)

In [None]:
# Clean the WEATHER categories without chained in-place calls (those act on a
# temporary Series under pandas Copy-on-Write) and without the np.NaN alias,
# which NumPy 2.0 removed:
#   * missing values are imputed as "Clear"
#   * "Other" and "Unknown" are merged into a single "Unknown Weather" category
#   * "Partly Cloudy" is folded into "Overcast"
weather = df["WEATHER"].fillna("Clear").replace({
    "Other": "Unknown Weather",
    "Unknown": "Unknown Weather",
    "Partly Cloudy": "Overcast",
})

# One-hot encode the cleaned categories (one column per category, named after the
# category) and append them in place of the original WEATHER column.
df = pd.concat([df.drop(columns=["WEATHER"]), pd.get_dummies(weather)], axis=1)
df.head()

### Relationship between the road conditions and the collision severity

In [None]:
# Road conditions ranked by how often they occur; used to sort the bars
order = df["ROADCOND"].value_counts().to_frame()

# Horizontal count plot of collisions per road condition, split by severity code
ax = sb.countplot(data=df, y="ROADCOND", hue="SEVERITYCODE", order=order.index)
ax.set_xlabel('Number of Collisions')
ax.set_ylabel("Road Condition")

# Same breakdown as a table: road-condition counts within each severity code
roadcond_by_severity = df.groupby(['SEVERITYCODE'])['ROADCOND'].value_counts()
print(roadcond_by_severity)

In [None]:
# Clean the ROADCOND categories without chained in-place calls (those act on a
# temporary Series under pandas Copy-on-Write) and without the np.NaN alias,
# which NumPy 2.0 removed:
#   * missing values are imputed as "Dry"
#   * "Other" and "Unknown" are merged into a single "Unknown Roadcond" category
road_condition = df["ROADCOND"].fillna("Dry").replace({
    "Other": "Unknown Roadcond",
    "Unknown": "Unknown Roadcond",
})

# One-hot encode the cleaned categories (one column per category, named after the
# category) and append them in place of the original ROADCOND column.
df = pd.concat([df.drop(columns=["ROADCOND"]), pd.get_dummies(road_condition)], axis=1)
df.head()