# Exploratory Data Analysis

<div class="alert alert-info" style="color:black">

## National Collision Database

<p>This Jupyter Notebook contains an exploratory data analysis of the National Collision Database (NCDB), with charts built using the Python library Altair:</p>

<ul>
  <li><a href="https://github.com/UBC-MDS/Collision_Prediction">
      Link to the Project Repo on Github.com.</a></li>
  <li>Visualization 1 placeholder</li>
  <li>Visualization 2 placeholder</li>
  <li>Visualization 3 placeholder</li>
</ul>
</div>

In [8]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.model_selection import train_test_split

# Switch Altair to its 'data_server' data transformer.
# NOTE(review): presumably chosen so the large NCDB frame is not embedded in
# every chart spec -- confirm. Requires the `altair_data_server` package.
alt.data_transformers.enable('data_server')
# Select the 'mimetype' renderer; the trailing ';' suppresses the cell's
# return-value output.
alt.renderers.enable('mimetype');

In [None]:
# One-time setup: uncomment and run if enabling the 'data_server' transformer
# fails because the package is missing (use %pip so it targets this kernel).
# %pip install altair_data_server

## Read in the data set

In [5]:
# Location of the 2017 NCDB extract, relative to the notebook's working directory
ncdb_csv = 'data/NCDB_2017.csv'

# Read only the header row to discover the column names
col_names = pd.read_csv(ncdb_csv, nrows=0).columns

# Column -> dtype map: three columns are parsed as integers, every other
# column is kept as a string (the data contains non-numeric codes such as 'UU')
int_cols = ('C_YEAR', 'C_SEV', 'C_CASE')
types_dict = {
    **dict.fromkeys(int_cols, int),
    **{name: str for name in col_names if name not in int_cols},
}

# Load the full file with the explicit dtypes
ncdb = pd.read_csv(ncdb_csv, dtype=types_dict)

## Summary of the data set

In [4]:
# Peek at the first five records
ncdb.head(n=5)

Unnamed: 0,C_YEAR,C_MNTH,C_WDAY,C_HOUR,C_SEV,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,...,V_TYPE,V_YEAR,P_ID,P_SEX,P_AGE,P_PSN,P_ISEV,P_SAFE,P_USER,C_CASE
0,2017,1,1,10,2,1,2,03,1,3,...,1,UUUU,1,M,75,11,2,NN,1,2455950
1,2017,1,1,12,2,1,4,UU,1,5,...,1,UUUU,1,F,21,11,2,02,1,2455959
2,2017,1,1,0,2,1,3,UU,7,3,...,1,UUUU,1,F,34,11,2,02,1,2455998
3,2017,1,1,17,2,2,21,UU,1,1,...,1,UUUU,1,F,50,11,2,02,1,2456104
4,2017,1,1,17,2,2,21,UU,1,1,...,1,UUUU,1,M,63,11,1,NN,1,2456104


In [5]:
# Peek at the last five records
ncdb.tail(n=5)

Unnamed: 0,C_YEAR,C_MNTH,C_WDAY,C_HOUR,C_SEV,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,...,V_TYPE,V_YEAR,P_ID,P_SEX,P_AGE,P_PSN,P_ISEV,P_SAFE,P_USER,C_CASE
289836,2017,UU,U,UU,2,UU,UU,1,U,U,...,1,UUUU,1,F,20,11,1,NN,1,2455828
289837,2017,UU,U,UU,2,UU,UU,1,U,U,...,1,UUUU,1,F,47,11,1,NN,1,2455828
289838,2017,UU,U,UU,2,UU,UU,1,U,U,...,7,UUUU,1,M,24,11,1,NN,1,2455828
289839,2017,UU,U,23,2,01,03,1,1,1,...,16,UUUU,1,M,45,96,2,09,U,2570233
289840,2017,UU,U,23,2,01,03,1,1,1,...,16,UUUU,2,F,45,96,2,09,U,2570233


In [6]:
# Collapse the multi-class P_ISEV code into a binary label.
# The label is stored as the *strings* "True"/"False" (not booleans): rows with
# P_ISEV == '3' become "True"; every other value -- including any non-numeric
# placeholder codes -- becomes "False".
ncdb['target'] = np.where(ncdb['P_ISEV'] == '3', "True", "False")

# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
nc_train, nc_test = train_test_split(ncdb, test_size=0.1, random_state=21)
print("The shape of the national collision train data is:", nc_train.shape)
# Label corrected: this line reports the *test* split (it previously said "train")
print("The shape of the national collision test data is:", nc_test.shape)

The shape of the national collision train data is: (260856, 24)
The shape of the national collision train data is: (28985, 24)


In [7]:
# EDA on the training split only: monthly row counts where target == 'True'.
# NOTE(review): count() tallies rows of nc_train; the head() output shows
# several rows sharing one C_CASE, so this may be counting person records
# rather than distinct collisions -- confirm the y-axis title.
true_rows = nc_train[nc_train['target'] == 'True']

nc_true = (
    alt.Chart(
        data=true_rows,
        title={"text": ["Collision by Month"], "subtitle": ["Year 2017"]},
    )
    .mark_bar(opacity=0.5)
    .encode(
        x=alt.X('C_MNTH', title='Month', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('count()', title='Number of collisions'),
        # Bar colour is driven by the 'target' field using the 'oranges' scheme
        color=alt.Color('target', scale=alt.Scale(scheme='oranges')),
    )
)

In [8]:
# Monthly row counts for the target == 'False' rows of the training split.
# The bar colour is taken from the `color` encoding below ('magma' scheme).
# The former mark-level color="coral" was removed: an encoding channel takes
# precedence over a mark property, so it never affected the rendered chart.
nc_false = (
    alt.Chart(
        data=nc_train.query("target=='False'"),
        title={"text": ["Collision by Month"], "subtitle": ["Year 2017"]},
    )
    .mark_bar(opacity=0.5)
    .encode(
        x=alt.X('C_MNTH',
                title='Month',
                axis=alt.Axis(labelAngle=0)),
        # count() tallies rows of the filtered frame for each month value
        y=alt.Y('count()',
                title='Number of collisions'),
        color=alt.Color('target',
                        scale=alt.Scale(scheme='magma')),
    )
)

In [9]:
# Overlay the two monthly bar charts (nc_false underneath, nc_true on top).
# Each layer keeps its own y-axis and its own colour scale, so the two count
# ranges are NOT drawn on a shared axis.
(nc_false + nc_true).resolve_scale(y='independent', color='independent')