# An Exploratory Data Analysis of US Accidents Dataset using Python Visualization Libraries

### The idea is to use several popular libraries - Seaborn, Plotly, Folium and Lux - for data visualization

### In this example we shall do the following
* Use the US Accidents Dataset
* Build Folium maps for a subset of the data
* Look at City level - Use Denver as an example
* Use some of the **Folium** features such as **Choropleth**, **FeatureGroup** and **MarkerCluster**
* Look at State Level - Use New York as an example
* Build Seaborn Pairplots
* Build examples using Folium
* Refer to Lux as an example (Please note: this notebook may need to be downloaded to check the Lux example, as it may not appear inline in the Kaggle output)
* Build examples using Plotly


### Standard Kaggle environment settings

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line, in the order os.walk visits them.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Initial imports and data loading

In [None]:
#Lux may not be installed by default and hence you need to use PIP install
# Uncomment this in your local notebook
#!pip install lux
# Import libraries
import pandas as pd
#import lux # Uncomment this in your local notebook
import warnings
import folium
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Suppress every Python warning raised from this point on. No category or
# module filter is given, so this is not limited to import/install warnings.
warnings.filterwarnings("ignore")

### Load the dataset into a Pandas dataframe - This is 1GB+ in size and hence it may take a while
#### Check the dataframe

In [None]:
# Load the dataset - this is 1GB+ in size and hence it may take a while.
accidents_db_mas = pd.read_csv('../input/us-accidents/US_Accidents_Dec20_Updated.csv')

# A notebook cell only renders its final expression, so the 10-row preview is
# printed explicitly; as a bare expression its result would be discarded.
print(accidents_db_mas.head(10))

# Summary statistics (pandas defaults), rendered as the cell's output.
accidents_db_mas.describe()


## Given that we have millions of rows, let us choose only 100000 rows

In [None]:
# Work on the first 100000 rows only (positions 0..99999). Starting the slice
# at 1 would drop the first record and leave just 99999 rows.
accidents_db = accidents_db_mas.iloc[:100000]

### The following code works in a local Jupyter Notebook, but the Lux output may not be displayed in hosted notebook environments such as Kaggle or Google Colab

#### You can play around with Lux options by toggling

In [None]:
# Show the sampled frame as this cell's output.
# NOTE(review): with `import lux` enabled this is presumably where the Lux
# widget appears - confirm in a local notebook.
accidents_db

### Use Folium to build a map
#### Load the State details
#### Load the standard GEO JSON Data
#### Build a Choropleth using Folium Map

In [None]:

# The "seaborn" matplotlib style was renamed to "seaborn-v0_8" in newer
# matplotlib releases; use whichever name this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Load the shape of the zone (US states)
# Find the original file here: https://github.com/python-visualization/folium/tree/master/examples/data
state_geo = '../input/python-data-visualization-essentials-guide/us-states.json'
state_data = accidents_db

# The choropleth needs exactly one value per state. Passing the raw frame (one
# row per accident) leaves many rows per 'State' key, so each state would end
# up coloured by a single arbitrary accident. Aggregate to the mean severity.
state_severity = state_data.groupby('State', as_index=False)['Severity'].mean()

# Initialize the map:
US_state_map = folium.Map(location=[37, -102], zoom_start=5)

# Add the color for the choropleth. The Choropleth class is the supported
# replacement for the deprecated Map.choropleth() method.
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_severity,
    columns=['State', 'Severity'],  # [key column, value column]
    key_on='feature.id',            # GeoJSON property matched against 'State'
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Mean Accident Severity',
).add_to(US_state_map)

# Layer control lets the choropleth layer be toggled on the rendered map.
folium.LayerControl().add_to(US_state_map)
US_state_map

## Let us correlate and generate a heatmap
#### Use the pandas `DataFrame.corr()` method and plot the result with Seaborn's `heatmap()`

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20,20))

# Correlate only the numeric and boolean columns. Older pandas dropped text
# columns from corr() silently, but recent versions raise on them instead, so
# the selection is made explicit to behave the same everywhere.
numeric_data = state_data.select_dtypes(include=['number', 'bool'])
corr = numeric_data.corr()
print(corr)

# Draw the correlation matrix on the large axes created above.
sns.heatmap(corr, annot=False, linewidths=.5, cmap="YlGnBu", ax=ax)

#### Let us see a distplot for severity on state data

In [None]:
# Distribution plot of the 'Severity' column of the state data.
severity_values = state_data['Severity']
sns.distplot(severity_values)

#### Let us see a distplot for other key fields on state data

In [None]:
# Draw one distribution plot per listed column, in this order. The calls
# share the current axes, so the plots are overlaid on each other.
key_field_columns = (
    'Crossing',
    'Bump',
    'Junction',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
)
for key_field in key_field_columns:
    last_axes = sns.distplot(state_data[key_field])

# Keep the Axes of the final plot as the cell's last expression, as before.
last_axes

## Let us take a city and see how it shapes up. 

### Let us take the Mile High City of Denver

#### Let us take the first 2000 accident records for Colorado (the data is filtered by state; the map is centred on Denver)

In [None]:
# Colorado records are taken from the full dataset (accidents_db_mas), not
# from the smaller state_data sample.
is_colorado = accidents_db_mas['State'] == 'CO'
co_data = accidents_db_mas[is_colorado]

# Keep only the first `limit` Colorado accidents.
limit = 2000
co_accidents = co_data.head(limit)

#### Define the Latitude and Longitude for Denver

In [None]:
# Denver coordinates as (latitude, longitude); used to centre the maps below.
den_lat, den_long = 39.7392, -104.9903

### Display the map of Denver first

In [None]:
# Build a base map centred on Denver. Leaving the map as the final
# expression of the cell renders it.
denver_center = [den_lat, den_long]
den_map = folium.Map(location=denver_center, zoom_start=12)
den_map

### Display a FeatureGroup of the 2000 accidents, plotted at their latitude and longitude

In [None]:
# instantiate a feature group for the accidents in the dataframe
accidents = folium.map.FeatureGroup()

# Drop incomplete coordinates as whole rows. Calling dropna() on each column
# separately would shift one column against the other whenever only one of
# the two values is missing, pairing a latitude with the wrong longitude.
accident_coords = co_accidents[['Start_Lat', 'Start_Lng']].dropna()

# loop through the accidents and add one circle marker each to the feature group
for lat, lng in zip(accident_coords['Start_Lat'], accident_coords['Start_Lng']):
    accidents.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            color='Green',
            fill=True,
            fill_color='Red',
            fill_opacity=0.2
        )
    )

# add incidents to map; the returned map is the cell's output
den_map.add_child(accidents)

### Display a MarkerCluster of the 2000 accidents by latitude and longitude, with each marker's popup showing the Severity of the accident

In [None]:
from folium import plugins

# let's start again with a clean copy of the map of Denver
den_map = folium.Map(location=[den_lat, den_long], zoom_start=12)

# instantiate a marker cluster object for the accidents in the dataframe
accidents = plugins.MarkerCluster().add_to(den_map)

# Drop incomplete records as whole rows. Calling dropna() on each column
# separately would misalign the three columns (and so attach a severity or a
# longitude to the wrong accident) whenever only some values are missing.
marker_rows = co_accidents[['Start_Lat', 'Start_Lng', 'Severity']].dropna()

# loop through the rows and add each data point to the marker cluster
for lat, lng, label in zip(marker_rows['Start_Lat'],
                           marker_rows['Start_Lng'],
                           marker_rows['Severity']):
    folium.Marker(
        location=[lat, lng],
        icon=None,
        # Pass the severity as text so the popup does not rely on folium
        # converting a numeric value implicitly.
        popup=str(label),
    ).add_to(accidents)

# display map
den_map

### Let us check the stacking of various states based on the severity and the count of values

In [None]:
# Sort by state so both charts list the states alphabetically.
state_data_sorted = state_data.sort_values('State', ascending=True).reset_index()

# Number of accidents per state. value_counts() is called on the column
# itself; passing a Series as DataFrame.value_counts()'s `subset` (which
# expects column labels) only works by accident.
state_data_count = state_data['State'].value_counts()

#print(state_data_count.head(50))
f, ax = plt.subplots(figsize=(20,12))

# Bar chart of Severity per state, drawn on the axes created above.
g = sns.barplot(x='State', y='Severity', data=state_data_sorted, orient='v')
plt.xticks(rotation=90)
sns.despine(left=True)

# Count of accidents per state. factorplot was renamed to catplot in seaborn
# 0.9 and is no longer available in current releases; `x` is passed by keyword
# because newer seaborn versions reject positional data arguments.
g = sns.catplot(x="State", data=state_data_sorted, aspect=2, kind="count", color='steelblue')
plt.xticks(rotation=90)
sns.despine(left=True)


### Let us collect data for New York state but set the zoom on the New York City

In [None]:
# New York City coordinates: 40.73° N, 73.93° W (stored as a negative longitude).
NY_LAT, NY_LNG = 40.730610, -73.935242

### Extract the NYC Dataset
#### The data is limited to the first 50000 New York records. Change `limit` to select a different number of records, or use `ny_data` directly to work with all of them

In [None]:
# New York records are taken from the full dataset (accidents_db_mas), not
# from the smaller state_data sample.
is_new_york = accidents_db_mas['State'] == 'NY'
ny_data = accidents_db_mas[is_new_york]

# Keep the first `limit` New York accidents. To use every New York record
# instead, replace the last line with: ny_accidents = ny_data
limit = 50000
ny_accidents = ny_data.head(limit)

### Let us see the New York map first

In [None]:
# Build a base map centred on the New York City coordinates. Leaving the map
# as the final expression of the cell renders it.
new_york_center = [NY_LAT, NY_LNG]
ny_map = folium.Map(location=new_york_center, zoom_start=7)
ny_map

### Display a MarkerCluster of the New York accidents by latitude and longitude, with each marker's popup showing the Severity of the accident

In [None]:
from folium import plugins

# let's start again with a clean copy of the map of New York
ny_map = folium.Map(location=[NY_LAT, NY_LNG], zoom_start=6)

# instantiate a marker cluster object for the accidents in the dataframe
accidents = plugins.MarkerCluster().add_to(ny_map)

# Drop incomplete records as whole rows. Calling dropna() on each column
# separately would misalign the three columns (and so attach a severity or a
# longitude to the wrong accident) whenever only some values are missing.
marker_rows = ny_accidents[['Start_Lat', 'Start_Lng', 'Severity']].dropna()

# loop through the rows and add each data point to the marker cluster
for lat, lng, label in zip(marker_rows['Start_Lat'],
                           marker_rows['Start_Lng'],
                           marker_rows['Severity']):
    folium.Marker(
        location=[lat, lng],
        icon=None,
        # Pass the severity as text so the popup does not rely on folium
        # converting a numeric value implicitly.
        popup=str(label),
    ).add_to(accidents)

# display map
ny_map

### Let us plot a PairPlot of columns 2 to 10 (the first column is skipped), coloured by Severity, for New York

In [None]:
# Columns at positions 1-9 of the frame (the first column is skipped).
# 'Severity' has to be among them because it is used as the hue below.
cols_to_plot = ny_accidents.columns[1:10].tolist()

# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it - that would only add a stray empty figure.
sns.pairplot(ny_accidents[cols_to_plot], hue="Severity", palette='Accent')
plt.show()

### Let us repeat the PairPlot for the roughly 100000-row sample taken from the full dataset

In [None]:
# Columns at positions 1-9 of the sampled frame (the first column is skipped).
# 'Severity' has to be among them because it is used as the hue below.
cols_to_plot = accidents_db.columns[1:10].tolist()

# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it - that would only add a stray empty figure.
sns.pairplot(accidents_db[cols_to_plot], hue="Severity", palette='Accent')
plt.show()

### END OF EXECUTION