## This is the second notebook in a two-part project on air pollution. 
##### In this notebook, I re-clean the data (just as was done in the previous notebook, "Air Pollution Data Exploration.ipynb") and then use it to visualize the level of air pollution by year on a choropleth map* of the US.
###### *A choropleth map is a map in which each geographic area is filled with a color that encodes a data value for that area.

##### For a written walkthrough visit: https://sites.google.com/berkeley.edu/rudy-venguswamy/home/air-pollution-visualized

In [1]:
import pandas as pd               # Pandas lets us use dataframes
import numpy as np                # Numpy for math operations
import matplotlib.pyplot as plt   # Matplotlib for plotting
import os                         #Get the CSV from my computer
import datetime as dt
import statsmodels.api as sm


In [2]:
#This cell repeats the cleaning done in the previous notebook so the results here are reproducible even if that notebook has never been opened.
#The data folder defaults to the original author's location; set the AIR_POLLUTION_DATA_DIR environment variable to read the CSVs from somewhere else.
DATA_DIR = os.environ.get("AIR_POLLUTION_DATA_DIR", "C:/Users/Rudy/Desktop/Data/Air Pollution")
pollution_data = pd.read_csv(DATA_DIR + "/pollution_us_2000_2016.csv")

#Parse the date strings in one vectorized call rather than one Python-level strptime per row.
pollution_data['DateTime'] = pd.to_datetime(pollution_data['Date Local'], format = '%Y-%m-%d')

#Collapse to one row per (Address, DateTime) by averaging the numeric columns.
#numeric_only=True is explicit on purpose: recent pandas versions raise a TypeError on text columns instead of silently leaving them out.
pollution_data = pollution_data.groupby(['Address','DateTime'], as_index = False).mean(numeric_only = True)

#'Unnamed: 0' is not used anywhere downstream (presumably a row index saved into the CSV -- verify against the file).
pollution_data = pollution_data.drop('Unnamed: 0', axis = 1)

#Calendar parts used later for the per-year grouping.
pollution_data['Year'] = pollution_data['DateTime'].dt.year
pollution_data['Month'] = pollution_data['DateTime'].dt.month

In [3]:
#Cast the US EPA state codes to integers so they can later be used as lookup keys for the state table.
pollution_data['State Code'] = pollution_data['State Code'].astype(int)

#Combine the NO2, SO2, O3 and CO AQI values into a single measure, "Pollution Level," by summing them.
#Plain column arithmetic is vectorized and gives the same result as a row-by-row apply:
#a missing value in any of the four pollutants still produces NaN for that row.
pollution_data['Pollution Level'] = (
    pollution_data['NO2 AQI']
    + pollution_data['SO2 AQI']
    + pollution_data['O3 AQI']
    + pollution_data['CO AQI']
)

In [4]:
#Sanity check: the largest combined AQI value found in any row of the cleaned data.
#That's a really high pollution value!
pollution_data.loc[:, 'Pollution Level'].max()

415.0

In [5]:
#We replace the numeric US EPA state codes with state abbreviations so the data can be interpreted by Plot.ly's choropleth feature.
#The data folder defaults to the original author's location; set the AIR_POLLUTION_DATA_DIR environment variable to read the CSVs from somewhere else.
DATA_DIR = os.environ.get("AIR_POLLUTION_DATA_DIR", "C:/Users/Rudy/Desktop/Data/Air Pollution")

#Work on a copy: the mapping below overwrites 'State Code', and keeping pollution_data intact means this cell can be re-run without restarting the kernel.
pollution_test = pollution_data.copy()

state_codedf = pd.read_csv(DATA_DIR + "/states_and_counties.csv")

#Rows whose code is the literal 'CC' cannot be cast to int, so they are removed first.
#.copy() makes the filtered frame independent, so the column assignment below does not raise SettingWithCopyWarning.
state_codedf = state_codedf[~state_codedf['State Code'].isin(['CC'])].copy()
state_codedf['State Code'] = state_codedf['State Code'].astype(int)
state_codedf = state_codedf.set_index('State Code')

#Lookup table: integer state code -> state abbreviation.
statecodedict = state_codedf['State Abbreviation'].to_dict()

#Codes that are missing from the lookup table become NaN here.
pollution_test['State Code'] = pollution_test['State Code'].map(statecodedict)

In [6]:
#Now that our data is cleaned, we can average it by State & Year to build a choropleth.
#numeric_only=True keeps text columns such as 'Address' out of the average; without it recent pandas versions raise a TypeError.
pollution_data_by_state = pollution_test.groupby(['State Code', 'Year'], as_index = False).mean(numeric_only = True)

#Highest yearly state-level average of the combined pollution measure.
pollution_data_by_state['Pollution Level'].max()


127.18367346938776

###### Oof, 127! That's a high average pollution level across a year. Now that our data has been cleaned and grouped by state and year, we can create a DF for each year and use that to create the Plot.ly map.

In [8]:
Year = 2012 #Insert the Year you would like to create a map for

#Build a dataframe with the average pollution values of each state for the chosen year.
#The row filter is a vectorized comparison (no per-row apply), and selecting rows and columns
#in a single .loc call yields an independent frame that later cells can modify safely.
pollution_data_state_20xx = (
    pollution_data_by_state
    .loc[
        pollution_data_by_state['Year'] == Year,
        ['State Code', 'NO2 AQI', 'SO2 AQI', 'O3 AQI', 'CO AQI', 'Pollution Level'],
    ]
    .reset_index(drop = True)
)
pollution_data_state_20xx.head()

Unnamed: 0,State Code,NO2 AQI,SO2 AQI,O3 AQI,CO AQI,Pollution Level
0,AR,23.756831,2.784153,36.486339,6.45765,69.484973
1,AZ,34.266738,2.391073,40.375133,6.427205,83.460149
2,CA,19.147362,1.560585,35.513922,5.299695,61.516671
3,CO,36.41037,9.157037,34.948148,6.177778,86.693333
4,CT,15.243925,2.059813,38.119626,2.994393,58.417757


In [15]:
#Time to start making the choropleth
#To make the GIF on my website that shows a time trend, I just did this for every year from 2000-2016 and strung them together with a GIF maker online.
import plotly
import plotly.plotly as py
import pandas as pd

#Credentials are read from the environment so that no API key is stored in the notebook.
#Set PLOTLY_USERNAME and PLOTLY_API_KEY to your own Plot.ly credentials before running this cell.
#If they are not set, nothing is written and Plot.ly uses whatever credentials file was saved by an earlier set_credentials_file call.
#NOTE: a key was previously committed in this cell; it should be regenerated in the Plot.ly account settings.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    plotly.tools.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

In [16]:
#Convert every column to text so the values can be concatenated into the hover label.
#astype returns a new frame, so neither this conversion nor the 'text' column added below
#leaks back into pollution_data_state_20xx (a plain assignment would only alias it).
df = pollution_data_state_20xx.astype(str)

#Setting the color scale for pollution levels (light = low, dark = high)
scl = [
    [0.0, '#FFFFFF'],
    [0.2, '#F9DBBD'],
    [0.4, '#FCA17D'],
    [0.6, '#DA627D'],
    [0.8, '#9A348E'],
    [1.0, '#0D0628'],
]

#On hover, show info about the breakdown of pollution levels
df['text'] = (
    df['State Code'] + '<br>'
    + 'NO2 Level: ' + df['NO2 AQI'] + '<br>'
    + 'SO2 Level: ' + df['SO2 AQI'] + '<br>'
    + 'O3 Level: ' + df['O3 AQI'] + '<br>'
    + 'CO Level: ' + df['CO AQI']
)

#Creating the scale for pollution data
data = [
    dict(
        type = 'choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = df['State Code'],
        #The pollution level was turned into text above; the color axis needs numbers again.
        z = df['Pollution Level'].astype(float),
        #zmin/zmax are fixed rather than derived from the selected year
        #(presumably so the colors stay comparable between the per-year maps -- confirm).
        zmin = 0,
        zmax = 143,
        locationmode = 'USA-states',
        text = df['text'],
        marker = dict(
            line = dict(
                color = 'rgb(255,255,255)',
                width = 2
            )
        ),
        colorbar = dict(
            title = "Total Pollution Level (AQI Sum)<br><i>CO, O3, NO2, SO2"
        )
    )
]

#Constructing actual USA choropleth map
layout = dict(
    title = ' US Pollution Levels by State<br><i>Source: EPA.gov Outdoor Air Quality<br>' + str(Year),
    geo = dict(
        scope = 'usa',
        projection = dict( type = 'albers usa' ),
        showlakes = True,
        lakecolor = 'rgb(255, 255, 255)',
        showland = True,
        landcolor = 'rgb(224, 224, 224)'
    )
)

#Constructing Figure
fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )


Consider using IPython.display.IFrame instead



### Thank you for looking through this notebook!
