## Imports

In [None]:
import sys
!pip install zipcode
!pip install plotly

In [1]:
import os

import pandas as pd
import numpy as np
import folium
import geocoder
import zipcode
import plotly
import plotly.plotly as py
from plotly.graph_objs import *
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap
from us_state_abbrev import us_state_abbrev


## Data

In [2]:
# Load the pipe-delimited beer export and switch to short snake_case column names.
# rename(index=str) also turns the row labels into strings.
beer_column_names = {
    'BeerName': 'name',
    'BeerStyleName': 'style',
    'Entered': 'entry',
    'RateCount': 'rate_count',
    'BrewerCity': 'city',
    'Abbrev': 'state',
    'BrewerZIPCode': 'zipcode',
}
df_beers = pd.read_csv('ucsd-sansdescrip03162018.csv', sep="|")
df_beers = df_beers.rename(index=str, columns=beer_column_names)

# Trim surrounding whitespace from every text column.
for text_column in ('name', 'style', 'city', 'state', 'zipcode'):
    df_beers[text_column] = df_beers[text_column].str.strip()


In [3]:
# Preview the first rows to check the renamed columns.
df_beers.head()

Unnamed: 0,name,style,entry,rate_count,city,state,zipcode
0,Great Adirondack Haystack Blonde Ale,Golden Ale/Blond Ale,2003-11-20 08:38:18.000,18.0,Lake Placid,NY,12946
1,Great Divide Bee Sting Ale,Golden Ale/Blond Ale,2001-06-26 12:31:04.000,111.0,Denver,CO,80205
2,Newport Beach Blonde,Pilsener,2000-05-28 00:30:13.000,24.0,Newport Beach,CA,92663
3,Newport Beach Poohs Reserve Honey Ale,Golden Ale/Blond Ale,2000-05-28 00:31:35.000,1.0,Newport Beach,CA,92663
4,Newport Beach Brewhound Red Ale,American Pale Ale,2000-06-01 01:01:56.000,0.0,Newport Beach,CA,92663


### Adding latitude and longitude 

In [57]:
def get_latitude(z):
    """Return the latitude stored for ZIP code ``z``, or NaN when it cannot be resolved.

    Parameters
    ----------
    z : str
        ZIP code handed to ``zipcode.isequal``.

    Returns
    -------
    The ``lat`` attribute of the lookup result, or ``np.nan`` when the lookup
    raises or the result has no ``lat`` attribute.
    """
    try:
        return zipcode.isequal(z).lat
    except Exception:
        # Best-effort lookup: any ordinary failure (bad input, no match) maps to NaN.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt and
        # SystemExit still propagate and a long row-wise apply can be interrupted.
        return np.nan

def get_longitude(z):
    """Return the longitude stored for ZIP code ``z``, or NaN when it cannot be resolved.

    Parameters
    ----------
    z : str
        ZIP code handed to ``zipcode.isequal``.

    Returns
    -------
    The ``lon`` attribute of the lookup result, or ``np.nan`` when the lookup
    raises or the result has no ``lon`` attribute.
    """
    try:
        return zipcode.isequal(z).lon
    except Exception:
        # Best-effort lookup: any ordinary failure (bad input, no match) maps to NaN.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt and
        # SystemExit still propagate and a long row-wise apply can be interrupted.
        return np.nan

In [None]:
# Setting lat/lng columns to beers df.
# ZIP codes are cut to their first five characters first, the same cleaning the
# by-city table applies, so extended codes are looked up by their 5-digit prefix.
zip5 = df_beers['zipcode'].str[:5]

# Resolve each distinct ZIP once and map the result back onto the rows, instead
# of running two lookups for every beer. Rows without a ZIP end up as NaN.
distinct_zips = zip5.dropna().unique()
df_beers['lat'] = zip5.map({z: get_latitude(z) for z in distinct_zips})
df_beers['lng'] = zip5.map({z: get_longitude(z) for z in distinct_zips})

In [None]:
# Sanity check: the number of unresolved longitudes equals the number of unresolved latitudes.
missing_lng = sum(1 for value in df_beers['lng'] if np.isnan(value))
missing_lat = sum(1 for value in df_beers['lat'] if np.isnan(value))
assert missing_lng == missing_lat

In [None]:
# Removing beers whose ZIP code could not be resolved to coordinates.
# Both columns are checked so that no row with a missing longitude can slip
# through when only its latitude was found (or vice versa).
df_beers = df_beers.dropna(subset=['lat', 'lng'])

In [58]:
# After the clean-up neither coordinate column may contain NaN.
for coordinate in ('lat', 'lng'):
    assert sum(1 for value in df_beers[coordinate] if np.isnan(value)) == 0

### Beers by state

In [4]:
# Counting beers (one row each) per state; the grouping key is the stripped state code.
state_key = df_beers['state'].str.strip()
df_beers_by_state = (
    df_beers.groupby(state_key)
    .size()
    .reset_index(name='beers_count')
    .rename(index=str)
)

In [5]:
# Preview the per-state beer counts.
df_beers_by_state.head()

Unnamed: 0,state,beers_count
0,AK,1830
1,AL,1154
2,AR,490
3,AZ,4136
4,CA,36461


In [6]:
# Every beer with a known state must be counted exactly once. groupby leaves out
# rows whose key is missing, so the total is compared against the number of
# non-null states rather than against a hard-coded "all rows but one".
assert df_beers_by_state['beers_count'].sum() == df_beers['state'].notnull().sum()

### Beers by city

In [92]:
# Beers per (state, city). Keys are normalised before grouping: state is
# upper-cased, city is capitalised (first letter upper, the rest lower), and
# both are stripped of surrounding whitespace.
state_key = df_beers['state'].str.upper().str.strip()
city_key = df_beers['city'].str.capitalize().str.strip()
df_beers_by_city = (
    df_beers.groupby([state_key, city_key])
    .size()
    .reset_index(name='beers_count')
    .rename(index=str)
)

In [93]:
def get_zipcode(city, state):
    """Return the first known ZIP code of a beer brewed in ``city``, ``state``.

    The comparison ignores letter case and surrounding whitespace, so the
    normalised keys of the by-city table ("Newport beach", "CA") still match
    the raw names stored in ``df_beers`` ("Newport Beach", "CA").

    Parameters
    ----------
    city : str
        City name.
    state : str
        State abbreviation.

    Returns
    -------
    The first non-missing ``zipcode`` among the matching rows of ``df_beers``,
    or ``np.nan`` when either argument is missing or nothing matches.
    """
    if pd.isnull(city) or pd.isnull(state):
        return np.nan

    state_key = str(state).strip().upper()
    city_key = str(city).strip().capitalize()

    # Narrow down to the state first so the city names are only normalised for
    # that state's rows.
    in_state = df_beers.loc[
        df_beers['state'].str.strip().str.upper() == state_key,
        ['city', 'zipcode'],
    ]
    same_city = in_state['city'].str.strip().str.capitalize() == city_key
    candidates = in_state.loc[same_city, 'zipcode'].dropna()

    if candidates.empty:
        return np.nan
    # Positional access: the row labels of df_beers are not guaranteed to contain 0.
    return candidates.iloc[0]

In [94]:
# Adding a zipcode column: one representative ZIP code per (city, state) row.
df_beers_by_city['zipcode'] = [
    get_zipcode(city_name, state_code)
    for city_name, state_code in zip(df_beers_by_city['city'], df_beers_by_city['state'])
]

In [95]:
# Removing cities for which no ZIP code was found.
has_zipcode = df_beers_by_city['zipcode'].notnull()
df_beers_by_city = df_beers_by_city[has_zipcode]

In [96]:
# Cleaning ZIP codes so that they are at most 5 characters long. The vectorised
# string slice replaces a row-wise apply that built a full row object per city
# just to cut one string.
df_beers_by_city['zipcode'] = df_beers_by_city['zipcode'].str[:5]

In [97]:
# Setting lat/lng columns on df_beers_by_city from each city's cleaned ZIP code.
city_zipcodes = df_beers_by_city['zipcode']
df_beers_by_city['lat'] = city_zipcodes.map(get_latitude)
df_beers_by_city['lng'] = city_zipcodes.map(get_longitude)

In [98]:
# Removing cities whose ZIP code could not be resolved to coordinates.
# The check is on lat/lng, the columns just computed; missing ZIP codes were
# already removed before the lookup, so filtering on 'zipcode' here did nothing.
df_beers_by_city = df_beers_by_city.dropna(subset=['lat', 'lng'])

In [108]:
# Sorting cities from most to fewest beers.
df_beers_by_city = df_beers_by_city.sort_values('beers_count', ascending=False)

In [109]:
# Cities with the most beers (the frame is sorted by beers_count, descending).
df_beers_by_city.head()

Unnamed: 0,state,city,beers_count,zipcode,lat,lng,text
422,CO,Denver,5400,80205,39.76,-104.87,Denver<br>Beers: 5400
1962,OR,Portland,5148,97214,45.51,-122.64,Portland<br>Beers: 5148
2580,WA,Seattle,4244,98105,47.66,-122.29,Seattle<br>Beers: 4244
1152,MN,Minneapolis,3178,55454,44.96,-93.26,Minneapolis<br>Beers: 3178
671,FL,Tampa,2504,33635,27.95,-82.48,Tampa<br>Beers: 2504


### Population Estimates (State)

In [79]:
# Load the state population estimates and switch to snake_case column names.
# rename(index=str) also turns the row labels into strings.
population_column_names = {
    'State': 'state',
    'Population Estimate': 'population_estimate',
    'Year': 'year',
}
df_population_estimates = pd.read_csv('raw_data/us_states_population_estimates.csv')
df_population_estimates = df_population_estimates.rename(index=str, columns=population_column_names)

In [80]:
# Keep only the 2017 estimates, drop the now-constant year column,
# and leave out Puerto Rico.
estimates_2017 = df_population_estimates[df_population_estimates['year'] == 2017]
estimates_2017 = estimates_2017.drop('year', axis=1)
df_population_estimates = estimates_2017[estimates_2017['state'] != 'Puerto Rico']

In [81]:
# 51 rows are expected once Puerto Rico is removed
# (presumably the 50 states plus the District of Columbia; confirm against the CSV).
assert df_population_estimates.shape[0] == 51

In [82]:
# Converting each full state name to its abbreviation. A name that is not a key
# of us_state_abbrev raises KeyError, so this cell cannot be run twice in a row.
df_population_estimates['state'] = [
    us_state_abbrev[state_name] for state_name in df_population_estimates['state']
]

In [83]:
# Preview the 2017 population estimates keyed by state abbreviation.
df_population_estimates.head()

Unnamed: 0,state,population_estimate
364,AL,4874747
365,AK,739795
366,AZ,7016270
367,AR,3004279
368,CA,39536653


### Beers per capita by state

In [84]:
# Merging population estimates with the beer counts, matched on the state
# abbreviation; only states present in both tables are kept.
df_beers_per_capita = df_population_estimates.merge(df_beers_by_state, how='inner', on='state')

In [85]:
# Beers per 100,000 inhabitants, truncated to a whole number.
beers_per_person = df_beers_per_capita['beers_count'] / df_beers_per_capita['population_estimate']
df_beers_per_capita['beers_per_capita'] = (beers_per_person * 100000).astype(int)

In [86]:
# Dropping the raw inputs; the map only needs state and beers_per_capita.
df_beers_per_capita = df_beers_per_capita.drop(['population_estimate', 'beers_count'], axis=1)

In [87]:
# Preview beers per 100,000 inhabitants by state.
df_beers_per_capita.head()

Unnamed: 0,state,beers_per_capita
0,AL,23
1,AK,247
2,AZ,58
3,AR,16
4,CA,92


### Beers by cities

In [88]:
# Per-state subsets (California and Vermont) used by the city-level maps.
df_beers_CA = df_beers[df_beers['state'].eq('CA')]
df_beers_VT = df_beers[df_beers['state'].eq('VT')]

In [89]:
# Preview the California beers.
df_beers_CA.head()

Unnamed: 0,name,style,entry,rate_count,city,state,zipcode,lat,lng
2,Newport Beach Blonde,Pilsener,2000-05-28 00:30:13.000,24.0,Newport Beach,CA,92663,33.62,-117.93
3,Newport Beach Poohs Reserve Honey Ale,Golden Ale/Blond Ale,2000-05-28 00:31:35.000,1.0,Newport Beach,CA,92663,33.62,-117.93
4,Newport Beach Brewhound Red Ale,American Pale Ale,2000-06-01 01:01:56.000,0.0,Newport Beach,CA,92663,33.62,-117.93
5,Newport Beach Rye Grin Pale Ale,American Pale Ale,2000-05-28 00:34:21.000,0.0,Newport Beach,CA,92663,33.62,-117.93
8,Jupiter Pale Ale,American Pale Ale,2001-07-10 20:21:02.000,5.0,Berkeley,CA,94704,37.86,-122.25


In [90]:
# Preview the Vermont beers.
df_beers_VT.head()

Unnamed: 0,name,style,entry,rate_count,city,state,zipcode,lat,lng
48,Catamount Pale Ale,American Pale Ale,2001-02-13 21:34:07.000,31.0,White River Junction,VT,5001,43.65,-72.32
125,Catamount Wassail,Spice/Herb/Vegetable,2001-11-16 12:42:07.000,8.0,White River Junction,VT,5001,43.65,-72.32
224,Three Needs West Coast Pale Ale,American Pale Ale,2002-02-06 13:07:12.000,1.0,Burlington,VT,5401,44.48,-73.22
279,Vermont Pub Burly Irish Red,Irish Ale,2000-07-24 22:40:09.000,68.0,Burlington,VT,5401,44.48,-73.22
285,Vermont Pub Curacao Trippel,Abbey Tripel,2001-06-28 21:49:37.000,10.0,Burlington,VT,5401,44.48,-73.22


## Maps & Visualization

### Beers by state (Choropleth)

In [91]:
# Choropleth of the raw beer count per state, drawn over the US state outlines.
state_geo = r'data/us-states.json'

beers_state_map = folium.Map(zoom_start=3, location=[48, -102])
beers_state_map.choropleth(
    # Geometry and the feature property used to join it to the data.
    geo_data=state_geo,
    key_on='feature.id',
    # Data: state code and the value that drives the colour.
    data=df_beers_by_state,
    columns=['state', 'beers_count'],
    threshold_scale=[6000, 12000, 18000, 24000, 30000, 36000],
    # Styling.
    fill_color='OrRd',
    fill_opacity=1,
    line_opacity=0.1,
)

In [None]:
# Render the beers-by-state map inline.
beers_state_map

### Beers per capita by state (Choropleth)

In [None]:
# Choropleth of beers per 100,000 inhabitants, drawn over the US state outlines.
state_geo = r'data/us-states.json'

beers_per_capita_map = folium.Map(zoom_start=3, location=[48, -102])
beers_per_capita_map.choropleth(
    # Geometry and the feature property used to join it to the data.
    geo_data=state_geo,
    key_on='feature.id',
    # Data: state code and the value that drives the colour.
    data=df_beers_per_capita,
    columns=['state', 'beers_per_capita'],
    threshold_scale=[100, 200, 300, 400, 500, 600],
    # Styling.
    fill_color='OrRd',
    fill_opacity=1,
    line_opacity=0.1,
)

In [None]:
# Render the beers-per-capita map inline.
beers_per_capita_map

### Beers by city (HeatMap)

In [None]:
# Heat map of California beers: one [lat, lng] pair per beer.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0, so the two
# columns are selected explicitly and converted through .values instead.
data = df_beers_CA[['lat', 'lng']].values.tolist()
beers_CA_heatmap = folium.Map(location=[36, -120], tiles='stamentoner', zoom_start=5)
HeatMap(data, radius=14).add_to(beers_CA_heatmap)
beers_CA_heatmap

In [None]:
# Heat map of Vermont beers: one [lat, lng] pair per beer.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0, so the two
# columns are selected explicitly and converted through .values instead.
data = df_beers_VT[['lat', 'lng']].values.tolist()
beers_VT_heatmap = folium.Map(location=[44, -73], tiles='stamentoner', zoom_start=6)
HeatMap(data, radius=15).add_to(beers_VT_heatmap)
beers_VT_heatmap

In [None]:
# Combined heat map: the California points followed by the Vermont points.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0, so the two
# columns are selected explicitly and converted through .values instead.
data = df_beers_CA[['lat', 'lng']].values.tolist() + df_beers_VT[['lat', 'lng']].values.tolist()
beers_CA_VT_heatmap = folium.Map(location=[40, -102], tiles='stamentoner', zoom_start=4)
HeatMap(data, radius=15).add_to(beers_CA_VT_heatmap)
beers_CA_VT_heatmap

### Beers by city (MarkerCluster)

In [None]:
# Clustered markers for California beers: one [lat, lng] pair per beer.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0, so the two
# columns are selected explicitly and converted through .values instead.
data = df_beers_CA[['lat', 'lng']].values.tolist()
beers_CA_markermap = folium.Map(location=[37, -120], zoom_start=6)
FastMarkerCluster(data=data).add_to(beers_CA_markermap)
beers_CA_markermap

In [None]:
# Clustered markers for Vermont beers: one [lat, lng] pair per beer.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0, so the two
# columns are selected explicitly and converted through .values instead.
data = df_beers_VT[['lat', 'lng']].values.tolist()
beers_VT_markermap = folium.Map(location=[44, -73], zoom_start=7)
FastMarkerCluster(data=data).add_to(beers_VT_markermap)
beers_VT_markermap

In [None]:
# Combined clustered markers: the California points followed by the Vermont points.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0, so the two
# columns are selected explicitly and converted through .values instead.
data = df_beers_CA[['lat', 'lng']].values.tolist() + df_beers_VT[['lat', 'lng']].values.tolist()
beers_markermap = folium.Map(location=[40, -102], zoom_start=4)
FastMarkerCluster(data=data).add_to(beers_markermap)
beers_markermap

### Beers by city (Plotly Bubble Maps)

In [78]:
# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. Set PLOTLY_API_KEY (and optionally PLOTLY_USERNAME) before
# starting the kernel; a missing key raises KeyError here rather than failing
# later during the upload.
# NOTE(review): an API key used to be hardcoded in this cell. Treat it as
# leaked and regenerate it in the Plotly account settings.
PLOTLY_USERNAME = os.environ.get('PLOTLY_USERNAME', 'semendez')
API_KEY = os.environ['PLOTLY_API_KEY']
plotly.tools.set_credentials_file(username=PLOTLY_USERNAME, api_key=API_KEY)

In [153]:
# Hover label for each city, e.g. "Denver<br>Beers: 5400".
beer_totals = df_beers_by_city['beers_count'].astype(str)
df_beers_by_city['text'] = df_beers_by_city['city'] + '<br>Beers: ' + beer_totals

# Row-position bands of df_beers_by_city (start inclusive, stop exclusive);
# each band becomes one trace drawn in the colour at the same position.
limits = [(0, 3), (3, 10), (10, 20), (20, 50), (50, 2000)]
colors = [
    "rgb(0,116,217)",
    "rgb(255,65,54)",
    "rgb(133,20,75)",
    "rgb(255,133,27)",
    "lightgrey",
]

# Traces collected by the loop in the next cell.
cities = []

# Bubble size is beers_count floor-divided by this factor.
scale = 5

In [154]:
# Build one scattergeo trace per band of rows in df_beers_by_city and append it
# to `cities`. `cities` is initialised in a different cell, so re-running only
# this cell appends the same traces again.
for band_index, (band_start, band_stop) in enumerate(limits):
    df_sub = df_beers_by_city[band_start:band_stop]
    bubble_marker = dict(
        size=df_sub['beers_count'] // scale,
        color=colors[band_index],
        line=dict(width=0.5, color='rgb(40,40,40)'),
        sizemode='area',
    )
    band_trace = dict(
        type='scattergeo',
        locationmode='USA-states',
        lon=df_sub['lng'],
        lat=df_sub['lat'],
        text=df_sub['text'],
        marker=bubble_marker,
        # Legend entry shows the band's row range.
        name='{0} - {1}'.format(band_start, band_stop),
    )
    cities.append(band_trace)

In [155]:
# Figure layout for the bubble map: title, legend and the US base map styling.
# The title describes what is actually plotted (beers per city); the earlier
# "2014 US city populations" text did not match the data.
layout = dict(
    title='US beers by city<br>(Click legend to toggle traces)',
    showlegend=True,
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showland=True,
        landcolor='rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
        subunitcolor="rgb(255, 255, 255)",
        countrycolor="rgb(255, 255, 255)",
    ),
)

In [156]:
# Bundle the traces with the layout, send the figure to the Plotly account
# as a public plot and render it inline.
fig = {'data': cities, 'layout': layout}
py.iplot(fig, validate=False, filename='d3-bubble-map-populations', sharing='public')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~semendez/0 or inside your plot.ly account where it is named 'd3-bubble-map-populations'


## Resources

* US States JSON: https://github.com/python-visualization/folium/blob/master/examples/data/us-states.json
* US States Abbreviations: https://gist.githubusercontent.com/rogerallen/1583593/raw/74f2ef57ac82d60dc1e7ae871d1a9c60c9953fac/us_state_abbrev.py
* Population Estimates: https://www.kaggle.com/sheetskg/est-population-us-states-puerto-rico-20102017/data