In [16]:
import datetime
import itertools
import json
import numpy as np
import os
import pandas as pd
from pandas_helpers import *
import pickle
import pprint
import random
import re
import requests
import seaborn
import sys
import time

pd.set_option('display.precision', 20)

DATA_DIR = 'data/citydata/food_vendor'

random.seed(90210)

In [50]:
df = df_orig = pd.read_csv(os.path.join(DATA_DIR, 'CityOfBoston_Active_Food_Establishment_Licenses.csv'),
                           parse_dates=['LicenseAddDtTm'])

In [51]:
df.head()

Unnamed: 0,BusinessName,DBAName,Address,City,State,Zip,LICSTATUS,LICENSECAT,DESCRIPT,LicenseAddDtTm,DAYPHN,Property_ID,Location
0,# 7 RESTAURANT,,225 Grove,West Roxbury,MA,2132,Active,FT,Eating & Drinking w/ Take Out,2007-03-23 15:20:59,10000000000,156965.0,"(42.261873, -71.15741)"
1,129 Lake Street Cafe,,127 LAKE,Brighton/,MA,2135,Active,FS,Eating & Drinking,2010-10-14 12:49:26,16175523307,0.0,"(42.3594, -71.0587)"
2,149 Eat Street,,300 CHELSEA,Charlestown/,MA,2129,Active,FS,Eating & Drinking,2007-10-11 09:08:52,16177262520,0.0,"(42.3594, -71.0587)"
3,163 Vietnamese Sandwich,,66 Harrison,BOSTON,MA,2111,Active,FS,Eating & Drinking,2007-08-09 13:26:45,16175427903,,"(42.3594, -71.0587)"
4,20TH CENTRY BOWLING LANES,,1231 Hyde Park,Hyde Park,MA,2136,Active,FT,Eating & Drinking w/ Take Out,2006-12-07 13:34:04,13392372645,77089.0,"(42.25682, -71.12411)"


In [52]:
df = df[df['LicenseAddDtTm'].dt.year >= 2012]

In [53]:
df.head()

Unnamed: 0,BusinessName,DBAName,Address,City,State,Zip,LICSTATUS,LICENSECAT,DESCRIPT,LicenseAddDtTm,DAYPHN,Property_ID,Location
10,7 Pond Coffee Bar,,597 Centre,Jamaica Plain,MA,2130,Active,FT,Eating & Drinking w/ Take Out,2013-08-15 08:59:40,10000000000,28129.0,"(42.3149, -71.11427)"
13,75 On Liberty Wharf,,220 Northern,Boston,MA,2210,Active,FS,Eating & Drinking,2012-05-18 10:28:35,16178547631,342823.0,"(42.3594, -71.0587)"
16,A @ Time,,417 Cambridge,Allston,MA,2134,Active,FT,Eating & Drinking w/ Take Out,2012-11-19 13:13:38,10000000000,25121.0,"(42.355123, -71.134062)"
21,Abby Lane,,253 Tremont,Boston,MA,2116,Active,FS,Eating & Drinking,2012-05-01 14:10:49,17812548888,137467.0,"(42.35044, -71.06535)"
23,ABIGAIL'S TEA ROOM @CONGRESS BRIDGE,,306 Congress,Boston,MA,2210,Active,FS,Eating & Drinking,2012-01-17 12:47:36,16177371773,,"(42.3594, -71.0587)"


In [54]:
len(df)

556

In [55]:
pd.unique(df['DESCRIPT'])

array(['Eating & Drinking w/ Take Out', 'Eating & Drinking'], dtype=object)

In [56]:
try:
    df.drop(['DBAName', 'Address', 'City', 'State', 'Zip', 'LICSTATUS', 'LICENSECAT', 'DAYPHN', 'Property_ID'],
           axis=1,
           inplace=True)
except ValueError:
    pass

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [57]:
df.head()

Unnamed: 0,BusinessName,DESCRIPT,LicenseAddDtTm,Location
10,7 Pond Coffee Bar,Eating & Drinking w/ Take Out,2013-08-15 08:59:40,"(42.3149, -71.11427)"
13,75 On Liberty Wharf,Eating & Drinking,2012-05-18 10:28:35,"(42.3594, -71.0587)"
16,A @ Time,Eating & Drinking w/ Take Out,2012-11-19 13:13:38,"(42.355123, -71.134062)"
21,Abby Lane,Eating & Drinking,2012-05-01 14:10:49,"(42.35044, -71.06535)"
23,ABIGAIL'S TEA ROOM @CONGRESS BRIDGE,Eating & Drinking,2012-01-17 12:47:36,"(42.3594, -71.0587)"


In [58]:
df = df.drop('BusinessName', axis=1)

In [59]:
df = df.drop('Location', axis=1)

In [60]:
df['date'] = df['LicenseAddDtTm'].map(lambda dt: dt.strftime("%Y-%m-%d"))

In [61]:
df = df.drop('LicenseAddDtTm', axis=1)

In [62]:
df.head()

Unnamed: 0,DESCRIPT,date
10,Eating & Drinking w/ Take Out,2013-08-15
13,Eating & Drinking,2012-05-18
16,Eating & Drinking w/ Take Out,2012-11-19
21,Eating & Drinking,2012-05-01
23,Eating & Drinking,2012-01-17


In [63]:
df = pd.pivot_table(df, index='date', columns='DESCRIPT', aggfunc=len).fillna(0).reset_index()
df.head()

DESCRIPT,date,Eating & Drinking,Eating & Drinking w/ Take Out
0,2012-01-04,0,1
1,2012-01-06,1,2
2,2012-01-11,0,1
3,2012-01-12,1,2
4,2012-01-17,2,3


In [64]:
df[df['date'] == '2013-08-15']

DESCRIPT,date,Eating & Drinking,Eating & Drinking w/ Take Out
228,2013-08-15,3,1


In [69]:
df_orig[(df_orig['LicenseAddDtTm'].dt.year == 2013) &
        (df_orig['LicenseAddDtTm'].dt.month == 8) &
        (df_orig['LicenseAddDtTm'].dt.day == 15)]

Unnamed: 0,BusinessName,DBAName,Address,City,State,Zip,LICSTATUS,LICENSECAT,DESCRIPT,LicenseAddDtTm,DAYPHN,Property_ID,Location
10,7 Pond Coffee Bar,,597 Centre,Jamaica Plain,MA,2130,Active,FT,Eating & Drinking w/ Take Out,2013-08-15 08:59:40,10000000000,28129,"(42.3149, -71.11427)"
1873,PHO SO 1 BOSTON,,223 Adams,Dorchester,MA,2122,Active,FS,Eating & Drinking,2013-08-15 09:23:52,16174368888,156041,"(42.301468, -71.060031)"
1881,Pier 6 Restaurant,,1 Eighth,Charlestown,MA,2129,Active,FS,Eating & Drinking,2013-08-15 08:49:47,10000000000,343036,"(42.373172, -71.050092)"
1902,Pizza Stop,,851 Harrison,Roxbury,MA,2118,Active,FS,Eating & Drinking,2013-08-15 08:19:58,16174274054,70068,"(42.33474, -71.07567)"


In [71]:
df['total'] = df[['Eating & Drinking', 'Eating & Drinking w/ Take Out']].sum(axis=1)

In [73]:
df.head()

DESCRIPT,date,Eating & Drinking,Eating & Drinking w/ Take Out,total
0,2012-01-04,0,1,1
1,2012-01-06,1,2,3
2,2012-01-11,0,1,1
3,2012-01-12,1,2,3
4,2012-01-17,2,3,5


In [75]:
assert df['total'].sum() == 556.0

In [76]:
df = df_rearrange_columns(df,
                          ['date', 'eating_and_drinking', 'eating_and_drinking_with_take_out', 'total'],
                          ['date', 'eating_and_drinking', 'eating_and_drinking_with_take_out', 'total'])
df.head()

Unnamed: 0,date,eating_and_drinking,eating_and_drinking_with_take_out,total
0,2012-01-04,0,1,1
1,2012-01-06,1,2,3
2,2012-01-11,0,1,1
3,2012-01-12,1,2,3
4,2012-01-17,2,3,5


In [81]:
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].map(lambda x: int((x - datetime.datetime(1970, 1, 1)).total_seconds() * 1000))

In [82]:
df.head()

Unnamed: 0,date,eating_and_drinking,eating_and_drinking_with_take_out,total
0,1325635200000,0,1,1
1,1325808000000,1,2,3
2,1326240000000,0,1,1
3,1326326400000,1,2,3
4,1326758400000,2,3,5


In [89]:
df.columns = ['start_date'] +  df.columns.tolist()[1:]

In [90]:
df.head()

Unnamed: 0,start_date,eating_and_drinking,eating_and_drinking_with_take_out,total
0,1325635200000,0,1,1
1,1325808000000,1,2,3
2,1326240000000,0,1,1
3,1326326400000,1,2,3
4,1326758400000,2,3,5


In [93]:
df = df.astype(int)

In [94]:
df_to_json_split_wo_index(df, 
                          'food_vendor_graph_data.json', 
                          ['Start Date (ms from epoch)',
                           'Vendors licensed as Eating and Drinking',
                           'Vendors licensed as Eating and Drinking with Take Out',
                           'Total Vendors Licensed'])

In [95]:
df['total'].sum()

556

# Food Vendor Description Data

In [169]:
df = df_orig = pd.read_csv(os.path.join(DATA_DIR, 'CityOfBoston_Active_Food_Establishment_Licenses.csv'),
                           usecols=['BusinessName', 'Address', 'City', 'DESCRIPT', 'Location', 'LicenseAddDtTm'],
                           parse_dates=['LicenseAddDtTm'])

In [170]:
df.head()

Unnamed: 0,BusinessName,Address,City,DESCRIPT,LicenseAddDtTm,Location
0,# 7 RESTAURANT,225 Grove,West Roxbury,Eating & Drinking w/ Take Out,2007-03-23 15:20:59,"(42.261873, -71.15741)"
1,129 Lake Street Cafe,127 LAKE,Brighton/,Eating & Drinking,2010-10-14 12:49:26,"(42.3594, -71.0587)"
2,149 Eat Street,300 CHELSEA,Charlestown/,Eating & Drinking,2007-10-11 09:08:52,"(42.3594, -71.0587)"
3,163 Vietnamese Sandwich,66 Harrison,BOSTON,Eating & Drinking,2007-08-09 13:26:45,"(42.3594, -71.0587)"
4,20TH CENTRY BOWLING LANES,1231 Hyde Park,Hyde Park,Eating & Drinking w/ Take Out,2006-12-07 13:34:04,"(42.25682, -71.12411)"


In [171]:
df = df[df['LicenseAddDtTm'].dt.year >= 2012]

In [172]:
df.head()

Unnamed: 0,BusinessName,Address,City,DESCRIPT,LicenseAddDtTm,Location
10,7 Pond Coffee Bar,597 Centre,Jamaica Plain,Eating & Drinking w/ Take Out,2013-08-15 08:59:40,"(42.3149, -71.11427)"
13,75 On Liberty Wharf,220 Northern,Boston,Eating & Drinking,2012-05-18 10:28:35,"(42.3594, -71.0587)"
16,A @ Time,417 Cambridge,Allston,Eating & Drinking w/ Take Out,2012-11-19 13:13:38,"(42.355123, -71.134062)"
21,Abby Lane,253 Tremont,Boston,Eating & Drinking,2012-05-01 14:10:49,"(42.35044, -71.06535)"
23,ABIGAIL'S TEA ROOM @CONGRESS BRIDGE,306 Congress,Boston,Eating & Drinking,2012-01-17 12:47:36,"(42.3594, -71.0587)"


In [173]:
df['description'] = ''

def smush(row):
    desc = '{} [{}, {}]'.format(row['BusinessName'],
                                row['Address'],
                                row['City'])
    
    if row['DESCRIPT'].endswith('Take Out'):
        desc += ' [Takeout]'
    
    row['description'] = desc
    
    return row

df = df.apply(smush, axis=1)

try:
    df.drop(['BusinessName', 'Address', 'City', 'DESCRIPT'], axis=1, inplace=True)
except ValueError:
    pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [174]:
df.head()

Unnamed: 0,LicenseAddDtTm,Location,description
10,2013-08-15 08:59:40,"(42.3149, -71.11427)","7 Pond Coffee Bar [597 Centre, Jamaica Plain] ..."
13,2012-05-18 10:28:35,"(42.3594, -71.0587)","75 On Liberty Wharf [220 Northern, Boston]"
16,2012-11-19 13:13:38,"(42.355123, -71.134062)","A @ Time [417 Cambridge, Allston] [Takeout]"
21,2012-05-01 14:10:49,"(42.35044, -71.06535)","Abby Lane [253 Tremont, Boston]"
23,2012-01-17 12:47:36,"(42.3594, -71.0587)",ABIGAIL'S TEA ROOM @CONGRESS BRIDGE [306 Congr...


In [175]:
df['LicenseAddDtTm'] = df['LicenseAddDtTm'].map(lambda dt: int((dt - datetime.datetime(1970, 1, 1)).total_seconds() * 1000))

In [176]:
df.head()

Unnamed: 0,LicenseAddDtTm,Location,description
10,1376557180000,"(42.3149, -71.11427)","7 Pond Coffee Bar [597 Centre, Jamaica Plain] ..."
13,1337336915000,"(42.3594, -71.0587)","75 On Liberty Wharf [220 Northern, Boston]"
16,1353330818000,"(42.355123, -71.134062)","A @ Time [417 Cambridge, Allston] [Takeout]"
21,1335881449000,"(42.35044, -71.06535)","Abby Lane [253 Tremont, Boston]"
23,1326804456000,"(42.3594, -71.0587)",ABIGAIL'S TEA ROOM @CONGRESS BRIDGE [306 Congr...


In [177]:
def parse_loc(row):
    lat, lon = map(str.split, row['Location'].strip('()').split(','))
    
    row['lat'] = lat[0]
    row['lon'] = lon[0]
    
    return row

df = df.apply(parse_loc, axis=1)

In [178]:
df.head()

Unnamed: 0,LicenseAddDtTm,Location,description,lat,lon
10,1376557180000,"(42.3149, -71.11427)","7 Pond Coffee Bar [597 Centre, Jamaica Plain] ...",42.3149,-71.11427
13,1337336915000,"(42.3594, -71.0587)","75 On Liberty Wharf [220 Northern, Boston]",42.3594,-71.0587
16,1353330818000,"(42.355123, -71.134062)","A @ Time [417 Cambridge, Allston] [Takeout]",42.355123,-71.134062
21,1335881449000,"(42.35044, -71.06535)","Abby Lane [253 Tremont, Boston]",42.35044,-71.06535
23,1326804456000,"(42.3594, -71.0587)",ABIGAIL'S TEA ROOM @CONGRESS BRIDGE [306 Congr...,42.3594,-71.0587


In [179]:
try:
    df.drop('Location', axis=1, inplace=True)
except ValueError:
    pass

In [180]:
df.head()

Unnamed: 0,LicenseAddDtTm,description,lat,lon
10,1376557180000,"7 Pond Coffee Bar [597 Centre, Jamaica Plain] ...",42.3149,-71.11427
13,1337336915000,"75 On Liberty Wharf [220 Northern, Boston]",42.3594,-71.0587
16,1353330818000,"A @ Time [417 Cambridge, Allston] [Takeout]",42.355123,-71.134062
21,1335881449000,"Abby Lane [253 Tremont, Boston]",42.35044,-71.06535
23,1326804456000,ABIGAIL'S TEA ROOM @CONGRESS BRIDGE [306 Congr...,42.3594,-71.0587


In [181]:
df = df_rearrange_columns(df, 
                          ['start_date', 'description', 'latitude', 'longitude'],
                          ['start_date', 'latitude', 'longitude', 'description'])

In [182]:
df.head()

Unnamed: 0,start_date,latitude,longitude,description
10,1376557180000,42.3149,-71.11427,"7 Pond Coffee Bar [597 Centre, Jamaica Plain] ..."
13,1337336915000,42.3594,-71.0587,"75 On Liberty Wharf [220 Northern, Boston]"
16,1353330818000,42.355123,-71.134062,"A @ Time [417 Cambridge, Allston] [Takeout]"
21,1335881449000,42.35044,-71.06535,"Abby Lane [253 Tremont, Boston]"
23,1326804456000,42.3594,-71.0587,ABIGAIL'S TEA ROOM @CONGRESS BRIDGE [306 Congr...


In [183]:
df_to_json_split_wo_index(df, 
                         'food_vendor_data.json',
                         ['Start Date (ms from epoch)',
                          'Latitude of Vendor Location',
                          'Longitude of Vendor Location',
                          'Description of Vendor Name, Location, and Type'])

In [184]:
len(df)

556