# Exploratory notebook for final project. 

In [1]:
# general imports
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shapefile as shp
import json
import requests
import datetime
import warnings 
warnings.filterwarnings("ignore")
# itertools handles the cycling
import itertools 
from pprint import pprint

# bokeh imports
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, FactorRange,GMapOptions,Dropdown
from bokeh.models import Legend
from bokeh.core.properties import value
from bokeh.transform import factor_cmap
# select a palette
from bokeh.palettes import Spectral3
from bokeh.palettes import Category20b_13 as palette
from bokeh.palettes import Category20b_14 as palette2
from bokeh.transform import dodge
from bokeh.io import reset_output, show
from bokeh.plotting import gmap

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import datasets, metrics, model_selection, svm
from sklearn.metrics import classification_report

sns.set(style='darkgrid', palette='muted', color_codes=True)

# Magic command useful jupyter notebook
%matplotlib inline

#Set plot size.
plt.rcParams['figure.figsize'] = [13,7]

#Set font size
plt.rcParams.update({'font.size':16})

In [2]:
# Restaurant inspection results, read from a CSV file in the working directory.
INSPECTIONS_CSV = 'DOHMH_New_York_City_Restaurant_Inspection_Results.csv'
df = pd.read_csv(INSPECTIONS_CSV)

In [3]:
# List the column names of the raw inspection table.
df.columns

Index(['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'CRITICAL FLAG', 'SCORE', 'GRADE',
       'GRADE DATE', 'RECORD DATE', 'INSPECTION TYPE', 'Latitude', 'Longitude',
       'Community Board', 'Council District', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')

In [4]:
# Number of non-null values in each column.
df.count()

CAMIS                    402052
DBA                      401681
BORO                     402052
BUILDING                 401788
STREET                   402050
ZIPCODE                  396568
PHONE                    402035
CUISINE DESCRIPTION      402052
INSPECTION DATE          402052
ACTION                   400764
VIOLATION CODE           396492
VIOLATION DESCRIPTION    393097
CRITICAL FLAG            393097
SCORE                    385289
GRADE                    203739
GRADE DATE               201077
RECORD DATE              402052
INSPECTION TYPE          400764
Latitude                 401630
Longitude                401630
Community Board          396148
Council District         396160
Census Tract             396160
BIN                      394401
BBL                      401630
NTA                      396148
dtype: int64

In [5]:
# Number of missing values in each column.
df.isna().sum()

CAMIS                         0
DBA                         371
BORO                          0
BUILDING                    264
STREET                        2
ZIPCODE                    5484
PHONE                        17
CUISINE DESCRIPTION           0
INSPECTION DATE               0
ACTION                     1288
VIOLATION CODE             5560
VIOLATION DESCRIPTION      8955
CRITICAL FLAG              8955
SCORE                     16763
GRADE                    198313
GRADE DATE               200975
RECORD DATE                   0
INSPECTION TYPE            1288
Latitude                    422
Longitude                   422
Community Board            5904
Council District           5892
Census Tract               5892
BIN                        7651
BBL                         422
NTA                        5904
dtype: int64

In [6]:
# First 50 rows of the GRADE and SCORE columns, side by side.
df.loc[:, ['GRADE', 'SCORE']].head(50)

Unnamed: 0,GRADE,SCORE
0,A,13.0
1,B,19.0
2,,75.0
3,A,12.0
4,A,11.0
5,,10.0
6,,52.0
7,A,11.0
8,A,7.0
9,A,9.0


In [7]:
# Keep the identifying, location and score columns, then drop every row that
# is missing a value in any of them.
df_unique = df.loc[
    :,
    ['CAMIS', 'CUISINE DESCRIPTION', 'STREET', 'SCORE',
     'Latitude', 'Longitude', 'DBA', 'BORO']
].dropna()

In [8]:
# Mean SCORE for each distinct combination of the grouping columns; the group
# keys stay as ordinary columns (as_index=False).
df_loc = (
    df_unique
    .groupby(
        ['CAMIS', 'CUISINE DESCRIPTION', 'STREET', 'Latitude', 'Longitude', 'DBA', 'BORO'],
        as_index=False,
    )['SCORE']
    .mean()
)

In [9]:
# Display the aggregated table (mean SCORE per group).
df_loc

Unnamed: 0,CAMIS,CUISINE DESCRIPTION,STREET,Latitude,Longitude,DBA,BORO,SCORE
0,30075445,Bakery,MORRIS PARK AVE,40.848231,-73.855972,MORRIS PARK BAKE SHOP,Bronx,10.600000
1,30112340,Hamburgers,FLATBUSH AVENUE,40.662652,-73.962081,WENDY'S,Brooklyn,19.809524
2,30191841,Irish,WEST 57 STREET,40.767326,-73.984310,DJ REYNOLDS PUB AND RESTAURANT,Manhattan,10.888889
3,40356018,American,STILLWELL AVENUE,40.579920,-73.982090,RIVIERA CATERERS,Brooklyn,11.125000
4,40356483,Delicatessen,AVENUE U,40.620112,-73.906989,WILKEN'S FINE FOOD,Brooklyn,13.000000
...,...,...,...,...,...,...,...,...
25977,50103853,American,CLARKSON AVE,40.655755,-73.944580,D BUILDING CAFE,Brooklyn,2.000000
25978,50103876,Pizza/Italian,METROPOLITAN AVE,40.712469,-73.895242,FONTANA PIZZERIA,Queens,25.000000
25979,50103991,Pizza,TRINITY PL,40.709494,-74.011813,LAZZARO,Manhattan,2.000000
25980,50104160,Café/Coffee/Tea,ORCHARD ST,40.717976,-73.990296,SUNDAY TO SUNDAY,Manhattan,2.000000


In [10]:
# Distinct cuisine labels, in order of first appearance in df_loc.
cusines = pd.unique(df_loc['CUISINE DESCRIPTION'])

In [11]:
# Show the cuisine labels as a plain Python list.
cusines.tolist()

['Bakery',
 'Hamburgers',
 'Irish',
 'American',
 'Delicatessen',
 'Ice Cream, Gelato, Yogurt, Ices',
 'Hotdogs',
 'Jewish/Kosher',
 'Chinese',
 'Sandwiches/Salads/Mixed Buffet',
 'Caribbean',
 'Donuts',
 'Bagels/Pretzels',
 'Continental',
 'Pizza',
 'Soul Food',
 'Pizza/Italian',
 'Steak',
 'Italian',
 'Polish',
 'Latin (Cuban, Dominican, Puerto Rican, South & Central American)',
 'German',
 'French',
 'Spanish',
 'Café/Coffee/Tea',
 'Seafood',
 'Tex-Mex',
 'Bottled beverages, including water, sodas, juices, etc.',
 'Mexican',
 'Japanese',
 'Greek',
 'Thai',
 'Indian',
 'Mediterranean',
 'Russian',
 'Eastern European',
 'Chicken',
 'Ethiopian',
 'Barbecue',
 'Middle Eastern',
 'Korean',
 'Egyptian',
 'English',
 'Pancakes/Waffles',
 'Other',
 'Chinese/Cuban',
 'Asian',
 'Portuguese',
 'Indonesian',
 'Armenian',
 'Turkish',
 'Moroccan',
 'Hawaiian',
 'Vegetarian',
 'Filipino',
 'Juice, Smoothies, Fruit Salads',
 'Brazilian',
 'Vietnamese/Cambodian/Malaysia',
 'Soups & Sandwiches',
 'Af

In [15]:
# Fixed list of 19 cuisines; replaces the array computed from df_loc and is
# what the map legend iterates over.
cusines = [
    'Bakery', 'Hamburgers', 'Irish', 'American', 'Delicatessen',
    'Ice Cream, Gelato, Yogurt, Ices', 'Hotdogs', 'Jewish/Kosher',
    'Chinese', 'Sandwiches/Salads/Mixed Buffet', 'Caribbean', 'Donuts',
    'Bagels/Pretzels', 'Continental', 'Pizza', 'Soul Food',
    'Pizza/Italian', 'Steak', 'Italian',
]

In [16]:
import os

# Google Maps scatter plot of restaurants with one legend-toggleable layer per
# cuisine in `cusines`.

# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# The key is read from the environment so that it is never stored in the
# notebook. NOTE(review): a key was previously committed in this cell and
# should be revoked.
GOOGLE_MAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not GOOGLE_MAPS_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable to a Google Maps API key."
    )

circle = {}  # cuisine name -> glyph renderer drawn for that cuisine
items = []   # (label, [renderer]) pairs consumed by the custom legend
colors = itertools.cycle(palette)  # palette repeats when there are more cuisines than colours

map_options = GMapOptions(lat=40.848231, lng=-73.855972, map_type="roadmap", zoom=11)
reset_output()
output_notebook()
p = gmap(GOOGLE_MAPS_API_KEY, map_options, title="New York", plot_width=1000, plot_height=400)

for cuisine, color in zip(cusines, colors):
    # Each layer gets only the rows of its own cuisine; with a single shared
    # source every layer would draw every restaurant and the legend entries
    # would be indistinguishable.
    cuisine_source = ColumnDataSource(df_loc[df_loc['CUISINE DESCRIPTION'] == cuisine])
    # Layers start muted and are fully transparent while muted (muted_alpha=0),
    # so a cuisine only appears once its legend entry is clicked.
    circle[cuisine] = p.circle(
        x="Longitude", y="Latitude", size=7, fill_alpha=0.8,
        source=cuisine_source, muted_alpha=0, muted=True, color=color,
    )
    items.append((cuisine, [circle[cuisine]]))

# Legend is placed outside the plot area; clicking an entry toggles muting.
legend = Legend(items=items, location=(0, 10), click_policy="mute")
p.add_layout(legend, 'right')

show(p)



In [208]:
# NOTE(review): this cell does not run as written. `df_hour` and `focuscrimes`
# are not defined anywhere in the visible part of this notebook, and the
# recorded output is "NameError: name 'df_hour' is not defined". It looks like
# it was carried over from a different analysis — confirm, then adapt it to the
# restaurant data or remove it.
source = ColumnDataSource(df_loc)
colors = itertools.cycle(palette)

# Categorical x-axis labels: the index of df_hour converted to strings.
hours = [str(elem) for elem in df_hour.index.to_list()]

output_notebook()

p = figure(x_range = FactorRange(factors = hours),width=1000, height=400)


bar ={} # vbar renderers, keyed by the name taken from focuscrimes
items = [] # (label, [renderer]) pairs for the custom legend
# One vbar layer per (name, colour) pair; the name is used as the `top` column.
for indx,i in enumerate(zip(focuscrimes,colors)):
    bar[i[0]] = p.vbar(x='Time', top=i[0],width = 0.9,source= source,muted_alpha = False, muted = True,color=i[1]) 
    # i[0] is the column plotted as bar height, i[1] the colour paired with it.
    # Each renderer is created muted, with muted_alpha passed as False.
    items.append((i[0], [bar[i[0]]]))
# Legend placed to the right of the plot; clicking an entry toggles muting.
legend = Legend(items=items, location=(0,20),click_policy="mute")
p.add_layout(legend, 'right')
# NOTE(review): unlike the map cell, this cell never calls show(p).

NameError: name 'df_hour' is not defined

In [18]:
# Print each (cuisine, colour) pairing drawn from the colour cycle, followed by
# the cuisine name on its own line.
for indx, (cuisine, colour) in enumerate(zip(cusines, colors)):
    print(indx, (cuisine, colour))
    print(cuisine)

0 ('Bakery', '#843c39')
Bakery
1 ('Hamburgers', '#393b79')
Hamburgers
2 ('Irish', '#5254a3')
Irish
3 ('American', '#6b6ecf')
American
4 ('Delicatessen', '#9c9ede')
Delicatessen
5 ('Ice Cream, Gelato, Yogurt, Ices', '#637939')
Ice Cream, Gelato, Yogurt, Ices
6 ('Hotdogs', '#8ca252')
Hotdogs
7 ('Jewish/Kosher', '#b5cf6b')
Jewish/Kosher
8 ('Chinese', '#cedb9c')
Chinese
9 ('Sandwiches/Salads/Mixed Buffet', '#8c6d31')
Sandwiches/Salads/Mixed Buffet
10 ('Caribbean', '#bd9e39')
Caribbean
11 ('Donuts', '#e7ba52')
Donuts
12 ('Bagels/Pretzels', '#e7cb94')
Bagels/Pretzels
13 ('Continental', '#843c39')
Continental
14 ('Pizza', '#393b79')
Pizza
15 ('Soul Food', '#5254a3')
Soul Food
16 ('Pizza/Italian', '#6b6ecf')
Pizza/Italian
17 ('Steak', '#9c9ede')
Steak
18 ('Italian', '#637939')
Italian


## API tryout

In [22]:
import requests
import pandas as pd

In [29]:
# Request New York restaurants from the opentable.herokuapp.com endpoint and
# keep the decoded JSON payload for inspection.
params = {'city': 'New York', 'per_page': '100'}
r = requests.get(
    'http://opentable.herokuapp.com/api/restaurants',
    params=params,
    timeout=30,  # requests has no default timeout; without one a dead server hangs the kernel
)
r.raise_for_status()  # fail on a 4xx/5xx response instead of trying to parse an error body
json_response = r.json()
json_response

{'total_entries': 1668,
 'per_page': 25,
 'current_page': 1,
 'restaurants': [{'id': 4162,
   'name': 'Orsay',
   'address': '1057-59 Lexington Avenue',
   'city': 'New York',
   'state': 'NY',
   'area': 'New York / Tri-State Area',
   'postal_code': '10021',
   'country': 'US',
   'phone': '2125176400x0',
   'lat': 40.772397,
   'lng': -73.960762,
   'price': 3,
   'reserve_url': 'http://www.opentable.com/single.aspx?rid=4162',
   'mobile_reserve_url': 'http://mobile.opentable.com/opentable/?restId=4162',
   'image_url': 'https://www.opentable.com/img/restimages/4162.jpg'},
  {'id': 24202,
   'name': 'Hill Country - Flatiron',
   'address': '30 West 26th Street',
   'city': 'New York',
   'state': 'NY',
   'area': 'New York / Tri-State Area',
   'postal_code': '10010',
   'country': 'US',
   'phone': '2122554544x10',
   'lat': 40.744163,
   'lng': -73.990471,
   'price': 2,
   'reserve_url': 'http://www.opentable.com/single.aspx?rid=24202',
   'mobile_reserve_url': 'http://mobile.ope

In [28]:
# Tabulate the records under the response's 'restaurants' key, one row each.
restaurant_records = json_response['restaurants']
data = pd.DataFrame.from_dict(restaurant_records)
data

Unnamed: 0,id,name,address,city,state,area,postal_code,country,phone,lat,lng,price,reserve_url,mobile_reserve_url,image_url
0,4162,Orsay,1057-59 Lexington Avenue,New York,NY,New York / Tri-State Area,10021,US,2125176400x0,40.772397,-73.960762,3,http://www.opentable.com/single.aspx?rid=4162,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/4162.jpg
1,24202,Hill Country - Flatiron,30 West 26th Street,New York,NY,New York / Tri-State Area,10010,US,2122554544x10,40.744163,-73.990471,2,http://www.opentable.com/single.aspx?rid=24202,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/24202...
2,97870,Bacchanal,146 Bowery,New York,NY,New York / Tri-State Area,10013,US,646-355-1840x,40.719441,-73.994966,2,http://www.opentable.com/single.aspx?rid=97870,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/97870...
3,139675,SKÁL,37 Canal St.,New York,NY,New York / Tri-State Area,10002,US,2127777518,40.714760,-73.990943,3,http://www.opentable.com/single.aspx?rid=139675,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/13967...
4,5656,MINT,150 E. 50th Street,New York,NY,New York / Tri-State Area,10022,US,2126448888,40.756000,-73.972000,3,http://www.opentable.com/single.aspx?rid=5656,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/5656.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,21229,Marc Forgione,134 Reade Street,New York,NY,New York / Tri-State Area,10013,US,2129419401,40.716558,-74.009566,3,http://www.opentable.com/single.aspx?rid=21229,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/21229...
96,115345,Pergola,36 West 28th St,New York,NY,New York / Tri-State Area,10001,US,2126794842x,40.745394,-73.989588,2,http://www.opentable.com/single.aspx?rid=115345,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/11534...
97,10417,Staghorn Steakhouse,315 West 36th Street,New York,NY,New York / Tri-State Area,10018,US,2122394390,40.751695,-73.987770,4,http://www.opentable.com/single.aspx?rid=10417,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/10417...
98,155920,Nagomi,179 Prince St,New York,NY,New York / Tri-State Area,10012,US,2123878230,40.726489,-74.001876,3,http://www.opentable.com/single.aspx?rid=155920,http://mobile.opentable.com/opentable/?restId=...,https://www.opentable.com/img/restimages/15592...


In [31]:
def create_restdata(city='New York', per_page=100, n_pages=17):
    """Download restaurant listings page by page and return them as one DataFrame.

    Pages 1..n_pages are requested in order from the opentable.herokuapp.com
    restaurants endpoint. Downloading stops early at the first page that
    contains no restaurants.

    Parameters
    ----------
    city : str
        Value sent as the ``city`` query parameter.
    per_page : int
        Value sent as the ``per_page`` query parameter.
    n_pages : int
        Highest page number to request.

    Returns
    -------
    pandas.DataFrame
        All downloaded records with a fresh 0..n-1 index; an empty DataFrame
        if no page returned any restaurant.

    Raises
    ------
    requests.HTTPError
        If any page request returns a 4xx/5xx status.
    """
    url = 'http://opentable.herokuapp.com/api/restaurants'
    frames = []
    for page in range(1, n_pages + 1):
        params = {'city': city, 'per_page': str(per_page), 'page': page}
        # Explicit timeout: requests would otherwise wait indefinitely.
        r = requests.get(url, params=params, timeout=30)
        r.raise_for_status()
        restaurants = r.json().get('restaurants', [])
        if not restaurants:
            # Past the last page with data; nothing more to fetch.
            break
        frames.append(pd.DataFrame(restaurants))

    if not frames:
        # pd.concat raises on an empty list, so return an empty table instead.
        return pd.DataFrame()
    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [33]:
# Call create_restdata() with its defaults and display the returned value.
create_restdata()