# MY FINAL CAPSTONE PROJECT

## I will compare the cities of New York (USA) and Toronto (CAN) and analyse where we we have potential to open additional vegetarian/vegan restaurants.

**The libraries I am using:**

In [496]:
#!conda install -c conda-forge wikipedia --yes
#!conda install -c conda-forge folium --yes
#!conda install -c conda-forge geopy --yes

import folium
import wikipedia as wp
import numpy as np
import pandas as pd
import requests
from lxml import html
from bs4 import BeautifulSoup
from tabulate import tabulate
import json
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim 

**First I load the data of New York and Toronto and omit duplicates.**

### New York:

In [497]:
!wget -q -O "newyork_data.json" https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
    
with open("newyork_data.json") as json_data:
    ny_data_raw_1 = json.load(json_data)  
    

ny_data_raw_2 = ny_data_raw_1["features"]
ny_df = pd.DataFrame(columns = [ 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'])

for data in ny_data_raw_2:
    borough = ny_df_name = data['properties']['borough'] 
    ny_df_name = data['properties']['name']
        
    ny_df_latlon = data['geometry']['coordinates']
    ny_df_lat = ny_df_latlon[1]
    ny_df_lon = ny_df_latlon[0]
    
    ny_df = ny_df.append({'Borough': borough,
                                          'Neighbourhood': ny_df_name,
                                          'Latitude': ny_df_lat,
                                          'Longitude': ny_df_lon}, ignore_index=True)   

print("The New York data contains {} rows.".format(ny_df.shape[0]))
ny_df.drop_duplicates("Neighbourhood", inplace = True)
print("After removing the duplicates, the New York data contains {} rows.".format(ny_df.shape[0]))

The New York data contains 306 rows.
After removing the duplicates, the New York data contains 302 rows.


### Toronto:

In [498]:
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")

to_df = pd.read_html(html, header = 0)[0]
to_df = to_df[to_df.Borough != "Not assigned"]
to_df = to_df[to_df["Borough"].str.contains("Toronto")].reset_index(drop = True)

In the following cell I download the data for the spatial coordinates of the post codess, but since it accesses a file in my IBM Cloud Object Storage it includes my credentials, which are not to be shown.

In [499]:
# The code was removed by Watson Studio for sharing.

**I again remove duplicates, furthermore I drop the data for the Postcodes, since we do not need it anymore.**

In [500]:
to_df = to_df.join(spatial_data, on = "Postcode")

print("The Toronto data contains {} rows.".format(to_df.shape[0]))
to_df.drop_duplicates("Postcode", inplace = True)
print("After removing the duplicates, the Toronto data contains {} rows.".format(to_df.shape[0]))
to_df.drop(["Postcode"], axis = 1, inplace = True)

The Toronto data contains 74 rows.
After removing the duplicates, the Toronto data contains 39 rows.


### Here I map the neighbourhoods of Toronto

In [501]:
address = 'Toronto, Ontario Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighbourhood in zip(to_df['Latitude'], to_df['Longitude'], to_df['Borough'], to_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#87cefa',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)
map_toronto

  app.launch_new_instance()


### Here I map the neighbourhoods of New York

In [502]:
address = 'New York, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighbourhood in zip(ny_df['Latitude'], ny_df['Longitude'], ny_df['Borough'], ny_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#87cefa',
        fill_opacity=0.5,
        parse_html=False).add_to(map_ny)
map_ny

  app.launch_new_instance()


**The next step is importing data from foursquare. The credentials are, again hidden. We will request data from the website to analyse it.**

In [503]:
# The code was removed by Watson Studio for sharing.

In [504]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, return_text = False):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        if return_text == True:
            print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
            
    return(nearby_venues)

In [505]:
# The code was removed by Watson Studio for sharing.

### Here I take the locational data of all the Venus that contain the string "Restaurant" or "Place". Furthermore I divide the dataframe into a total and a vegetarian/vegan dataframe.

In [506]:
to_restaurants = to_venues[to_venues["Venue Category"].str.contains("Restaurant" or "Place")].reset_index(drop = True)
to_restaurants_veg = to_restaurants[to_restaurants["Venue Category"].str.contains("Veg")].reset_index(drop = True)

ny_restaurants = ny_venues[ny_venues["Venue Category"].str.contains("Restaurant" or "Place")].reset_index(drop = True)
ny_restaurants_veg = ny_restaurants[ny_restaurants["Venue Category"].str.contains("Veg")].reset_index(drop = True)

### This is how the data frame for Toronto looks like

In [507]:
to_restaurants_veg

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,St. James Town,43.651494,-79.375418,Fresh On Front,43.647815,-79.374453,Vegetarian / Vegan Restaurant
1,Berczy Park,43.644771,-79.373306,Fresh On Front,43.647815,-79.374453,Vegetarian / Vegan Restaurant
2,Central Bay Street,43.657952,-79.387383,Vegetarian Haven,43.656016,-79.392758,Vegetarian / Vegan Restaurant
3,Adelaide,43.650571,-79.384568,Rosalinda,43.650252,-79.385156,Vegetarian / Vegan Restaurant
4,Adelaide,43.650571,-79.384568,Planta Queen,43.650622,-79.388154,Vegetarian / Vegan Restaurant
5,Harbourfront East,43.640816,-79.381752,Kupfert & Kim,43.641179,-79.378144,Vegetarian / Vegan Restaurant
6,Little Portugal,43.647927,-79.41975,The Goods,43.649259,-79.424022,Vegetarian / Vegan Restaurant
7,Design Exchange,43.647177,-79.381576,Rosalinda,43.650252,-79.385156,Vegetarian / Vegan Restaurant
8,Commerce Court,43.648198,-79.379817,Fresh On Front,43.647815,-79.374453,Vegetarian / Vegan Restaurant
9,Commerce Court,43.648198,-79.379817,Rosalinda,43.650252,-79.385156,Vegetarian / Vegan Restaurant


### In New York we, of course, have more restaurants.

In [508]:
ny_restaurants_veg

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Greenpoint,40.730201,-73.954241,Jungle Cafe,40.730201,-73.954761,Vegetarian / Vegan Restaurant
1,Prospect Heights,40.676822,-73.964859,Natural Blend,40.673819,-73.962923,Vegetarian / Vegan Restaurant
2,Bushwick,40.698116,-73.925258,Hartbreakers,40.701627,-73.922853,Vegetarian / Vegan Restaurant
3,Bushwick,40.698116,-73.925258,Sol Sips,40.699135,-73.92251,Vegetarian / Vegan Restaurant
4,Fort Greene,40.688527,-73.972906,LuAnne's Wild Ginger,40.689644,-73.969824,Vegetarian / Vegan Restaurant
5,Coney Island,40.574293,-73.988683,Marty’s V Burger,40.574932,-73.984353,Vegetarian / Vegan Restaurant
6,Clinton Hill,40.693229,-73.967843,LuAnne's Wild Ginger,40.689644,-73.969824,Vegetarian / Vegan Restaurant
7,East Williamsburg,40.708492,-73.938858,Champs Diner,40.708335,-73.940816,Vegetarian / Vegan Restaurant
8,East Williamsburg,40.708492,-73.938858,Loving Hut,40.712549,-73.941106,Vegetarian / Vegan Restaurant
9,North Side,40.714823,-73.958809,by CHLOE.,40.715372,-73.959595,Vegetarian / Vegan Restaurant


## Now we have all the tools we need to map all the restaurants in blue and vegetarian/vegan restaurants in green.

### First for Toronto:

In [509]:
address = 'Toronto, Ontario Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, venue, neighbourhood in zip(to_restaurants['Venue Latitude'], to_restaurants['Venue Longitude'], to_restaurants['Venue'], to_restaurants['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#87cefa',
        fill_opacity=0.2,
        parse_html=False).add_to(map_toronto)
    
for lat, lng, venue, neighbourhood in zip(to_restaurants_veg['Venue Latitude'], to_restaurants_veg['Venue Longitude'], to_restaurants_veg['Venue'], to_restaurants_veg['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green',
        fill=True,
        fill_color='#87face',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)


map_toronto

  app.launch_new_instance()


**In Toronto it is noteworthy that vegetarian/veganrestaurants seem to be concentrated and not spread out in the city, so there is much potential to open up a business!**

In [510]:
address = 'New York, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, venue, neighbourhood in zip(ny_restaurants['Venue Latitude'], ny_restaurants['Venue Longitude'], ny_restaurants['Venue'], ny_restaurants['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#87cefa',
        fill_opacity=0.2,
        parse_html=False).add_to(map_ny)
    
for lat, lng, venue, neighbourhood in zip(ny_restaurants_veg['Venue Latitude'], ny_restaurants_veg['Venue Longitude'], ny_restaurants_veg['Venue'], ny_restaurants_veg['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, venue)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green',
        fill=True,
        fill_color='#87face',
        fill_opacity=0.5,
        parse_html=False).add_to(map_ny)


map_ny

  app.launch_new_instance()


**Manhattan seems to have a high restaurant density in general, but it is moteworthy how many vegetarian/vegan restaurants are in Manhattan. Staten Island has only 1 restaurant, Bronx not a single one and Queens and Brooklyn have just a few, so there much potential for more vegetarian/vegan restaurants.**

**To obtain numerical values di the following steps:**

In [511]:
to_df_analysis_allrest = to_restaurants[["Neighbourhood", "Venue Category"]]
ny_df_analysis_allrest = ny_restaurants[["Neighbourhood", "Venue Category"]]
to_df_analysis_vegrest = to_restaurants_veg[["Neighbourhood", "Venue Category"]]
ny_df_analysis_vegrest = ny_restaurants_veg[["Neighbourhood", "Venue Category"]]

In [512]:
to_rest_tot_data = to_df_analysis_allrest.groupby('Neighbourhood').count().reset_index(level = "Neighbourhood")
ny_rest_tot_data = ny_df_analysis_allrest.groupby('Neighbourhood').count().reset_index(level = "Neighbourhood")
to_rest_veg_data = to_df_analysis_vegrest.groupby('Neighbourhood').count().reset_index(level = "Neighbourhood")
ny_rest_veg_data = ny_df_analysis_vegrest.groupby('Neighbourhood').count().reset_index(level = "Neighbourhood")

### Here is how the dataframes for Toronto look like (first for all the restaurants, second for vegetarian/vegan restaurants:

In [513]:
to_rest_tot_data

Unnamed: 0,Neighbourhood,Venue Category
0,Adelaide,28
1,Berczy Park,11
2,Brockton,3
3,Business Reply Mail Processing Centre 969 Eastern,2
4,Cabbagetown,11
5,Central Bay Street,22
6,Chinatown,27
7,Christie,2
8,Church and Wellesley,28
9,Commerce Court,26


In [514]:
to_rest_veg_data

Unnamed: 0,Neighbourhood,Venue Category
0,Adelaide,2
1,Berczy Park,1
2,Central Bay Street,1
3,Chinatown,3
4,Commerce Court,2
5,Design Exchange,1
6,First Canadian Place,1
7,Harbourfront East,1
8,Little Portugal,1
9,Runnymede,1


### In less than a third of the Neighbourhoods we have a vegetarian/vegan restaurant!

**Now we create dataframes of the amount of total restaurants and vegetarian/vegan restaurants for each neighbourhood, to calculate the fraction of the latter.**

In [515]:
to_veg_list = []
ny_veg_list = []

i = 0
j = 0

while i < len(to_rest_tot_data["Neighbourhood"]):
    
    if j < (len(to_rest_veg_data["Neighbourhood"])):
          
        if to_rest_tot_data["Neighbourhood"][i] == to_rest_veg_data["Neighbourhood"][j]:
        
            to_veg_list.append(to_rest_veg_data["Venue Category"][j])
            j += 1
            i += 1
            
        else:
            
            to_veg_list.append(0)
            i +=1
    
    else:
        
        i+=1        
        to_veg_list.append(0)
        
 
i = 0
j = 0

while i < len(ny_rest_tot_data["Neighbourhood"]):
    
    if j < (len(ny_rest_veg_data["Neighbourhood"])):
          
        if ny_rest_tot_data["Neighbourhood"][i] == ny_rest_veg_data["Neighbourhood"][j]:
        
            ny_veg_list.append(ny_rest_veg_data["Venue Category"][j])
            j += 1
            i += 1
            
        else:
            
            ny_veg_list.append(0)
            i +=1
    
    else:
        
        i+=1        
        ny_veg_list.append(0)

### Here we concatenate the columns and create dataframes that contain all the data. Furthermore we calculate the percentage of vegetarian/vegan restaurants in each district.

In [516]:
to_veg_column = {"Vegetarian / Vegan Restaurants" : to_veg_list}
to_neighbourhood = to_rest_tot_data["Neighbourhood"]
to_num_rest = to_rest_tot_data["Venue Category"]
to_num_veg_rest = pd.DataFrame(to_veg_column)

to_eval_data = pd.concat([to_neighbourhood, to_num_rest, to_num_veg_rest], axis = 1, sort = False).rename(columns = {"Venue Category" : "Total Restaurants"})#.set_index("Neighbourhood")
to_eval_data["Percentage"] = round(to_eval_data["Vegetarian / Vegan Restaurants"] / to_eval_data["Total Restaurants"] * 100, 2)



ny_veg_column = {"Vegetarian / Vegan Restaurants" : ny_veg_list}
ny_neighbourhood = ny_rest_tot_data["Neighbourhood"]
ny_num_rest = ny_rest_tot_data["Venue Category"]
ny_num_veg_rest = pd.DataFrame(ny_veg_column)

ny_eval_data = pd.concat([ny_neighbourhood, ny_num_rest, ny_num_veg_rest], axis = 1, sort = False).rename(columns = {"Venue Category" : "Total Restaurants"})#.set_index("Neighbourhood")
ny_eval_data["Percentage"] = round(ny_eval_data["Vegetarian / Vegan Restaurants"] / ny_eval_data["Total Restaurants"] * 100, 2)

### Here are the best and worst places in Toronto to search for a vegetarian/vegan restaurant:

**Best:**

In [517]:
to_eval_data.sort_values(by = "Percentage", ascending = False).head(10)

Unnamed: 0,Neighbourhood,Total Restaurants,Vegetarian / Vegan Restaurants,Percentage
29,The Annex,4,1,25.0
6,Chinatown,27,3,11.11
24,Runnymede,10,1,10.0
1,Berczy Park,11,1,9.09
9,Commerce Court,26,2,7.69
0,Adelaide,28,2,7.14
18,Harbourfront East,15,1,6.67
20,Little Portugal,18,1,5.56
27,Stn A PO Boxes 25 The Esplanade,19,1,5.26
26,St. James Town,21,1,4.76


**Worst**:

In [518]:
to_eval_data.sort_values(by = "Percentage", ascending = False).tail(10)

Unnamed: 0,Neighbourhood,Total Restaurants,Vegetarian / Vegan Restaurants,Percentage
15,Forest Hill North,1,0,0.0
13,Dovercourt Village,1,0,0.0
11,Deer Park,4,0,0.0
10,Davisville,10,0,0.0
8,Church and Wellesley,28,0,0.0
7,Christie,2,0,0.0
4,Cabbagetown,11,0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,2,0,0.0
2,Brockton,3,0,0.0
31,The Danforth West,15,0,0.0


### Here are the best and worst places in New York to search for a vegetarian/vegan restaurant:

**Best:**

In [519]:
ny_eval_data.sort_values(by = "Percentage", ascending = False).head(10)

Unnamed: 0,Neighbourhood,Total Restaurants,Vegetarian / Vegan Restaurants,Percentage
141,Malba,1,1,100.0
245,Utopia,3,1,33.33
92,Fresh Meadows,3,1,33.33
56,Coney Island,3,1,33.33
219,South Jamaica,3,1,33.33
100,Grasmere,4,1,25.0
71,East Williamsburg,11,2,18.18
193,Queensboro Hill,7,1,14.29
217,Soho,20,2,10.0
31,Bushwick,20,2,10.0


**Worst:**

In [520]:
ny_eval_data.sort_values(by = "Percentage", ascending = False).tail(10)

Unnamed: 0,Neighbourhood,Total Restaurants,Vegetarian / Vegan Restaurants,Percentage
94,Georgetown,5,0,0.0
95,Gerritsen Beach,2,0,0.0
96,Glen Oaks,3,0,0.0
97,Gowanus,14,0,0.0
98,Gramercy,22,0,0.0
99,Grant City,6,0,0.0
101,Gravesend,3,0,0.0
102,Great Kills,7,0,0.0
105,Grymes Hill,1,0,0.0
263,Yorkville,29,0,0.0


### Who of Toronto and New York has now the higher percentage of vegetarian/vegan restaurants?

In [521]:
print("Toronto has {} Vegetarian / Vegan restaurants out of {} total restaurants, which makes up {} %.".format(sum(to_eval_data["Vegetarian / Vegan Restaurants"]), sum(to_eval_data["Total Restaurants"]), 
        round(sum(to_eval_data["Vegetarian / Vegan Restaurants"]) / sum(to_eval_data["Total Restaurants"]) * 100, 2)))

print("New York has {} Vegetarian / Vegan restaurants out of {} total restaurants, which makes up {} %.".format(sum(ny_eval_data["Vegetarian / Vegan Restaurants"]), sum(ny_eval_data["Total Restaurants"]), 
        round(sum(ny_eval_data["Vegetarian / Vegan Restaurants"]) / sum(ny_eval_data["Total Restaurants"]) * 100, 2)))

Toronto has 17 Vegetarian / Vegan restaurants out of 401 total restaurants, which makes up 4.24 %.
New York has 43 Vegetarian / Vegan restaurants out of 2568 total restaurants, which makes up 1.67 %.


# Toronto has a higher percentage of Vegetarian/Vegan restaurants than New York!

### Who of Toronto and New York as a higher density?

In [522]:
to_area = 630 #km^2
ny_area = 790 #km^2
to_pop = 2.62e+6
ny_pop = 8.40e+6

print("Toronto has {} Vegetarian/Vegan restaurants per km$^2$ and {} per million inhabitants.".format(round(sum(to_eval_data["Vegetarian / Vegan Restaurants"])
                                                                                                            / to_area, 2), round(sum(to_eval_data["Vegetarian / Vegan Restaurants"]) / to_pop * 1e+6, 2)))
print("New York has {} Vegetarian/Vegan restaurants per km$^2$ and {} per million inhabitants.".format(round(sum(ny_eval_data["Vegetarian / Vegan Restaurants"])
                                                                                                            / to_area, 2), round(sum(ny_eval_data["Vegetarian / Vegan Restaurants"]) / to_pop * 1e+6, 2)))

Toronto has 0.03 Vegetarian/Vegan restaurants per km$^2$ and 6.49 per million inhabitants.
New York has 0.07 Vegetarian/Vegan restaurants per km$^2$ and 16.41 per million inhabitants.


## New York has a higher vegetarian/vegan restaurant density than Toronto! The lower percentage is due to the fact that New York just has way more restaurants than Toronto!

In [523]:
# The code was removed by Watson Studio for sharing.

'\nplt.style.use("seaborn")\nKs = 6\nmse = np.zeros((Ks-1))\nto_ny_data_clustering = total_eval_data.drop(columns = ["Neighbourhood"])\nfor n in range(1,Ks):\n    \n    # set number of clusters\n    kclusters = n\n    # run k-means clustering\n    kmeans = KMeans(n_clusters=kclusters, random_state=0, init = \'random\', n_init = 10).fit(to_ny_data_clustering)\n    mse[n-1] = kmeans.inertia_\n\nplt.plot(range(1,Ks),mse)\nplt.xlabel("Number of clusters")\nplt.ylabel("MSE")\nplt.title("K selection")\nplt.show()\nnum_klusters = 3\nkmeans = KMeans(n_clusters=num_klusters, random_state=1, init = \'random\', n_init = 15).fit(to_ny_data_clustering)\n    \nto_ny_data_clustering[\'Cluster Labels\'] = kmeans.labels_\n\ntotal_data_clustered = pd.concat([total_eval_data, pd.DataFrame(to_ny_data_clustering["Cluster Labels"])], axis = 1)\ntotal_data_clustered\nto_ny_data_clustering\ntotal_eval_data\nny_df_for_later\ntotal_data_clustered.loc[total_data_clustered[\'Cluster Labels\'] == 0, total_data_clu