# A Location Recommender System for Business Startup

In [2]:
# Importing the necessary libraries

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Toronto data with the postal code

In [3]:
# Loading the Toronto neighborhood data the contains the postal code, latitude, longitude
toronto_df = pd.read_csv('toronto_data_geo.csv')
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
1,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",43.647177,-79.381576
2,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
3,M9N,York,Weston,43.706876,-79.518188
4,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.646435,-79.374846


## Get the latitude and longitude of Toronto

In [4]:
address = 'Toronto'
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of {} and {}, {}.'.format(address, latitude, longitude))

The geographical coordinate of Toronto and 43.653963, -79.387207.


## Create a map of Toronto with neighborhoods superimposed on top

In [5]:
toronto_map = folium.Map(location = [latitude, longitude], zoom_start=10.3)

for lat, long, borough, neigh in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neigh, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        colors='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)

toronto_map

## Focusing on the Borough of `North York` in `Toronto`  and its neighborhood

### Now is time to slice the original dataframe and create a new dataframe of the North York data

In [6]:
northyork = toronto_df[toronto_df['Borough']=='North York']
northyork.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
8,M4A,North York,Victoria Village,43.725882,-79.315572
10,M2L,North York,"Silver Hills, York Mills",43.75749,-79.374714
11,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
12,M2R,North York,Willowdale West,43.782736,-79.442259


#### Let get the geographical coordinates of `North York`

In [7]:
address = 'North York'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of North York are 43.7708175, -79.4132998.


#### Lets visualize `North York` and the neighborhoods in it

In [8]:
# create map of North York using latitude and longitude values
northyork_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, long, label in zip(northyork['Latitude'], northyork['Longitude'], northyork['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=10,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(northyork_map)  
    
northyork_map

#### Foursquare credentials

In [9]:
CLIENT_ID = 'TF5GWD1JC4MTFSRIJBNHTX225TCKIBO4IBZDAWY20U1KKEV5' # your Foursquare ID
CLIENT_SECRET = '3F0PHOPC4O0R140AMIPJP4FCMEW30EGWSAIQGQO5HEKXHTA0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TF5GWD1JC4MTFSRIJBNHTX225TCKIBO4IBZDAWY20U1KKEV5
CLIENT_SECRET:3F0PHOPC4O0R140AMIPJP4FCMEW30EGWSAIQGQO5HEKXHTA0


In [10]:
def foursquare_crawler (postalcode_list, neigh_list, lat_list, long_list, LIMIT = 500, radius = 1000):
    ny_result = []
    counter = 0
    for postalcode, neigh, lat, long in zip(postalcode_list, neigh_list, lat_list, long_list):
         
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                                                                                                                    CLIENT_ID, 
                                                                                                                    CLIENT_SECRET, 
                                                                                                                    VERSION, 
                                                                                                                    lat, 
                                                                                                                    long, 
                                                                                                                    radius, 
                                                                                                                    LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        ny_dict = {}
        ny_dict['Postal Code'] = postalcode; ny_dict['Neighborhood(s)'] = neigh; 
        ny_dict['Latitude'] = lat; ny_dict['Longitude'] = long;
        ny_dict['Crawling_result'] = results;
        ny_result.append(ny_dict)
        counter += 1
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} and Neighborhoods ({}) was SUCCESSFUL.'.format(postalcode, neigh))
    return ny_result;

## Getting internet data from the Foursquare database

### Venues in the Neighborhoods inside `North York`

In [11]:
print('Getting different neighborhoods inside "North York"')
northyork_foursquare_dataset = foursquare_crawler(list(northyork['Postal Code']),
                                                   list(northyork['Neighborhood']),
                                                   list(northyork['Latitude']),
                                                   list(northyork['Longitude']))

Getting different neighborhoods inside "North York"
1.
Data is Obtained, for the Postal Code M2J and Neighborhoods (Fairview, Henry Farm, Oriole) was SUCCESSFUL.
2.
Data is Obtained, for the Postal Code M4A and Neighborhoods (Victoria Village) was SUCCESSFUL.
3.
Data is Obtained, for the Postal Code M2L and Neighborhoods (Silver Hills, York Mills) was SUCCESSFUL.
4.
Data is Obtained, for the Postal Code M3C and Neighborhoods (Flemingdon Park, Don Mills South) was SUCCESSFUL.
5.
Data is Obtained, for the Postal Code M2R and Neighborhoods (Willowdale West) was SUCCESSFUL.
6.
Data is Obtained, for the Postal Code M9M and Neighborhoods (Emery, Humberlea) was SUCCESSFUL.
7.
Data is Obtained, for the Postal Code M2K and Neighborhoods (Bayview Village) was SUCCESSFUL.
8.
Data is Obtained, for the Postal Code M3M and Neighborhoods (Downsview Central) was SUCCESSFUL.
9.
Data is Obtained, for the Postal Code M9L and Neighborhoods (Humber Summit) was SUCCESSFUL.
10.
Data is Obtained, for the Post

## Saving results of Foursquare, so that we would not need to connect every time to Foursquare

In [51]:
import pickle
with open("northyork_foursquare_dataset.txt", "wb") as ny_fq:   #Pickling
    pickle.dump(northyork_foursquare_dataset, ny_fq)
print('Received Data from Internet is Saved to Computer.')

Received Data from Internet is Saved to Computer.


In [52]:
with open('northyork_foursquare_dataset.txt', 'rb') as ny_fq:
    Northyork_foursquare_dataset = pickle.load(ny_fq)
Northyork_foursquare_dataset

[{'Postal Code': 'M2J',
  'Neighborhood(s)': 'Fairview, Henry Farm, Oriole',
  'Latitude': 43.7785175,
  'Longitude': -79.3465557,
  'Crawling_result': [{'reasons': {'count': 0,
     'items': [{'summary': 'This spot is popular',
       'type': 'general',
       'reasonName': 'globalInteractionReason'}]},
    'venue': {'id': '4e848fbb5c5c9240de8e6a80',
     'name': 'The LEGO Store',
     'location': {'address': '1800 Sheppard Ave E',
      'crossStreet': 'at Don Mills Rd',
      'lat': 43.77820727238842,
      'lng': -79.34348299621146,
      'labeledLatLngs': [{'label': 'display',
        'lat': 43.77820727238842,
        'lng': -79.34348299621146}],
      'distance': 249,
      'postalCode': 'M2J 5A7',
      'cc': 'CA',
      'city': 'Toronto',
      'state': 'ON',
      'country': 'Canada',
      'formattedAddress': ['1800 Sheppard Ave E (at Don Mills Rd)',
       'Toronto ON M2J 5A7',
       'Canada']},
     'categories': [{'id': '4bf58dd8d48988d1f3941735',
       'name': 'Toy / Gam

## Let's clean up the raw data that was received from Foursquare Dataset

In [53]:
# This function is created to connect to the saved list which is the received database. It will extract each venue 
# for every neighborhood inside the database

def getVenueDataset(foursquare_dataset):
    df = pd.DataFrame(columns = ['Postal Code', 'Neighborhood', 'Neighborhood Latitude', 
                                 'Neighborhood Longitude','Venue', 'Venue Summary', 'Venue Category', 'Distance'])
    # print(result_df)
    
    for neigh_dict in foursquare_dataset:
        postalcode = neigh_dict['Postal Code']; 
        neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']; 
        long = neigh_dict['Longitude']
        print('Number of Venues in Coordinate "{}" Postal Code and "{}" Negihborhood(s) is:'.format(postalcode, neigh))
        print(len(neigh_dict['Crawling_result']))
        
        for venue_dict in neigh_dict['Crawling_result']:
            summary = venue_dict['reasons']['items'][0]['summary']
            name = venue_dict['venue']['name']
            dist = venue_dict['venue']['location']['distance']
            cat =  venue_dict['venue']['categories'][0]['name']
            
            
            df = df.append({'Postal Code': postalcode, 'Neighborhood': neigh, 
                              'Neighborhood Latitude': lat, 'Neighborhood Longitude':long,
                              'Venue': name, 'Venue Summary': summary, 
                              'Venue Category': cat, 'Distance': dist}, ignore_index = True)
    
    return(df)

In [54]:
northyork_venues = getVenueDataset(Northyork_foursquare_dataset)

Number of Venues in Coordinate "M2J" Postal Code and "Fairview, Henry Farm, Oriole" Negihborhood(s) is:
43
Number of Venues in Coordinate "M4A" Postal Code and "Victoria Village" Negihborhood(s) is:
12
Number of Venues in Coordinate "M2L" Postal Code and "Silver Hills, York Mills" Negihborhood(s) is:
4
Number of Venues in Coordinate "M3C" Postal Code and "Flemingdon Park, Don Mills South" Negihborhood(s) is:
45
Number of Venues in Coordinate "M2R" Postal Code and "Willowdale West" Negihborhood(s) is:
9
Number of Venues in Coordinate "M9M" Postal Code and "Emery, Humberlea" Negihborhood(s) is:
7
Number of Venues in Coordinate "M2K" Postal Code and "Bayview Village" Negihborhood(s) is:
13
Number of Venues in Coordinate "M3M" Postal Code and "Downsview Central" Negihborhood(s) is:
4
Number of Venues in Coordinate "M9L" Postal Code and "Humber Summit" Negihborhood(s) is:
10
Number of Venues in Coordinate "M5M" Postal Code and "Bedford Park, Lawrence Manor East" Negihborhood(s) is:
42
Numbe

In [55]:
northyork_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,The LEGO Store,This spot is popular,Toy / Game Store,249
1,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,CF Fairview Mall,This spot is popular,Shopping Mall,198
2,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Apple Fairview,This spot is popular,Electronics Store,249
3,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Hero Certified Burgers,This spot is popular,Burger Joint,208
4,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Shoppers Drug Mart,This spot is popular,Pharmacy,154


In [56]:
northyork_venues.shape

(613, 8)

In [57]:
northyork_venues.tail()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
608,M3K,"CFB Toronto, Downsview East",43.737473,-79.464763,TFC Kia Training Ground,This spot is popular,Soccer Field,868
609,M3K,"CFB Toronto, Downsview East",43.737473,-79.464763,Geek Squad,This spot is popular,Other Repair Shop,896
610,M3K,"CFB Toronto, Downsview East",43.737473,-79.464763,Sporttesting,This spot is popular,Gym,958
611,M3K,"CFB Toronto, Downsview East",43.737473,-79.464763,Liberty Love 50,This spot is popular,Business Service,983
612,M3K,"CFB Toronto, Downsview East",43.737473,-79.464763,Downsview Park Ultimate Frisbee,This spot is popular,Athletics & Sports,989


### To check the names of neighborhood

In [58]:
northyork_venues['Neighborhood'].unique()

array(['Fairview, Henry Farm, Oriole', 'Victoria Village',
       'Silver Hills, York Mills', 'Flemingdon Park, Don Mills South',
       'Willowdale West', 'Emery, Humberlea', 'Bayview Village',
       'Downsview Central', 'Humber Summit',
       'Bedford Park, Lawrence Manor East', 'Willowdale South',
       'Don Mills North', 'Downsview, North Park, Upwood Park',
       'Bathurst Manor, Downsview North, Wilson Heights',
       'Northwood Park, York University',
       'Lawrence Heights, Lawrence Manor', 'Downsview Northwest',
       'Glencairn', 'Hillcrest Village', 'Parkwoods', 'York Mills West',
       'Newtonbrook, Willowdale', 'Downsview West',
       'CFB Toronto, Downsview East'], dtype=object)

In [59]:
print('The number of neighborhood in North York is: '+str(len(northyork_venues['Neighborhood'].unique())))

The number of neighborhood in North York is: 24


## Summary of the neighborhood inside `North York`

In [60]:
northyork_summary = northyork_venues.groupby('Neighborhood').count()[['Venue']]
northyork_summary.head(24)

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
"Bathurst Manor, Downsview North, Wilson Heights",26
Bayview Village,13
"Bedford Park, Lawrence Manor East",42
"CFB Toronto, Downsview East",21
Don Mills North,29
Downsview Central,4
Downsview Northwest,31
Downsview West,8
"Downsview, North Park, Upwood Park",12
"Emery, Humberlea",7


In [61]:
print('There are {} nuique venue summary'.format(len(northyork_venues['Venue Category'].unique())))
print('Here is the list of different categories:\n', list(northyork_venues['Venue Category'].unique()))

There are 150 nuique venue summary
Here is the list of different categories:
 ['Toy / Game Store', 'Shopping Mall', 'Electronics Store', 'Burger Joint', 'Pharmacy', 'Bakery', 'Tea Room', 'Movie Theater', 'American Restaurant', 'Candy Store', 'Department Store', 'Salon / Barbershop', 'Juice Bar', 'Coffee Shop', 'Fast Food Restaurant', 'Clothing Store', 'Smoothie Shop', 'Japanese Restaurant', 'Theater', 'Bank', 'Caribbean Restaurant', 'Food Court', 'Restaurant', 'Supermarket', 'Cosmetics Shop', 'Video Game Store', 'Sporting Goods Shop', 'Liquor Store', 'Sandwich Place', 'Beer Store', 'Fried Chicken Joint', 'Pizza Place', 'Hockey Arena', 'Portuguese Restaurant', 'Intersection', 'Park', "Men's Store", 'Lounge', 'Golf Course', 'Athletics & Sports', 'Gym / Fitness Center', 'Pool', 'Discount Store', 'Italian Restaurant', 'History Museum', 'Gym', 'General Entertainment', 'Middle Eastern Restaurant', 'Bike Shop', 'Bar', "Women's Store", 'Grocery Store', 'Ice Cream Shop', 'Dim Sum Restaurant', '

### One-hot encoding the `Categroies` column into every unique categorical feature.

In [62]:
northyork_onehot = pd.get_dummies(data = northyork_venues, drop_first  = False, 
                                  prefix = "", prefix_sep = "", columns = ['Venue Category'])
northyork_onehot.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Beach,Beer Store,Bike Shop,Boutique,Bowling Alley,Breakfast Spot,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Stop,Business Service,Butcher,Cafeteria,Café,Candy Store,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop,Creperie,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Fireworks Store,Fish & Chips Shop,Food & Drink Shop,Food Court,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Halal Restaurant,Hardware Store,History Museum,Hockey Arena,Hookah Bar,Hot Dog Joint,Hotel,Housing Development,Ice Cream Shop,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Kitchen Supply Store,Korean Restaurant,Latin American Restaurant,Laundry Service,Liquor Store,Lounge,Massage Studio,Mediterranean Restaurant,Men's Store,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Optical Shop,Other Repair Shop,Paper / Office Supplies Store,Park,Pet Store,Pharmacy,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Recreation Center,Rental Car Location,Residential Building (Apartment / Condo),Restaurant,Road,Salad Place,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shipping Store,Shoe Store,Shop & Service,Shopping Mall,Skate Park,Skating Rink,Ski Area,Ski Chalet,Smoothie Shop,Snack Place,Soccer Field,Sporting Goods Shop,Sports Bar,Sports Club,Steakhouse,Storage Facility,Supermarket,Sushi Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,The LEGO Store,This spot is popular,249,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,CF Fairview Mall,This spot is popular,198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Apple Fairview,This spot is popular,249,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Hero Certified Burgers,This spot is popular,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,M2J,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,Shoppers Drug Mart,This spot is popular,154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Selecting the groceries related features manually

In [64]:
list_of_features = [
 'Neighborhood',
 'Neighborhood Latitude',
 'Neighborhood Longitude',
 'Accessories Store',
 'American Restaurant',
 'Asian Restaurant',
 'Breakfast Spot',
 'Bubble Tea Shop',
 'Burger Joint',
 'Cafeteria',
 'Caribbean Restaurant',
 'Cheese Shop',
 'Chinese Restaurant',
 'Coffee Shop',
 'Comfort Food Restaurant',
 'Dessert Shop',
 'Dim Sum Restaurant',
 'Eastern European Restaurant',
 'Empanada Restaurant',
 'Falafel Restaurant',
 'Fast Food Restaurant',
 'French Restaurant',
 'Fried Chicken Joint',
 'Greek Restaurant',
 'Grocery Store',
 'Halal Restaurant',
 'Hot Dog Joint',
 'Indian Restaurant',
 'Indonesian Restaurant',
 'Italian Restaurant',
 'Japanese Restaurant',
 'Kitchen Supply Store',
 'Korean Restaurant',
 'Latin American Restaurant',
 'Mediterranean Restaurant',
 'Middle Eastern Restaurant',
 'Portuguese Restaurant',
 'Ramen Restaurant',
 'Restaurant',
 'Salad Place',
 'Sandwich Place',
 'Seafood Restaurant',
 'Sushi Restaurant',
 'Thai Restaurant',
 'Turkish Restaurant',
 'Vietnamese Restaurant']

## Updating the One-hot Encoded DataFrame and
## Grouping the Data by Neighborhoods

In [65]:
northyork_onehot = northyork_onehot[list_of_features].drop(
    columns = ['Neighborhood Latitude', 'Neighborhood Longitude']).groupby(
    'Neighborhood').sum()


northyork_onehot.head()

Unnamed: 0_level_0,Accessories Store,American Restaurant,Asian Restaurant,Breakfast Spot,Bubble Tea Shop,Burger Joint,Cafeteria,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Coffee Shop,Comfort Food Restaurant,Dessert Shop,Dim Sum Restaurant,Eastern European Restaurant,Empanada Restaurant,Falafel Restaurant,Fast Food Restaurant,French Restaurant,Fried Chicken Joint,Greek Restaurant,Grocery Store,Halal Restaurant,Hot Dog Joint,Indian Restaurant,Indonesian Restaurant,Italian Restaurant,Japanese Restaurant,Kitchen Supply Store,Korean Restaurant,Latin American Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Portuguese Restaurant,Ramen Restaurant,Restaurant,Salad Place,Sandwich Place,Seafood Restaurant,Sushi Restaurant,Thai Restaurant,Turkish Restaurant,Vietnamese Restaurant
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
"Bathurst Manor, Downsview North, Wilson Heights",0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0
Bayview Village,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"Bedford Park, Lawrence Manor East",0,1,0,0,0,0,0,0,0,0,3,1,0,0,0,0,0,3,0,0,1,1,0,0,1,0,3,1,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0
"CFB Toronto, Downsview East",0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,2,1
Don Mills North,0,0,1,1,0,2,1,1,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0


## Integrating Different Restaurants and Different Joints
### (Assuming Different Resaturants Use the Same Raw Groceries)
### This Assumption is made for simplicity and due to not having very large dataset about neighborhoods.

In [66]:
feat_name_list = list(northyork_onehot.columns)
restaurant_list = []


for counter, value in enumerate(feat_name_list):
    if value.find('Restaurant') != (-1):
        restaurant_list.append(value)
        
northyork_onehot['Total Restaurants'] = northyork_onehot[restaurant_list].sum(axis = 1)
northyork_onehot = northyork_onehot.drop(columns = restaurant_list)


feat_name_list = list(northyork_onehot.columns)
joint_list = []


for counter, value in enumerate(feat_name_list):
    if value.find('Joint') != (-1):
        joint_list.append(value)
        
northyork_onehot['Total Joints'] = northyork_onehot[joint_list].sum(axis = 1)
northyork_onehot = northyork_onehot.drop(columns = joint_list)

In [67]:
northyork_onehot

Unnamed: 0_level_0,Accessories Store,Breakfast Spot,Bubble Tea Shop,Cafeteria,Cheese Shop,Coffee Shop,Dessert Shop,Grocery Store,Kitchen Supply Store,Salad Place,Sandwich Place,Total Restaurants,Total Joints
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Bathurst Manor, Downsview North, Wilson Heights",0,0,0,0,0,2,0,0,0,0,1,4,1
Bayview Village,0,0,0,0,0,0,0,2,0,0,0,4,0
"Bedford Park, Lawrence Manor East",0,0,0,0,0,3,0,1,0,0,1,14,0
"CFB Toronto, Downsview East",0,0,0,0,0,3,0,0,0,0,1,6,0
Don Mills North,0,1,0,1,0,3,0,0,0,1,1,8,2
Downsview Central,0,0,0,0,0,0,0,0,0,0,0,3,0
Downsview Northwest,0,0,0,0,0,2,0,2,1,0,1,7,1
Downsview West,0,0,0,0,0,1,0,1,0,0,0,1,0
"Downsview, North Park, Upwood Park",0,0,0,0,0,3,0,0,0,0,1,3,0
"Emery, Humberlea",0,0,0,0,0,0,0,0,0,0,0,0,0


# Run k-means to Cluster Neighborhoods into 5 Clusters

In [68]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(northyork_onehot)

# Showing Centers of Each Cluster

In [70]:
means_df = pd.DataFrame(kmeans.cluster_centers_)
means_df.columns = northyork_onehot.columns
means_df.index = ['G1','G2','G3','G4','G5']
means_df['Total Sum'] = means_df.sum(axis = 1)
means_df.sort_values(axis = 0, by = ['Total Sum'], ascending=False)

Unnamed: 0,Accessories Store,Breakfast Spot,Bubble Tea Shop,Cafeteria,Cheese Shop,Coffee Shop,Dessert Shop,Grocery Store,Kitchen Supply Store,Salad Place,Sandwich Place,Total Restaurants,Total Joints,Total Sum
G2,0.0,0.0,6.0,0.0,0.0,7.0,2.0,1.0,0.0,0.0,4.0,39.0,2.0,61.0
G5,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,1.0,14.5,0.5,20.0
G3,0.333333,0.0,0.0,0.0,0.333333,2.333333,1.0,2.0,0.0,0.0,0.666667,10.333333,1.666667,18.666667
G1,0.0,0.2,0.0,0.2,0.0,3.0,0.0,0.4,0.2,0.2,1.4,7.0,1.0,13.6
G4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.538462,0.0,0.0,0.230769,2.076923,0.076923,3.923077


In [73]:
kmeans.labels_

array([3, 3, 4, 0, 0, 3, 0, 3, 3, 3, 0, 4, 2, 3, 3, 2, 2, 0, 3, 3, 3, 1,
       3, 3])

# Result:
## Best Group is G2;
## Second Best Group is G5;
## Third Best Group is G3;
## Inserting "kmeans.labels_" into the Original `North York` DataFrame
## Finding the Corresponding Group for Each Neighborhood.

In [76]:
neigh_summary = pd.DataFrame([northyork.index, 1+kmeans.labels_]).T
neigh_summary.columns = ['Neighborhood', 'Group']
neigh_summary

Unnamed: 0,Neighborhood,Group
0,"Bathurst Manor, Downsview North, Wilson Heights",4
1,Bayview Village,4
2,"Bedford Park, Lawrence Manor East",5
3,"CFB Toronto, Downsview East",1
4,Don Mills North,1
5,Downsview Central,4
6,Downsview Northwest,1
7,Downsview West,4
8,"Downsview, North Park, Upwood Park",4
9,"Emery, Humberlea",4


# Best Neighborhood

In [77]:
neigh_summary[neigh_summary['Group']==2]

Unnamed: 0,Neighborhood,Group
21,Willowdale South,2


# Second Best Neighborhood

In [78]:
neigh_summary[neigh_summary['Group']==5]

Unnamed: 0,Neighborhood,Group
2,"Bedford Park, Lawrence Manor East",5
11,"Flemingdon Park, Don Mills South",5


# Third Best Neighborhood

In [79]:
neigh_summary[neigh_summary['Group']==3]

Unnamed: 0,Neighborhood,Group
12,Glencairn,3
15,"Lawrence Heights, Lawrence Manor",3
16,"Newtonbrook, Willowdale",3
