#### Author: Piaotian Jin
#### Date: June 13, 2019
#### Description: I used K-Means clustering to group neighborhoods of New York City, Toronto and Paris. I used Foursquare location data to extract venue information about each neighborhood.

#### Data:

The dataset of New York City is downloaded from this website: 
https://geo.nyu.edu/catalog/nyu_2451_34572. This dataset contains the 5 boroughs of New York City and the neighborhoods that exist in each borough, as well as the latitude and longitude coordinates of each neighborhood.

The neighborhoods data of Toronto is scraped from the following Wikipedia page,
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M. Then the neighborhoods data is merged with the geographical coordinates data based on postal code: 
http://cocl.us/Geospatial_data.

The neighborhoods data of Paris is scraped from the following Wikipedia page,
https://en.wikipedia.org/wiki/Arrondissements_of_Paris. Then I used the geopy Python package (its Nominatim geocoder) to get the coordinates of each neighborhood.

### 1. Import Libraries

In [28]:
from urllib import request
import bs4 as bs
import json 
import pandas as pd
import numpy as np
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors

### 2. Import and Format Paris data

In [5]:
# Scrape data from Wikipedia page (the response is closed once it has been parsed)
with request.urlopen("https://en.wikipedia.org/wiki/Arrondissements_of_Paris") as PRpage:
    PRsoup = bs.BeautifulSoup(PRpage, 'lxml')

# Format data in dataframe: one record per table row, skipping the header row.
# Records are collected in a list and turned into a frame once, rather than
# growing the frame with DataFrame.append on every row.
column_names = ['City', 'Neighborhood', 'Latitude', 'Longitude']
PR_rows = PRsoup.find("table", {"class": "wikitable sortable"}).find_all('tr')
PR_records = []
for tr in PR_rows[1:]:
    tds = tr.find_all('td')
    PR_records.append({
        'City': 'Paris',
        'Neighborhood': tds[1].text,
    })
PR_nbhs = pd.DataFrame(PR_records, columns=column_names)

# Get latitude and longitude for every neighborhood.
# A single geocoder instance is reused for all lookups.
geolocator = Nominatim(user_agent="PRloc")
PR_latitudes = []
PR_longitudes = []
for neighborhood in PR_nbhs['Neighborhood']:
    address = neighborhood + ', Paris'
    location = geolocator.geocode(address)
    if location is None:
        # Fail with the offending address instead of an AttributeError on None
        raise ValueError('Could not geocode address: {!r}'.format(address))
    PR_latitudes.append(location.latitude)
    PR_longitudes.append(location.longitude)
PR_nbhs['Latitude'] = PR_latitudes
PR_nbhs['Longitude'] = PR_longitudes

print(PR_nbhs.shape)
PR_nbhs.head()

(20, 4)


Unnamed: 0,City,Neighborhood,Latitude,Longitude
0,Paris,Louvre,48.8611,2.33803
1,Paris,Bourse,48.8677,2.34312
2,Paris,Temple,48.8627,2.35868
3,Paris,Hôtel-de-Ville,48.8564,2.35253
4,Paris,Panthéon,48.8462,2.34608


### 3. Import and Format Toronto data

In [7]:
# Scrape data from Wikipedia page (the response is closed once it has been parsed)
with request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") as TRpage:
    TRsoup = bs.BeautifulSoup(TRpage, 'lxml')

# Format data in dataframe: one record per table row, skipping the header row.
# The frame is built with the Toronto column list (not the Paris one) and in a
# single step instead of repeated DataFrame.append calls.
AllTR_column_names = ['City', 'PostalCode', 'Borough', 'Neighborhood']
TR_rows = TRsoup.find("table", {"class": "wikitable sortable"}).find_all('tr')
TR_records = []
for tr in TR_rows[1:]:
    tds = tr.find_all('td')
    TR_records.append({
        'City': "Toronto",
        # str() turns the BeautifulSoup node into a plain string for grouping/merging
        'PostalCode': str(tds[0].contents[0]),
        'Borough': tds[1].text,
        'Neighborhood': tds[2].text.replace("\n", ""),
    })
AllTR_nbhs = pd.DataFrame(TR_records, columns=AllTR_column_names)

# Drop rows without a borough, then collapse each postal code to one row
# whose neighborhood names are joined with commas
AllTR_nbhs = AllTR_nbhs[AllTR_nbhs.Borough != 'Not assigned']
postalcodes = AllTR_nbhs.groupby(['City','PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()

# Where the neighborhood is 'Not assigned', fall back to the borough name
unnamed = postalcodes['Neighborhood'] == 'Not assigned'
postalcodes.loc[unnamed, 'Neighborhood'] = postalcodes.loc[unnamed, 'Borough']

# Read latitude and longitude csv, merge to dataframe on postal code
TRlatlong = pd.read_csv('https://cocl.us/Geospatial_data')
TRlatlong.columns = ['PostalCode', 'Latitude', 'Longitude']
postalcodes_latlong = postalcodes.merge(TRlatlong, on = 'PostalCode')

# Keep only rows whose borough name contains 'Toronto'
TR_nbhs = postalcodes_latlong[postalcodes_latlong['Borough'].str.contains('Toronto')].reset_index(drop=True)
TR_nbhs = TR_nbhs.drop(['Borough', 'PostalCode'], axis = 1)

print(TR_nbhs.shape)
TR_nbhs.head()

(38, 4)


Unnamed: 0,City,Neighborhood,Latitude,Longitude
0,Toronto,The Beaches,43.676357,-79.293031
1,Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,Toronto,Studio District,43.659526,-79.340923
4,Toronto,Lawrence Park,43.72802,-79.38879


### 4. Import and Format New York data

In [14]:
# Get data
with request.urlopen("https://geo.nyu.edu/download/file/nyu-2451-34572-geojson.json") as url:
    newyork_data = json.loads(url.read().decode())

# Format data in dataframe: one record per feature, built in a single step
# instead of repeated DataFrame.append calls
NY_data = newyork_data['features']
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
NY_records = []
for data in NY_data:
    borough = data['properties']['borough']
    neighborhood_name = data['properties']['name']
    # the coordinate pair is read as [longitude, latitude]
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    NY_records.append({'Borough': borough,
                       'Neighborhood': neighborhood_name,
                       'Latitude': neighborhood_lat,
                       'Longitude': neighborhood_lon})
AllNY_nbhs = pd.DataFrame(NY_records, columns=column_names)

# Extract Manhattan data; the Borough column is renamed to City and
# overwritten so the frame has the same layout as the other cities
MH_nbhs = AllNY_nbhs[AllNY_nbhs['Borough'] == 'Manhattan'].reset_index(drop=True)
MH_nbhs.columns = ['City', 'Neighborhood', 'Latitude', 'Longitude']
MH_nbhs['City'] = 'New York'

print(MH_nbhs.shape)
MH_nbhs.head()

(40, 4)


Unnamed: 0,City,Neighborhood,Latitude,Longitude
0,New York,Marble Hill,40.876551,-73.91066
1,New York,Chinatown,40.715618,-73.994279
2,New York,Washington Heights,40.851903,-73.9369
3,New York,Inwood,40.867684,-73.92121
4,New York,Hamilton Heights,40.823604,-73.949688


### 5. Combine Data

In [84]:
# Stack the three per-city frames (Paris, Toronto, Manhattan) into one table
# with a fresh 0..n-1 index
city_frames = [PR_nbhs, TR_nbhs, MH_nbhs]
nbhs = pd.concat(city_frames, axis=0, ignore_index=True, sort=False)
print(nbhs.shape)
nbhs.head()

(98, 4)


Unnamed: 0,City,Neighborhood,Latitude,Longitude
0,Paris,Louvre,48.8611,2.33803
1,Paris,Bourse,48.8677,2.34312
2,Paris,Temple,48.8627,2.35868
3,Paris,Hôtel-de-Ville,48.8564,2.35253
4,Paris,Panthéon,48.8462,2.34608


### 6. Extract Venues Information

In [49]:
# define the function to get nearby venues
def getNearbyVenues(names, city, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each neighborhood.

    Parameters
    ----------
    names : iterable
        Neighborhood names.
    city : iterable
        City name for each neighborhood, parallel to ``names``.
    latitudes, longitudes : iterable
        Coordinates of each neighborhood, parallel to ``names``.
    radius : int, optional
        Value sent as the API's ``radius`` parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'City', 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    venue_columns = ['City',
                     'Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    # city_name (not `city`) so the loop does not shadow the parameter
    for name, city_name, lat, lng in zip(names, city, latitudes, longitudes):
        print(city_name, name)

        # make the GET request; requests builds and encodes the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                city_name,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue with an empty category list gets no category
                categories[0]['name'] if categories else None,
            ))

    # passing columns here keeps the layout even when venue_rows is empty
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [53]:
# LIMIT is read as a global by getNearbyVenues and sent as the request's limit
LIMIT = 100

# Fetch the venues around every neighborhood of the combined table
venues = getNearbyVenues(
    nbhs['Neighborhood'],
    nbhs['City'],
    nbhs['Latitude'],
    nbhs['Longitude'],
)

Paris Louvre
Paris Bourse
Paris Temple
Paris Hôtel-de-Ville
Paris Panthéon
Paris Luxembourg
Paris Palais-Bourbon
Paris Élysée
Paris Opéra
Paris Entrepôt
Paris Popincourt
Paris Reuilly
Paris Gobelins
Paris Observatoire
Paris Vaugirard
Paris Passy
Paris Batignolles-Monceau
Paris Butte-Montmartre
Paris Buttes-Chaumont
Paris Ménilmontant
Toronto The Beaches
Toronto The Danforth West,Riverdale
Toronto The Beaches West,India Bazaar
Toronto Studio District
Toronto Lawrence Park
Toronto Davisville North
Toronto North Toronto West
Toronto Davisville
Toronto Moore Park,Summerhill East
Toronto Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Toronto Rosedale
Toronto Cabbagetown,St. James Town
Toronto Church and Wellesley
Toronto Harbourfront,Regent Park
Toronto Ryerson,Garden District
Toronto St. James Town
Toronto Berczy Park
Toronto Central Bay Street
Toronto Adelaide,King,Richmond
Toronto Harbourfront East,Toronto Islands,Union Station
Toronto Design Exchange,Toronto Dominion Cent

In [77]:
# One-hot encode the venue categories: one indicator column per category,
# named after the category itself (no prefix)
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the grouping keys to the indicator table.
# NOTE(review): a venue category literally named 'Neighborhood' or 'City'
# would be overwritten by these assignments - confirm none occurs in the data.
onehot['Neighborhood'] = venues['Neighborhood']
onehot['City'] = venues['City']

# Reorder so the keys lead: City, Neighborhood, then every category column
key_columns = ['City', 'Neighborhood']
category_columns = [name for name in onehot.columns if name not in key_columns]
onehot = onehot[key_columns + category_columns]

# Average the indicators per neighborhood, keeping first-appearance order
grouped = onehot.groupby(key_columns, sort=False).mean().reset_index()
print(grouped.shape)
grouped.head()

(98, 405)


Unnamed: 0,City,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Volleyball Court,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Paris,Louvre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
1,Paris,Bourse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.06,0.01,0.0,0.02,0.0
2,Paris,Temple,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0
3,Paris,Hôtel-de-Ville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01,0.0
4,Paris,Panthéon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0


### 7. Get Top 10 Most Common Venues

In [71]:
# Define the function to get most common venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first two entries of ``row`` are skipped; the remaining entries are
    ordered from largest to smallest value and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked_labels = row.iloc[2:].sort_values(ascending=False).index
    return ranked_labels.values[:num_top_venues]

In [78]:
num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City', 'Neighborhood']
for rank in range(1, num_top_venues + 1):
    # explicit bounds check instead of a bare except around the list lookup
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe keyed like `grouped`
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted[['City', 'Neighborhood']] = grouped[['City','Neighborhood']]

# fill each row with that neighborhood's most frequent venue categories
for ind in range(grouped.shape[0]):
    venues_sorted.iloc[ind, 2:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

print(venues_sorted.shape)
venues_sorted.head()

(98, 12)


Unnamed: 0,City,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Paris,Louvre,French Restaurant,Café,Hotel,Bar,Coffee Shop,Plaza,Cosmetics Shop,Historic Site,Exhibit,Italian Restaurant
1,Paris,Bourse,French Restaurant,Wine Bar,Cocktail Bar,Bistro,Italian Restaurant,Bar,Hotel,Bakery,Burger Joint,Salad Place
2,Paris,Temple,French Restaurant,Burger Joint,Japanese Restaurant,Coffee Shop,Italian Restaurant,Restaurant,Gourmet Shop,Bakery,Hotel,Sandwich Place
3,Paris,Hôtel-de-Ville,French Restaurant,Ice Cream Shop,Wine Bar,Pastry Shop,Plaza,Cosmetics Shop,Hotel,Cocktail Bar,Art Gallery,Furniture / Home Store
4,Paris,Panthéon,French Restaurant,Hotel,Bar,Indie Movie Theater,Bakery,Italian Restaurant,Café,Ice Cream Shop,Pub,Creperie


### 8. Cluster Neighborhoods

In [75]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# run k-means clustering on the category-frequency columns only;
# the key columns are dropped by keyword because recent pandas releases
# no longer accept the axis as a positional argument
grouped_clustering = grouped.drop(columns=['Neighborhood', 'City'])
grouped_clustering = grouped_clustering.fillna(0)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [89]:
# add clustering labels
# kmeans was fit on the rows of `grouped`, and venues_sorted was filled row by
# row from `grouped`, so the labels are attached there, next to the
# (City, Neighborhood) key each one belongs to. A copy is used so neither
# venues_sorted nor nbhs is modified and the cell can be re-run.
labeled_venues = venues_sorted.copy()
labeled_venues.insert(2, 'Cluster Labels', kmeans.labels_)

# merge the labels and top venues onto the coordinates of each neighborhood;
# matching on both City and Neighborhood keeps equal neighborhood names in
# different cities apart. The inner join leaves out neighborhoods that have
# no row in venues_sorted and therefore no cluster label.
final_labeled = nbhs.merge(labeled_venues, on=['City', 'Neighborhood'], how='inner')

print(final_labeled.shape)
final_labeled.head()

(98, 15)


Unnamed: 0,City,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Paris,Louvre,48.8611,2.33803,3,French Restaurant,Café,Hotel,Bar,Coffee Shop,Plaza,Cosmetics Shop,Historic Site,Exhibit,Italian Restaurant
1,Paris,Bourse,48.8677,2.34312,3,French Restaurant,Wine Bar,Cocktail Bar,Bistro,Italian Restaurant,Bar,Hotel,Bakery,Burger Joint,Salad Place
2,Paris,Temple,48.8627,2.35868,3,French Restaurant,Burger Joint,Japanese Restaurant,Coffee Shop,Italian Restaurant,Restaurant,Gourmet Shop,Bakery,Hotel,Sandwich Place
3,Paris,Hôtel-de-Ville,48.8564,2.35253,3,French Restaurant,Ice Cream Shop,Wine Bar,Pastry Shop,Plaza,Cosmetics Shop,Hotel,Cocktail Bar,Art Gallery,Furniture / Home Store
4,Paris,Panthéon,48.8462,2.34608,3,French Restaurant,Hotel,Bar,Indie Movie Theater,Bakery,Italian Restaurant,Café,Ice Cream Shop,Pub,Creperie


### 9. Visualize Results

In [90]:
# Paris: map centred on the city
Plat = 48.8566
Plong = 2.3522
map_clusters = folium.Map(location=[Plat, Plong], zoom_start=12)

# colour scheme: one hex colour per cluster, evenly spaced on the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per labelled neighborhood
marker_rows = zip(final_labeled['Latitude'],
                  final_labeled['Longitude'],
                  final_labeled['Neighborhood'],
                  final_labeled['Cluster Labels'])
for lat, lon, poi, cluster in marker_rows:
    # index cluster-1: label 0 wraps around to the last colour of the list
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [91]:
# New York: map centred on the city
NYlat = 40.7831
NYlong = -73.9712
map_clusters = folium.Map(location=[NYlat, NYlong], zoom_start=12)

# colour scheme: one hex colour per cluster, evenly spaced on the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per labelled neighborhood
marker_rows = zip(final_labeled['Latitude'],
                  final_labeled['Longitude'],
                  final_labeled['Neighborhood'],
                  final_labeled['Cluster Labels'])
for lat, lon, poi, cluster in marker_rows:
    # index cluster-1: label 0 wraps around to the last colour of the list
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [92]:
# Toronto: map centred on the city
TRlat = 43.6532
TRlong = -79.3832
map_clusters = folium.Map(location=[TRlat, TRlong], zoom_start=12)

# colour scheme: one hex colour per cluster, evenly spaced on the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per labelled neighborhood
marker_rows = zip(final_labeled['Latitude'],
                  final_labeled['Longitude'],
                  final_labeled['Neighborhood'],
                  final_labeled['Cluster Labels'])
for lat, lon, poi, cluster in marker_rows:
    # index cluster-1: label 0 wraps around to the last colour of the list
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters