In [None]:
# Import libraries needed
# !pip install geopandas
# !pip install geopy    # Done @25/4
# !conda install -c conda-forge --no-deps folium=0.10.0 --yes
import json
import os
import warnings
from collections import OrderedDict
from io import StringIO

import geopy
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from geopy.geocoders import Nominatim
# !pip install folium   # Done @25/4
import folium 
# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
# !pip install matplotlib_venn   # Done @25/4
from matplotlib_venn import venn2
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

In [None]:
# New York data to read
url = 'https://cocl.us/new_york_dataset'
# A timeout keeps the cell from hanging forever, and raise_for_status() surfaces
# HTTP errors here instead of as a confusing JSON/KeyError further down.
ny_response = requests.get(url, timeout=30)
ny_response.raise_for_status()

# relevant information is in 'features' key; ny_data is bound once, directly to
# that list, so the name never refers to two different kinds of object
ny_data = ny_response.json()['features']
ny_data[0]

In [None]:
# Store the information from the json file into a dataframe
tmp_columns = ['Borough', 'Neighbourhood', 'Latitude', 'Longitude']

# Collect one record per feature and build the frame once: DataFrame.append in a
# loop copies the whole frame on every iteration and no longer exists in pandas 2.
ny_records = []
for data in ny_data:
    lat_lon = data['geometry']['coordinates']  # list ordered as [longitude, latitude]
    ny_records.append({'Borough': data['properties']['borough'],
                       'Neighbourhood': data['properties']['name'],
                       'Latitude': lat_lon[1],
                       'Longitude': lat_lon[0]})

# columns= fixes the column order and keeps the frame well-formed even with no records
ny_df = pd.DataFrame(ny_records, columns=tmp_columns)
ny_df.head()

In [None]:
# Toronto data - Scraping and Cleaning

# Wikipedia url for our Toronto neighborhoods data; an earlier version for the correct data structure
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=942851379'
# Beautifulsoup to download html data
req = requests.get(url, timeout=30)
req.raise_for_status()  # fail here rather than parse an error page as if it were the table
soup = BeautifulSoup(req.content,'lxml')
table = soup.find_all('table')[0]
# read_html is given a file-like object: passing the raw HTML text directly is deprecated
df = pd.read_html(StringIO(str(table)))
neighborhood=pd.DataFrame(df[0])

# Drop "Not Assigned" Neighborhoods
# (assign the result back instead of using inplace=True on a column selection, which
#  is not guaranteed to write through to the parent frame)
neighborhood['Neighbourhood'] = neighborhood['Neighbourhood'].replace('Not assigned', np.nan)
neighborhood = neighborhood.dropna(subset=['Neighbourhood']).reset_index(drop=True)
# Check our current data
print('Our dataframe has {} rows in total and {} with value "Not Assigned" in the Neighbourhood column'.
      format(neighborhood.shape[0], len(neighborhood[neighborhood['Neighbourhood']=='Not assigned'])))
# Check if there are "Boroughs" with "NA" values, in order to replace them with their respective Neighborhood
print('The column "Borough" has {} rows with the value "Not Assigned"'.
     format(len(neighborhood[neighborhood['Borough']=='Not assigned'])))

# Group our data by Postcode and Borough; neighbourhoods sharing both are joined into one string
neighborhood = neighborhood.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
# Getting the .csv file from the url provided in the lab
geo_coord_url = 'https://cocl.us/Geospatial_data'
geo_coord_data = pd.read_csv(geo_coord_url)

# A bit of manipulation for easier data merge
geo_coord_data.columns = ['Postcode', 'Latitude', 'Longitude']
# Merge of the two tables into a new one and see our result
# (left join: every postcode row is kept even if it has no coordinates)
Toronto_geodata = pd.merge(neighborhood, geo_coord_data, how = 'left', on = 'Postcode')
print("Our combined dataframe's shape is {}".format(Toronto_geodata.shape))
Toronto_geodata.head()

In [None]:
# Foursquare api's credentials and initial values
# The credentials are read from the environment so that they are never stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the id/secret that used to be hardcoded here were published with
# the notebook and should be treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # passed as the `limit` query parameter of each venue request

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('Foursquare credentials are not set: define the FOURSQUARE_CLIENT_ID and '
                  'FOURSQUARE_CLIENT_SECRET environment variables before requesting venues.')

In [None]:
# get_near_by_venues function statement

def get_near_by_venues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped, so one request is made per
        (name, latitude, longitude) triple.
    radius : int, default 1000
        Sent as the ``radius`` query parameter of each request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude',
               'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string instead of formatting it by hand
        params = {'client_id': CLIENT_ID, 'client_secret': CLIENT_SECRET, 'v': VERSION,
                  'll': '{},{}'.format(lat, lng), 'radius': radius, 'limit': LIMIT}
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            # a venue without any category would otherwise abort the whole run
            category = categories[0]['name'] if categories else None
            rows.append((name, lat, lng, venue['name'],
                         venue['location']['lat'], venue['location']['lng'], category))

    # passing columns= (rather than assigning them afterwards) also works for zero rows
    return pd.DataFrame(rows, columns=columns)

In [None]:
# explore_borough function statement

def explore_borough(df, borough):
    """Return the rows of one borough with per-neighbourhood venue-category frequencies.

    The rows of ``df`` whose 'Borough' equals ``borough`` are selected, their
    nearby venues are fetched with ``get_near_by_venues``, the venue categories
    are one-hot encoded and averaged per 'Neighbourhood', and the result is
    left-merged back onto the selected rows.
    """
    borough_rows = df.loc[df['Borough'] == borough].reset_index(drop=True)
    venues = get_near_by_venues(
        names=borough_rows['Neighbourhood'],
        latitudes=borough_rows['Latitude'],
        longitudes=borough_rows['Longitude'],
    )

    # one indicator column per venue category, named after the category itself
    dummies = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

    # attach the neighbourhood name, then rotate the last column to the front
    dummies['Neighbourhood'] = venues['Neighbourhood']
    ordered = list(dummies.columns)
    dummies = dummies[ordered[-1:] + ordered[:-1]]

    # mean of the indicators = share of each category among a neighbourhood's venues
    category_share = dummies.groupby('Neighbourhood').mean().reset_index()

    return pd.merge(borough_rows, category_share, on='Neighbourhood', how='left')

In [None]:
# Venues4Boroughs function statement - The Final Piece

def return_venues_for_boroughs(df):
    """Run ``explore_borough`` for every borough in ``df`` and stack the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Borough' column (plus whatever ``explore_borough`` reads).

    Returns
    -------
    pandas.DataFrame
        The original columns of ``df`` first, in their original order, followed
        by the remaining columns in sorted order. Cells for columns a borough
        did not produce are NaN. An empty frame with the columns of ``df`` is
        returned when ``df`` contains no borough at all.
    """
    col = list(df.columns)
    frames = []
    for i in df['Borough'].unique():
        new_df = df[df['Borough'] == i]
        print('For borough: ',i)
        frames.append(explore_borough(new_df, i))

    if not frames:
        # nothing to explore: return a well-formed empty frame instead of failing
        return pd.DataFrame(columns=col)

    # One concat instead of one per iteration. The frames are reversed because
    # each borough used to be placed in front of the accumulated result.
    all_venues = pd.concat(frames[::-1], axis = 0, ignore_index = True, sort = True)
    cols = col + [j for j in all_venues.columns if j not in col]
    return all_venues[cols]

In [None]:
# Dataset for New York

# Build the venue table for every New York borough and replace missing values with 0.
df1 = return_venues_for_boroughs(ny_df).fillna(0)
print('Data Shape is: ', df1.shape)
df1.head()

In [None]:
# Now for Toronto

# Build the venue table for every Toronto borough and replace missing values with 0.
df2 = return_venues_for_boroughs(Toronto_geodata).fillna(0)
print('Data Shape is: ', df2.shape)
df2.head()

In [None]:
# Making deep copies of our dataframes, will be used later on
# NOTE: from here on `ny_df` no longer refers to the raw neighbourhood table built
# from the JSON file but to a copy of the venue table `df1`.
ny_df = df1.copy()
# `columns=` replaces the positional axis argument (`drop('Postcode', 1)`), which
# newer pandas versions no longer accept; drop() already returns a new frame.
to_df = df2.drop(columns='Postcode')

In [None]:
# After extensive trial & fail, we found out that there were inconsistencies in the NYC data. In more detail, there
# were Neighbourhoods that have the same Name but are in different Boroughs. In order to fix this, we'll add 
# the Borough suffix for those Neighbourhoods.

# Flag every row whose name occurs more than once (keep=False marks all occurrences)
# and append ", <Borough>" to exactly those rows in a single vectorised step. This
# replaces a row-by-row scan that was quadratic, relied on a 0..n-1 index, and
# suffixed a name twice when the duplicates were in the same borough.
shared_name = ny_df['Neighbourhood'].duplicated(keep=False)
ny_df.loc[shared_name, 'Neighbourhood'] = (
    ny_df.loc[shared_name, 'Neighbourhood'] + ', ' + ny_df.loc[shared_name, 'Borough']
)

# The code cells that follow are a test case, using Cosine similarity to find the N most similar Neighbourhoods in the other city for a given Neighbourhood.
## This may not be the optimal approach, which is why we treat it as a test case: it lets us compare this method with our final one, which uses K-Means Clustering.
#### This part can easily be skipped; it is included mainly because I had the code below ready from another project and thought it worthwhile to take a look at it.

In [None]:
# Find the most common venues between the two cities
print('Leaving {} columns of New York city data\nLeaving {} columns of Toronto city data'
      .format(list(df1.columns[0:4]), list(df2.columns[0:5])))

# Everything after the leading descriptive columns printed above (4 for New York,
# 5 for Toronto) is treated as a venue-category column.
ny_categories = df1.columns[4:]
to_categories = df2.columns[5:]

# Slice once and test membership in bulk instead of re-slicing df2 on every iteration
in_both = ny_categories.isin(to_categories)
common = int(in_both.sum())
diff_in_NY = len(ny_categories) - common

print('\nNumber of common venue categories in both data are       : {}\n\
Number of different venue categories in New York city are: {}\n\
Number of different venue categories in Toronto city are : {}'.format(common, diff_in_NY,
                                                                     len(to_categories)-common))

In [None]:
# Venn diagram of the venue-category columns of both cities
vn1, vn2 = set(df1.columns[4:]), set(df2.columns[5:])

plt.figure(figsize=(6, 6))
out = venn2(
    [set(vn1), set(vn2)],
    set_labels=['New York City', 'Toronto City'],
    set_colors=('purple', 'skyblue'),
    alpha=0.5,
)

# enlarge the set names first, then the per-region counts
for text in (*out.set_labels, *out.subset_labels):
    text.set_fontsize(18)
plt.show()

In [None]:
# Drop non-common venues, for this we will be using the df sets

# Categories present in both cities, in the order they appear in the New York frame
comm_vns = [category for category in df1.columns[4:] if category in df2.columns[5:]]

# Leading descriptive columns of each city followed by the shared categories
col1 = list(df1.columns[0:4]) + comm_vns
col2 = list(df2.columns[0:5]) + comm_vns

shape_report = 'Before removing non-common venues, shape of New York: {}, and shape of Toronto is: {}'
print(shape_report.format(df1.shape, df2.shape))

df1 = df1.loc[:, col1]
df2 = df2.loc[:, col2]

shape_report = 'After removing non-common venues, shape of New York: {}, and shape of Toronto is: {}'
print(shape_report.format(df1.shape, df2.shape))

In [None]:
# Functions that uses cosine similarity to find N most similar places between NYC and TO, user defined statement

# Function to add numeric suffix
def ret_order_num(n_most_similar):
    """Return the ordinal labels '1st', '2nd', ... for 1 up to ``n_most_similar``.

    The list has exactly ``n_most_similar`` entries (empty for 0), so that one
    label is available for every requested result. Suffixes follow the English
    rule: 'st'/'nd'/'rd' for numbers ending in 1/2/3, except for those ending
    in 11, 12 or 13, which take 'th' like everything else.
    """
    special = {1: 'st', 2: 'nd', 3: 'rd'}
    labels = []
    for i in range(1, n_most_similar + 1):
        # 11th, 12th, 13th (and 111th, ...) are the exceptions to the last-digit rule
        suffix = 'th' if 11 <= i % 100 <= 13 else special.get(i % 10, 'th')
        labels.append('{}{}'.format(i, suffix))
    return labels

def _cosine_similarity(vector, matrix):
    """Return the cosine similarity between ``vector`` and every row of ``matrix``.

    The result is a 1-D float array with one score per row. A pairing that
    involves an all-zero vector scores 0 instead of NaN.
    """
    vector = np.asarray(vector, dtype=float)
    matrix = np.asarray(matrix, dtype=float)
    dots = np.matmul(matrix, vector)
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    scores = np.zeros_like(dots)
    np.divide(dots, norms, out=scores, where=norms > 0)
    return scores


def most_similar_borough(cur_city, cur_borough, cur_neigh, n_most_similar):
    """Map the neighbourhoods of the other city most similar to a given one.

    Parameters
    ----------
    cur_city : str
        'New York' (case-insensitive) looks the neighbourhood up in ``df1`` and
        searches ``df2``; any other value does the opposite.
    cur_borough, cur_neigh : str
        Borough and neighbourhood identifying the query row.
    n_most_similar : int
        Number of most similar neighbourhoods to report.

    Returns
    -------
    folium.Map
        Centred on the query neighbourhood (drawn as a circle marker), with one
        marker per similar neighbourhood. The 'Borough' and 'Neighbourhood' of
        the matches are also printed, most similar first.

    Raises
    ------
    ValueError
        If no row matches ``cur_borough`` / ``cur_neigh`` in the selected city.

    Notes
    -----
    Reads the module-level frames ``df1`` and ``df2``. Feature columns are
    taken from column 4 onward in ``df1`` and column 5 onward in ``df2``;
    this assumes both frames hold the same venue categories in the same order
    (i.e. that the non-common categories have already been dropped).
    """
    if cur_city.lower() == 'New York'.lower():
        source_df, source_start, target_df, target_start = df1, 4, df2, 5
    else:
        source_df, source_start, target_df, target_start = df2, 5, df1, 4

    # Work with row *positions*: the feature matrices are plain arrays, so an
    # index label is only a valid offset when the index happens to be 0..n-1.
    is_query = ((source_df['Borough'] == cur_borough)
                & (source_df['Neighbourhood'] == cur_neigh)).values
    positions = np.flatnonzero(is_query)
    if positions.size == 0:
        raise ValueError('No neighbourhood {!r} in borough {!r} found for city {!r}'
                         .format(cur_neigh, cur_borough, cur_city))
    position = positions[0]

    lat = source_df.iloc[position]['Latitude']
    lon = source_df.iloc[position]['Longitude']

    source_features = source_df.iloc[:, source_start:].values
    target_features = target_df.iloc[:, target_start:].values

    # Normalised dot product, i.e. an actual cosine similarity: an unnormalised
    # dot product favours rows with large values over rows with a similar profile.
    similarity = _cosine_similarity(source_features[position], target_features)
    best = np.argsort(-similarity)[0:n_most_similar]
    my_brgh = target_df.iloc[best, :]

    c = ret_order_num(n_most_similar)

    current_location_popup = '{}, {}, {}'.format(cur_neigh, cur_borough, cur_city)
    loclabl = folium.Popup(current_location_popup, parse_html=True)

    my_map = folium.Map(location = [lat, lon], zoom_start = 6)
    folium.CircleMarker([lat, lon], color = 'red', radius = 5,
                        popup = loclabl, fill_color = '#3186cc', fill_opacity = 1,
                        fill = True, tooltip = 'current location').add_to(my_map)

    # add markers to map (separate names so the query coordinates are not shadowed)
    for m_lat, m_lng, m_name, priority in zip(my_brgh['Latitude'], my_brgh['Longitude'],
                                              my_brgh['Neighbourhood'], c):
        label = folium.Popup(m_name, parse_html=True)
        folium.Marker([m_lat, m_lng], radius = 5, popup=label, color='blue',
                      tooltip = priority, parse_html=False).add_to(my_map)

    # report the arguments actually passed in, not notebook-level globals
    print('Using Cosine Similarity, we found out that the {} most similar Neighbourhood to:'
          .format(n_most_similar), cur_neigh, ",", cur_borough,
          'are the following, \nin descending order:')

    cos_top10 = my_brgh[['Borough', 'Neighbourhood']]
    print(cos_top10)
    return my_map

In [None]:
# Query neighbourhood for the cosine-similarity test case
Cos_City, Cos_Borough, Cos_Neigh = 'New York', "Manhattan", 'East Village'

# The returned map is the cell's last expression, so the notebook displays it
most_similar_borough(
    cur_city=Cos_City,
    cur_borough=Cos_Borough,
    cur_neigh=Cos_Neigh,
    n_most_similar=10,
)

# Returning to the main project at hand, the code cells above refer to an alternate methodology, using Cosine Similarity

In [None]:
# Here, we define a function in order to return the first N most common venues
def return_most_common_venues(row, start_col, n_top_cat):
    """Return the labels of the ``n_top_cat`` largest entries of ``row``.

    Only the entries from position ``start_col`` onward are considered. They
    are ranked by value in descending order and the index labels of the first
    ``n_top_cat`` of them are returned as an array.
    """
    ranked = row.iloc[start_col:].sort_values(ascending=False)
    return ranked.index.values[:n_top_cat]

In [None]:
# create columns according to number of top venues
n_top_cat = 10
indicators = ['st', 'nd', 'rd']
columns = ['Neighbourhood']

for ind in np.arange(n_top_cat):
    # explicit bounds check instead of a bare `except:` that would hide any error
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
nyc_top_cat = pd.DataFrame(columns=columns)
nyc_top_cat['Neighbourhood'] = ny_df['Neighbourhood']

# The first four columns of ny_df (Borough, Neighbourhood, Latitude, Longitude) are
# descriptive, so the venue categories start at position 4. Starting at 3 would let
# 'Longitude' be ranked as if it were a venue category.
first_venue_col = 4
for ind in np.arange(ny_df.shape[0]):
    nyc_top_cat.iloc[ind, 1:] = return_most_common_venues(ny_df.iloc[ind, :], first_venue_col, n_top_cat)

nyc_top_cat.head()

## Clustering NYC
Now we apply K-Means Clustering to our NYC dataframe, which holds the relative frequency of each venue category per neighbourhood

In [None]:
# set number of clusters
kclusters = 5

# Keep only the venue-category columns as features. drop() must not be called with
# inplace=True here: it would then return None (leaving nothing to fit on) and would
# also strip the descriptive columns, including 'Neighbourhood', from ny_df itself.
nyc_clust = ny_df.drop(columns=ny_df.columns[0:4])

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nyc_clust)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# Cluster labels are not attached here; the insert below was left disabled.
# ny_df.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the top-venue columns of nyc_top_cat to the NYC frame, matching rows on the
# neighbourhood name.
nyc_merged = ny_df.join(nyc_top_cat.set_index('Neighbourhood'), on='Neighbourhood')

nyc_merged.head()

In [None]:
# Preview the first rows of ny_df.
ny_df.head()

In [None]:
# Preview the first rows of df1.
df1.head()