# Project Clustering and segmentation of the neighbourhood in the city of Toranto  

### Part 1 To import ,webscrape and obtain  dataframe

In [3]:
import pandas as pd


In [4]:
wiki_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
df= pd.read_html(wiki_link)[0] #reading wikipedia page into dataframe

In [9]:
df.columns=['PostalCode','Borough','Neighborhood'] #required columns
df=df.iloc[1:]
#cleaning data frame
df = df[df['Borough']!= 'Not assigned']
index = df.loc[df['Neighborhood']=='Not assigned'].index.values
df.loc[index, 'Neighborhood'] = df.loc[index, 'Borough']
series = df.groupby(['PostalCode']).apply(lambda x: ', '.join(x['Neighborhood'])) #groupby and concatinate strings
df2 = series.to_frame().reset_index()
df2.columns = ['PostalCode', 'Neighborhood']
df = pd.merge(df[['PostalCode','Borough']], df2, on='PostalCode', how='right')
df = df.drop_duplicates(['PostalCode'])  

df = df.reset_index(drop=True)
df.shape

(102, 3)

In [12]:
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4A,North York,Victoria Village
1,M5A,Downtown Toronto,"Regent Park, Harbourfront"
2,M6A,North York,"Lawrence Manor, Lawrence Heights"
3,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
4,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
5,M1B,Scarborough,"Malvern, Rouge"
6,M3B,North York,Don Mills
7,M4B,East York,"Parkview Hill, Woodbine Gardens"
8,M5B,Downtown Toronto,"Garden District, Ryerson"
9,M6B,North York,Glencairn


###  PART 2 : importing geocoder inorder to obtain the latitude and longitudnal values

In [13]:
! pip install --user geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 6.7MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [15]:
import geocoder # import geocoder

latitude = []
longitude = []

for borough in df['Borough']:
    g = geocoder.osm('{}, Toronto, Ontario'.format(borough))
    latitude.append(g.osm['y'])
    longitude.append(g.osm['x'])
    
df3 = pd.DataFrame({'PostalCode': df['PostalCode'], 'Latitude': latitude}) 
df4 = pd.DataFrame({'PostalCode': df['PostalCode'], 'Longitude': longitude})
df_new = pd.merge(df, df3, on = 'PostalCode')
df_new = pd.merge(df_new, df4, on = 'PostalCode')
df_new.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4A,North York,Victoria Village,43.754326,-79.449117
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.656322,-79.380916
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.754326,-79.449117
3,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.656322,-79.380916
4,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.643556,-79.565633
5,M1B,Scarborough,"Malvern, Rouge",43.773077,-79.257774
6,M3B,North York,Don Mills,43.754326,-79.449117
7,M4B,East York,"Parkview Hill, Woodbine Gardens",43.699971,-79.33252
8,M5B,Downtown Toronto,"Garden District, Ryerson",43.656322,-79.380916
9,M6B,North York,Glencairn,43.754326,-79.449117


### Part 3: Explore , clustering and analysis

In [16]:
import numpy as np # library to handle data in a vectorized manner
import json # library to handle JSON files
import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [17]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    geopy-2.0.0                |     pyh9f0ad1d_0          63 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0          conda-forge
    geopy:           

In [20]:
#parameter for foursquare-api

 #CLIENT_ID =
 #CLIENT_SECRET =
 VERSION ='20200718'
 LIMIT =30

In [27]:
radius = 250
n = len(latitude)
filtered_columns = ['PostalCode', 'Borough', 'venue.id', 'venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
df_new2 = pd.DataFrame([], columns = filtered_columns)
for i in range(0, n):
    lati = latitude[i]
    long = longitude[i]
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lati, long, VERSION, radius, LIMIT)
    results = requests.get(url).json()
    
    try:
        items = results['response']['groups'][0]['items']
        dataframe = json_normalize(items)
        m = len(dataframe)
        dataframe['PostalCode'] = pd.DataFrame({'PostalCode': [df_new.loc[i, 'PostalCode']] * m})
        dataframe['Borough'] = pd.DataFrame({'Borough': [df_new.loc[i, 'Borough']] * m})
        dataframe['venue.categories'] = dataframe['venue.categories'].apply(lambda x: x[0]['name'])
        dataframe_fil = dataframe.loc[:, filtered_columns]
        df_new2 = df_new2.append(dataframe_fil)
    except KeyError:
        pass

TypeError: object of type 'float' has no len()

In [28]:
df_new2.head(12)

Unnamed: 0,PostalCode,Borough,venue.id,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,M4A,North York,5a888f7647f8767d37b92f00,Grill Gate,Mediterranean Restaurant,43.753123,-79.45169
1,M4A,North York,4ee8d855108135b4c8585446,Crave Restaurant,Wings Joint,43.753133,-79.450378
2,M4A,North York,50f9bbcc5d24acebc25936af,Domino's Pizza,Pizza Place,43.753127,-79.450926
0,M5A,Downtown Toronto,57eda381498ebe0e6ef40972,UNIQLO ユニクロ,Clothing Store,43.65591,-79.380641
1,M5A,Downtown Toronto,5215491b11d2e36439600424,DAVIDsTEA,Tea Room,43.656973,-79.38104
2,M5A,Downtown Toronto,4ad4c062f964a520c5f720e3,Ed Mirvish Theatre,Theater,43.655102,-79.379768
3,M5A,Downtown Toronto,5615b6c4498e3c32c67ad78f,Blaze Pizza,Pizza Place,43.656518,-79.380015
4,M5A,Downtown Toronto,4ad77a12f964a520260b21e3,CF Toronto Eaton Centre,Shopping Mall,43.65454,-79.380677
5,M5A,Downtown Toronto,4ad4c062f964a5200bf820e3,Silver Snail Comics,Comic Shop,43.657031,-79.381403
6,M5A,Downtown Toronto,514cc159e4b0e4f73af4eced,Jazz Bistro,Music Venue,43.655678,-79.379276


In [29]:
countCategoriesPerBorough = df_new2.groupby(['Borough','venue.categories']).count()['PostalCode'].reset_index()

list_borough = countCategoriesPerBorough['Borough'].unique()
y = pd.DataFrame(list_borough, columns = ['Borough'])

list_venueCategories = countCategoriesPerBorough['venue.categories'].unique()
df_allVenueCategories = pd.DataFrame(list_venueCategories, columns = ['venue.categories'])

X = pd.DataFrame([])

for borough in list_borough:
    df_CategoriesPerBorough = countCategoriesPerBorough[countCategoriesPerBorough['Borough'] == borough][['venue.categories','PostalCode']]
    df_CategoriesPerBorough = df_CategoriesPerBorough.append(pd.DataFrame({'venue.categories': list(set(df_allVenueCategories['venue.categories'])-set(df_CategoriesPerBorough['venue.categories'])), 'PostalCode': [0] * len(list(set(df_allVenueCategories['venue.categories'])-set(df_CategoriesPerBorough['venue.categories'])))})).reset_index(drop=True)
    df_mapping = pd.merge(df_allVenueCategories, df_CategoriesPerBorough, on = 'venue.categories', how = 'left')
    X = X.append(df_mapping.T.iloc[1])
    
X = X.reset_index(drop = True)
X.columns = list_venueCategories
X

Unnamed: 0,Bakery,Beer Bar,Café,Cocktail Bar,Coffee Shop,Fried Chicken Joint,Gym,Gym / Fitness Center,Hotel,Hotel Bar,...,Bank,Bookstore,Bus Station,Cosmetics Shop,Food Court,Greek Restaurant,Sandwich Place,Video Game Store,Museum,Skating Rink
0,6.0,6.0,6.0,6.0,60.0,6.0,6.0,6.0,12.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11.0,0.0,11.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,0.0,0.0
7,5.0,5.0,5.0,5.0,50.0,5.0,5.0,5.0,10.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0


In [30]:
k_means = KMeans(init="k-means++", n_clusters=4, n_init=12)
k_means.fit(X)

k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_

yWithClusterNumber = pd.concat([y.reset_index(drop=True), pd.DataFrame(k_means_labels.tolist(), columns = ["Cluster number"])], axis=1)
yWithClusterNumber = pd.merge(df_new[['Borough', 'Latitude', 'Longitude']], yWithClusterNumber, on='Borough', how='right').groupby(['Borough']).mean().reset_index()

yWithClusterNumber

Unnamed: 0,Borough,Latitude,Longitude,Cluster number
0,Central Toronto,43.644903,-79.381836,1
1,Downtown Toronto,43.656322,-79.380916,2
2,East Toronto,43.62479,-79.393492,0
3,East York,43.699971,-79.33252,0
4,Etobicoke,43.643556,-79.565633,0
5,North York,43.754326,-79.449117,0
6,Scarborough,43.773077,-79.257774,3
7,West Toronto,43.644903,-79.381836,1
8,York,43.689619,-79.479188,0


In [25]:
address = 'Toronto, Ontario'

g = geocoder.osm(address)
latitude = g.osm['y']
longitude = g.osm['x']

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
n = len(yWithClusterNumber)

for i in range(n):
    borough = yWithClusterNumber.iloc[i]
    
    if borough["Cluster number"] == 0:
        color = 'red'
    elif borough["Cluster number"] == 1:
        color = 'blue'
    elif borough["Cluster number"] == 2:
        color = 'yellow'
    elif borough["Cluster number"] == 3:
        color = 'green'
    
    label = '{}'.format(borough["Borough"])
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [borough["Latitude"], borough["Longitude"]],
        radius=5,
        popup=label,
        color=color,
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
# display map
display(map_toronto)
map_toronto.save("toronto_map.html")