<a href="https://colab.research.google.com/github/U-Power/Coursera_Capstone/blob/main/TorontoNeighborhoods1.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Segmenting and Clustering Neighborhoods in Toronto**

## Getting the Toronto postal code data from the web by scraping the Wikipedia page

In [None]:
# import the packages needed to download and scrape the web page (requests, Beautiful Soup) and to handle the data (pandas, numpy)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
# Download the Wikipedia page listing the "M" postal codes and parse it into a
# BeautifulSoup object.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the cells
# below silently parse an error page.
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# Remove the trailing ';' to display the formatted HTML
soup.prettify();

In [None]:
# A closer look at the page shows that the three pieces of information needed
# for each PostalCode are held in the 'p' tags of the first table body.
# Remove the trailing ';' to display the matched tags.
soup.find('tbody').find_all('p');

In [None]:
# Collect the raw text of every 'p' tag: each string holds the postal code,
# the borough and the neighbourhoods run together
paragraphs = soup.tbody.find_all('p')
row_data = [p.get_text() for p in paragraphs]
row_data[:5]

['M1ANot assigned\n',
 'M2ANot assigned\n',
 'M3ANorth York(Parkwoods)\n',
 'M4ANorth York(Victoria Village)\n',
 'M5ADowntown Toronto(Regent Park / Harbourfront)\n']

In [None]:
# The postal code is the first three characters of each row string
pc = [row[:3] for row in row_data]
pc[:5]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']

In [None]:
# The borough is whatever follows the postal code, up to the first '('
# (or up to the end of the line when the row has no '(')
borough = [row[3:].partition('(')[0].partition('\n')[0] for row in row_data]
borough[:5]

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto']

In [None]:
# Build the Neighborhood list: the text between the first '(' and the next ')',
# with the ' / ' separators turned into ', '.
# Rows without a parenthesised part (e.g. 'Not assigned') fall back to the
# borough text, exactly as before.
# An explicit check replaces the former bare `except:`, which used an exception
# as control flow and would also have hidden any unrelated error.
neighb = []
for row in row_data:
  rest = row[3:]
  if '(' in rest:
    inside = rest.split('(')[1].split(')')[0]
    neighb.append(inside.strip().replace(' / ', ', '))
  else:
    neighb.append(rest.split('(')[0].split('\n')[0])

## Create a pandas DataFrame and cleanse the data

In [None]:
# Gather the three parsed lists under the column names of the DataFrame
data = {
    'PostalCode': pc,
    'Borough': borough,
    'Neighborhood': neighb,
}

# Build the DataFrame and preview its first five rows
df = pd.DataFrame(data)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [None]:
# Turn the 'Not assigned' boroughs into NaN, drop those rows and renumber the
# index.
# The results are assigned back instead of calling
# `df.Borough.replace(..., inplace=True)`: an in-place method called on a
# column selected from the frame is a chained assignment, which recent pandas
# versions warn about and which leaves `df` unchanged under Copy-on-Write.
df['Borough'] = df['Borough'].replace('Not assigned', np.nan)
df = df.dropna(subset=['Borough']).reset_index(drop=True)

In [None]:
# Summarise the data per borough; the row labels of the result show which
# borough names still need adjusting
df.groupby(by='Borough').describe()

Unnamed: 0_level_0,PostalCode,PostalCode,PostalCode,PostalCode,Neighborhood,Neighborhood,Neighborhood,Neighborhood
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Borough,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Central Toronto,9,9,M5R,1,9,9,North Toronto West,1
Downtown Toronto,17,17,M5S,1,17,17,Central Bay Street,1
Downtown TorontoStn A PO Boxes25 The Esplanade,1,1,M5W,1,1,1,Enclave of M5E,1
East Toronto,4,4,M4L,1,4,4,"India Bazaar, The Beaches West",1
East TorontoBusiness reply mail Processing Centre969 Eastern,1,1,M7Y,1,1,1,Enclave of M4L,1
East York,4,4,M4H,1,4,4,Thorncliffe Park,1
East YorkEast Toronto,1,1,M4J,1,1,1,The Danforth East,1
Etobicoke,11,11,M8W,1,11,11,"West Deane Park, Princess Gardens, Martin Grov...",1
EtobicokeNorthwest,1,1,M9W,1,1,1,"Clairville, Humberwood, Woodbine Downs, West H...",1
MississaugaCanada Post Gateway Processing Centre,1,1,M7R,1,1,1,Enclave of L4W,1


In [None]:
# Map the run-together borough labels to short, readable names
borough_renames = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_renames)

In [None]:
# Report the (rows, columns) size of the cleaned DataFrame
frame_shape = df.shape
print('The dimensions of the Data Frame are:', frame_shape)

The dimensions of the Data Frame are: (103, 3)


### Splitting the data into one row per Neighborhood (to allow a more detailed analysis)

In [None]:
# Give every neighbourhood its own row so that each can be mapped separately:
# split the comma-separated Neighborhood string into a list and explode it,
# which repeats the PostalCode for each entry.
# This replaces the former DataFrame(...).stack() / reset_index([0, 'PostalCode'])
# construction, in which both entries of the level list named the same index
# level and which relied on stack() dropping the None padding of shorter lists.
new_df = (
    df[['PostalCode', 'Neighborhood']]
    .assign(Neighborhood=df['Neighborhood'].str.split(', '))
    .explode('Neighborhood')
    .reset_index(drop=True)
)
new_df.head(10)

In [None]:
# Lookup table linking each postal code to its borough
df_bor = df.drop(columns='Neighborhood')
df_bor

In [None]:
# Attach the borough to every neighbourhood row, giving one complete row per
# neighbourhood. No key is passed, so pandas joins on the columns the two
# frames have in common.
df_adj = pd.merge(new_df, df_bor, how='inner')
df_adj

# Use the geopy Python library to get Neighborhood coordinates
**To keep things simple, you can skip this section and use the provided CSV instead.**
**This is recommended, since getting the coordinates for every Neighborhood does not give additional information compared with using postal codes. Also, it takes 2-3 minutes to run the loop over all the Neighborhoods.**

In [None]:
# import the libraries
!pip install geopy # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [None]:
# Get the coordinates for every neighborhood in Toronto and append them to the
# DataFrame (it takes 2-3 min: one geocoding request per row).
latitudes = []
longitudes = []

# A single geocoder client is reused for all rows instead of being rebuilt on
# every iteration.
geolocator = Nominatim(user_agent="toronto_explorer")

for neighborhood in df_adj['Neighborhood']:
  try:
    location = geolocator.geocode('Toronto, Ontario, ' + neighborhood)
  except Exception:
    # Best effort: one failed lookup must not abort the whole run. Catching
    # Exception (rather than a bare except) still lets the loop be interrupted
    # from the keyboard.
    location = None

  if location is None:
    # No result for this address: keep the 'NaN' string marker that the later
    # filtering step looks for.
    latitudes.append('NaN')
    longitudes.append('NaN')
  else:
    latitudes.append(location.latitude)
    longitudes.append(location.longitude)

# NOTE: df_fin is the same object as df_adj (no copy is made), so df_adj gains
# the two new columns as well.
df_fin = df_adj
df_fin['Latitude'] = latitudes
df_fin['Longitude'] = longitudes

In [None]:
# Summarise the geocoded DataFrame. Pay attention to the number of unique
# coordinate values obtained for the Neighborhoods:
# it can be compared with the unique values found in the provided CSV.
df_fin.describe()

In [None]:
# Drop the rows whose geocoding failed, to ease the next part (Foursquare API).
# The coordinates are converted to numbers first: failed lookups were stored as
# the string 'NaN', which kept both columns as generic objects, so describe()
# could not report numeric statistics. to_numeric turns the marker into a real
# NaN, and dropna then removes those rows (along with any genuine NaN values).
df_fin = df_fin.assign(
    Latitude=pd.to_numeric(df_fin['Latitude'], errors='coerce'),
    Longitude=pd.to_numeric(df_fin['Longitude'], errors='coerce'),
).dropna(subset=['Latitude', 'Longitude'])
df_fin.describe()

In [None]:
# Load the provided CSV of geospatial coordinates into a DataFrame and preview it
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
latlong_df = pd.read_csv(filepath_or_buffer=url)
latlong_df.head(n=5)

In [None]:
# Compare the number of distinct latitude values in the provided CSV with the
# number in the DataFrame built through the geocoding API
csv_unique = latlong_df['Latitude'].nunique()
api_unique = df_fin['Latitude'].nunique()
summary_template = 'The number of neighborhood coord for the CSV: {} \nThe number of neighborhood coord fetched with the API: {} \nSo we can make a better analysis using the data fetched from the API'
print(summary_template.format(csv_unique, api_unique))