<a href="https://colab.research.google.com/github/U-Power/Coursera_Capstone/blob/main/TorontoNeighborhoods1.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Segmenting and Clustering Neighborhoods in Toronto**

## Getting Toronto postal code information from the web by scraping the Wikipedia page

In [1]:
# import the beautiful soup package to scrape the web page
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with 'M'.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() stops
# the notebook on an HTTP error instead of silently parsing an error page.
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
r.raise_for_status()

# Parse the HTML with the standard-library parser
soup = BeautifulSoup(r.text, 'html.parser')
# Pretty-print call kept for inspection; the trailing ';' suppresses the (very long) output
soup.prettify();

In [3]:
# On closer inspection, the three pieces of information needed for each PostalCode sit inside the <p> tags of the table body
soup.find('tbody').find_all('p');

In [4]:
# Pull the raw text out of every <p> tag in the table body; each string carries the 3 metrics needed

row_data = [p_tag.get_text() for p_tag in soup.tbody.find_all('p')]
row_data[:5]

['M1ANot assigned\n',
 'M2ANot assigned\n',
 'M3ANorth York(Parkwoods)\n',
 'M4ANorth York(Victoria Village)\n',
 'M5ADowntown Toronto(Regent Park / Harbourfront)\n']

In [5]:
# The postal code is the first 3 characters of each row string; collect them in a list
pc = [row[:3] for row in row_data]
pc[:5]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']

In [6]:
# Skip the 3-character postal code, then keep the text before the first '(' (and before any newline) as the Borough
borough = [row[3:].partition('(')[0].partition('\n')[0] for row in row_data]
borough[:5]


['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto']

In [7]:
# Build the Neighborhoods list from the text that follows the 3-character postal code.
# When the row has a parenthesised part, take the text between the first '(' and the
# next ')', trim it and turn ' / ' separators into ', '.
# When there is no '(' at all (e.g. 'Not assigned'), fall back to the text before the
# newline, i.e. the same value used for the Borough.
neighb = []
for row in row_data:
    parts = row[3:].split('(')
    if len(parts) > 1:
        # explicit check instead of a bare `except:`, which would also hide unrelated errors
        label = parts[1].split(')')[0].strip().replace(' / ', ', ')
    else:
        label = parts[0].split('\n')[0]
    neighb.append(label)

neighb[0:5]

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Regent Park, Harbourfront']

## Create a pandas DataFrame and cleanse the data

In [8]:
# Gather the three lists into a dictionary keyed by the DataFrame column names
data = {
    'PostalCode': pc,
    'Borough': borough,
    'Neighborhood': neighb,
}

# Build the DataFrame from the dictionary and preview its first five rows
df = pd.DataFrame(data)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
# Summarise the data per 'Borough' to spot the labels that need adjusting
df.groupby(by='Borough').describe()

Unnamed: 0_level_0,PostalCode,PostalCode,PostalCode,PostalCode,Neighborhood,Neighborhood,Neighborhood,Neighborhood
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Borough,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Central Toronto,9,9,M4T,1,9,9,"Moore Park, Summerhill East",1
Downtown Toronto,17,17,M5H,1,17,17,Rosedale,1
Downtown TorontoStn A PO Boxes25 The Esplanade,1,1,M5W,1,1,1,Enclave of M5E,1
East Toronto,4,4,M4E,1,4,4,"India Bazaar, The Beaches West",1
East TorontoBusiness reply mail Processing Centre969 Eastern,1,1,M7Y,1,1,1,Enclave of M4L,1
East York,4,4,M4G,1,4,4,"Parkview Hill, Woodbine Gardens",1
East YorkEast Toronto,1,1,M4J,1,1,1,The Danforth East,1
Etobicoke,11,11,M9A,1,11,11,"New Toronto, Mimico South, Humber Bay Shores",1
EtobicokeNorthwest,1,1,M9W,1,1,1,"Clairville, Humberwood, Woodbine Downs, West H...",1
MississaugaCanada Post Gateway Processing Centre,1,1,M7R,1,1,1,Enclave of L4W,1


In [10]:
# Replace the run-together borough strings seen in the summary with readable labels
borough_labels = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_labels)

In [11]:
# Print the (rows, columns) shape of df as a quick size check
print('The dimensions of the Data Frame are:', df.shape)

The dimensions of the Data Frame are: (180, 3)


## Use the geocoder Python library to get postal code coordinates

In [12]:
# !pip install geocoder  # followed by `import geocoder`; it did not work, so we get the lat/long coordinates from the provided CSV link instead

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |███▎                            | 10kB 14.1MB/s eta 0:00:01[K     |██████▋                         | 20kB 20.3MB/s eta 0:00:01[K     |██████████                      | 30kB 10.6MB/s eta 0:00:01[K     |█████████████▎                  | 40kB 8.7MB/s eta 0:00:01[K     |████████████████▋               | 51kB 5.6MB/s eta 0:00:01[K     |████████████████████            | 61kB 5.6MB/s eta 0:00:01[K     |███████████████████████▎        | 71kB 6.0MB/s eta 0:00:01[K     |██████████████████████████▋     | 81kB 6.7MB/s eta 0:00:01[K     |██████████████████████████████  | 92kB 6.6MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 4.0MB/s 
Collecting ratelim
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592ca

In [32]:
# Read the postal code latitude/longitude table from the hosted CSV file

url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
latlong_df = pd.read_csv(url)
# Preview the first rows to check the column names
latlong_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# The coordinates table names its key column 'Postal Code' (with a space);
# rename it to 'PostalCode' so both frames share the same key column name.
latlong_df = latlong_df.rename(columns={"Postal Code": "PostalCode"})
# Left join on the explicit key so every row of df is kept; postal codes without a
# match in the coordinates table get NaN for Latitude and Longitude.
df_geo = df.merge(latlong_df, how='left', on='PostalCode')
df_geo

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1A,Not assigned,Not assigned,,
1,M2A,Not assigned,Not assigned,,
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
...,...,...,...,...,...
175,M5Z,Not assigned,Not assigned,,
176,M6Z,Not assigned,Not assigned,,
177,M7Z,Not assigned,Not assigned,,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999
