# Week3 peer-graded Assignment: <ins> Segmenting and Clustering Neighborhoods in Toronto</ins>

## 1. Scraping of Wikipedia page

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection.
cn_postalcode_response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
                                      timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
cn_postalcode_response.raise_for_status()
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
# installed (and warns about it), so the parsed tree could differ between machines.
cn_postalcode_soup     = BeautifulSoup(cn_postalcode_response.text, 'html.parser')

### Generate the DataFrame from the table scraped from the wikipedia page

In [3]:
def _cell_texts(table_row):
    """Return the stripped text of every direct <th>/<td> cell of one table row.

    Empty cells are kept (as '') so that the remaining cells stay aligned
    with their column headers.
    """
    return [cell.get_text().strip()
            for cell in table_row.find_all(['th', 'td'], recursive=False)]

# Only the first three columns of the table are used.
N_TABLE_COLUMNS = 3

# All rows of the first <tbody> on the page; the first row holds the column headers.
table_body = cn_postalcode_soup.find('tbody').find_all('tr')
header_list = _cell_texts(table_body[0])

# One record per data row; rows with fewer cells than expected are skipped
# instead of raising an IndexError.
table_records = [cells[:N_TABLE_COLUMNS]
                 for cells in map(_cell_texts, table_body[1:])
                 if len(cells) >= N_TABLE_COLUMNS]

df_table = (
    pd.DataFrame(table_records, columns=header_list[:N_TABLE_COLUMNS])
    .rename(columns={"Postal Code": "PostalCode"})
)
df_table.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Summary statistics of the scraped table (count / unique / top / freq per column)
df_table.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,180,180,180
unique,180,11,100
top,M6J,Not assigned,Not assigned
freq,1,77,77


### Remove rows with not assigned Borough from the DataFrame

In [5]:
# Keep only the rows whose Borough is assigned, then renumber the rows from 0
df_table_clean = df_table.query("Borough != 'Not assigned'").reset_index(drop=True)

### Replace Not assigned Neighborhood with corresponding Borough if assigned

In [6]:
def clean_table(row):
    """Replace an unassigned Neighborhood with the row's Borough.

    Parameters
    ----------
    row : pandas.Series
        One table row carrying the labels 'Borough' and 'Neighborhood'.

    Returns
    -------
    pandas.Series
        The row itself when nothing has to change; otherwise a copy whose
        'Neighborhood' holds the Borough name. The input row is not modified.
    """
    not_assigned = 'Not assigned'
    # The Neighborhood is only replaced when the Borough itself is assigned.
    if row['Neighborhood'] == not_assigned and row['Borough'] != not_assigned:
        row = row.copy()
        row['Neighborhood'] = row['Borough']
    return row

# DataFrame.apply hands back its result as a new object; keep it, otherwise
# the output of clean_table is thrown away and the table stays as it was.
df_table_clean = df_table_clean.apply(clean_table, axis=1)
df_table_clean.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## 2. Obtain latitude and longitude coordinates of neighborhoods using the geocoder API

In [7]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
import time

def compute_lat_long(address):
    """Geocode ``address`` through Nominatim.

    Returns a ``(latitude, longitude)`` tuple, or ``(None, None)`` when the
    geocoder has no match for the address.
    """
    match = Nominatim(user_agent="capstone_project").geocode(address)
    if match is None:
        return None, None
    return match.latitude, match.longitude

# The public Nominatim server allows at most one request per second
# (see its usage policy), so pause between two consecutive look-ups.
NOMINATIM_MIN_DELAY_S = 1.0

latitude = []; longitude = []
for postal_code, borough in zip(df_table_clean.PostalCode, df_table_clean.Borough):
    # Query string: "<postal code> <borough>"
    address = postal_code + ' ' + borough
    lat, long = compute_lat_long(address)
    # Coordinates are rounded to 2 decimals; unresolved addresses stay None.
    if (lat is not None) and (long is not None):
        lat, long = round(lat, 2), round(long, 2)
    latitude.append(lat); longitude.append(long)
    time.sleep(NOMINATIM_MIN_DELAY_S)

print(latitude, longitude)

[43.75, None, None, None, 43.66, None, 54.28, None, None, None, None, 43.64, 54.28, 43.73, None, None, None, 43.67, None, None, None, None, 43.76, None, None, None, None, None, None, None, 43.65, None, None, 43.75, None, None, None, 43.65, None, None, 43.74, None, None, 43.72, None, None, None, 43.67, None, None, None, None, 43.79, None, None, None, None, None, None, 43.75, None, None, None, 43.69, None, None, None, None, None, 43.72, None, None, None, None, None, None, None, 43.67, None, None, None, 43.72, None, None, None, None, None, None, None, None, 43.77, None, None, None, None, None, None, None, None, None, None, None, None] [-79.45, None, None, None, -79.38, None, -0.4, None, None, None, None, -79.54, -0.4, -79.35, None, None, None, -79.55, None, None, None, None, -79.23, None, None, None, None, None, None, None, -79.38, None, None, -79.45, None, None, None, -79.41, None, None, -79.48, None, None, -79.42, None, None, None, -79.3, None, None, None, None, -79.42, None, None, None

One sees that the **geocoder** does not recognize more than half of the addresses.

In [8]:
# Attach the geocoded latitude and longitude to the DataFrame as two new columns
df_table_clean = df_table_clean.assign(latitude=latitude, longitude=longitude)
df_table_clean.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.75,-79.45
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66,-79.38


#### Let's find out the percentage of NaN values in the requested latitude and longitude

In [9]:
# Percentage of missing values in each column
df_table_clean.isna().mean().mul(100)

PostalCode       0.00000
Borough          0.00000
Neighborhood     0.00000
latitude        79.61165
longitude       79.61165
dtype: float64

More than **79%** of the addresses are not recognized by the geocoder API. <br>
I found that this <ins>**value changes**</ins> each time the cell is run. <br>
Therefore, it is important to import the latitudes and longitudes from the provided CSV file.

In [10]:
# Load the coordinates CSV from the provided URL and give the key column
# the same name ('PostalCode') as in the scraped table
df_lat_long = (
    pd.read_csv('https://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'PostalCode'})
)
df_lat_long.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Drop the previously added geocoder latitude and longitude columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second run no longer raises a KeyError
# because the columns are already gone.
df_table_clean = df_table_clean.drop(columns=['latitude', 'longitude'], errors='ignore')

# Join both DataFrames on the postal code with an outer join, which keeps
# the postal codes present in either of the two frames
df_table_cord = pd.merge(df_table_clean, df_lat_long, on='PostalCode', how='outer')
df_table_cord.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [12]:
# Percentage of NaN values per column in the final DataFrame
df_table_cord.isna().mean().mul(100)

PostalCode      0.0
Borough         0.0
Neighborhood    0.0
Latitude        0.0
Longitude       0.0
dtype: float64

In [13]:
# Check the shape (rows, columns) of the final table
df_table_cord.shape

(103, 5)

In [14]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium 

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



### Representation on the map

In [15]:
# Generate a map centred on the coordinates of postal code M6B
CENTER_POSTAL_CODE = 'M6B'
M6B_index = df_table_cord[df_table_cord['PostalCode'] == CENTER_POSTAL_CODE].index
# Look the centre coordinates up once; they are used for both the map and its marker.
center_location = [df_table_cord.Latitude[M6B_index[0]], df_table_cord.Longitude[M6B_index[0]]]
venues_map = folium.Map(location=center_location, zoom_start=11)

# Add a large red circle marker on the map centre, labelled with its postal code
folium.features.CircleMarker(
    center_location,
    radius=15,
    color='red',
    popup=CENTER_POSTAL_CODE + ' (map centre)',
    fill=True,
    fill_color='red',
    fill_opacity=0.7
).add_to(venues_map)

# Add every postal code of the table to the map as a small blue circle marker
for lat, lng, label in zip(df_table_cord.Latitude, df_table_cord.Longitude, df_table_cord.PostalCode):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color='blue',
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map