# Segmenting and Clustering Neighborhoods in Toronto

### A project assignment for Applied Data Science for IBM/Coursera

#### Done By: Shravan Bharadwaj

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
import pgeocode
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Scraping Webpage by using BeautifulSoup
    
     Scraping the webpage by using BeautifulSoup and storing the tables data into a variable.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'    # Creating URL
toronto_source = requests.get(url).text                                    # URL to Text
soup = BeautifulSoup(toronto_source,'html.parser')                         # soup object
tables=soup.find('table')

### Extracting the Information from the tables Data
    
    The Data of CODE, BOROUGHS and NEIGHBORHOODS are extracted into 3 lists respectively.

In [3]:
li_code,li_borough=[],[]                              # List to store Code, Borough and Neighborhoods
li_neighborhood=[]
for row in tables.find_all('tr'):                     # Looping to find the tag 'tr'
    cols=row.find_all('td')                           # Finding the tag 'td'
    for info in cols:                                 # Looping through the 'td' tag
        info=info.get_text(separator='',strip=True)   # Converting it to a String with get_text Method
        li_code.append(info[0:3])                     # First 3 Characters are CODES.
        try:                                          # Try block to get the index of '(' and ')'
            a=info.index('(')
            b=info.index(')')
        except:                                       # Since some do not have any Information it will be 
            li_borough.append('Not Assigned')         # "Not Assigned"
            li_neighborhood.append('Not Assigned')  
        else:                                         # Some Info which has will be sliced and appended to
            li_borough.append(info[3:a])              # the respective lists.
            li_neighborhood.append(info[a+1:b])

### Obtaining a Pandas Data Frame:
    
    1. The Postal Codes which have multiple Neighborhoods which are seprated by '/' are separated by ', '.
    2. A Dictionary is created with the help of lists.
    3. The Dictionary is converted to a Pandas DATAFRAME.

In [5]:
# The Postal Codes which have multiple Neighborhoods which are seprated by '/' are separated by ', '
li_neigh=[]
for i,j in enumerate(li_neighborhood):
    if '/' in j:
        j=j.replace('/',', ')
        li_neigh.append(j)
    else:
        li_neigh.append(j)

# Creating the Pandas Dataframe by creating a dictionary of columns required:
    
di={'Postal Code':li_code,'Borough':li_borough,'Neighborhood':li_neigh}                 # Dictionary
toronto_df =pd.DataFrame(di)                                                            # Data Frame
toronto_df.head(15) 

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not Assigned,Not Assigned
1,M2A,Not Assigned,Not Assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
7,M8A,Not Assigned,Not Assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


In [6]:
# Description of the Data Frame:
print('The Shape of DF: ',toronto_df.shape)
print('******************************************************************************************************')
print('Description: ',toronto_df.describe())
print('******************************************************************************************************')
print('Info: ',toronto_df.info())

The Shape of DF:  (180, 3)
******************************************************************************************************
Description:         Postal Code       Borough  Neighborhood
count          180           180           180
unique         180            16            99
top            M4L  Not Assigned  Not Assigned
freq             1            77            77
******************************************************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal Code   180 non-null    object
 1   Borough       180 non-null    object
 2   Neighborhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB
Info:  None


### Cleaning the data frame:
    
    Deleting rows which have 'Not Assigned' Values.

In [7]:
toronto_df1= toronto_df[toronto_df.Borough!="Not Assigned"]
toronto_df1.reset_index(drop=True,inplace=True)
toronto_df1.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


### Data Frame shape:

In [8]:
print('The Shape of New Data Frame is: ',toronto_df1.shape)

The Shape of New Data Frame is:  (103, 3)


### Adding new columns of Latitude and Longitude:

    A list of none values will be added to the Data Frame.

In [9]:
lati_list = [None] * toronto_df1.shape[0]
longi_list = [None] * toronto_df1.shape[0]
toronto_df1['Latitude'] = lati_list
toronto_df1['Longitude'] = longi_list
toronto_df1.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toronto_df1['Latitude'] = lati_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toronto_df1['Longitude'] = longi_list


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Queen's Park,Ontario Provincial Government,,


### Obtaining the Latitudes and Longitudes:
    
    Using Geocoder package we will get the information regarding Latitude and Longitude.
    
    Note: Due to the result of getting 'None' values for certain adresses  in geopy Nominatim Library,
          pgeocode library is used to get latitudes and longitudes using the Postal Codes.
    

In [10]:
for i, code in enumerate(toronto_df1['Postal Code']):
    nomi = pgeocode.Nominatim('ca')                           # Canada code is 'ca'
    postal_code = code                                        # Postal Code
    location = nomi.query_postal_code(postal_code)            # getting location details by postal code
    latitude = location.latitude                              # Latitude
    longitude = location.longitude                            # Longitude
    toronto_df1['Latitude'][i] = latitude
    toronto_df1['Longitude'][i] = longitude

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
print('The Data Frame has {} boroughs '.format(len(toronto_df1['Borough'].unique())))

The Data Frame has 15 boroughs 


### Cleaning the Data Frame:
     
    It seems that the pgeocode library could not find the latitude and longitude of row 76. So this row will be
    dropped.

In [12]:
toronto_df1.iloc[76]  # Show the details of row 76

Postal Code                                                  M7R
Borough         MississaugaCanada Post Gateway Processing Centre
Neighborhood                                      Enclave of L4W
Latitude                                                     NaN
Longitude                                                    NaN
Name: 76, dtype: object

In [13]:
toronto_df1.drop(labels=76,axis=0,inplace=True)  # Dropping Row 76

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [14]:
toronto_df1.reset_index(drop=True, inplace=True) # Resting Index

### Vizualizing using Folium Map

##### Get the address of Toronto:

In [17]:
address = 'Toronto, Ontario, CA'
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
lati_toronto = location.latitude
longi_toronto = location.longitude
print(f'The Cordinates of Toronto is {lati_toronto} and {longi_toronto}.')

The Cordinates of Toronto is 43.6534817 and -79.3839347.


##### Map with all Neighborhoods:

In [18]:
toronto_map= folium.Map(location=[lati_toronto,longi_toronto],zoom_start=10)

for lat,lng,borough,neighborhood in zip(toronto_df1['Latitude'],toronto_df1['Longitude'],toronto_df1['Borough'],toronto_df1['Neighborhood']):
    
    label = '{}, {}'.format(neighborhood,borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker( [lat,lng],radius = 5, popup = label, color ='red', fill =True, fill_color = 'red', fill_capacity=0.7, parse_html=False).add_to(toronto_map)
        

toronto_map

#### Display the DataFrame with coordinates

In [19]:
toronto_df1.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889
5,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
7,M3B,North York,Don Mills,43.745,-79.359
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District,Ryerson",43.6572,-79.3783
