# Segmenting and Clustering Neighborhoods in Toronto

### A project assignment for Applied Data Science for IBM/Coursera

#### Done By: Shravan Bharadwaj

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
import pgeocode
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Scraping Webpage by using BeautifulSoup
    
     Scrape the Wikipedia page with BeautifulSoup and store the first table found on it in a variable.

In [2]:
# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and keep the first <table> element found in it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'    # Creating URL
response = requests.get(url, timeout=30)     # timeout: do not hang forever on a stalled connection
response.raise_for_status()                  # stop here on an HTTP error instead of parsing an error page
toronto_source = response.text                                             # URL to Text
soup = BeautifulSoup(toronto_source, 'html.parser')                        # soup object
tables = soup.find('table')                  # find() returns the first match, or None if there is none
if tables is None:
    # Fail with a clear message now rather than with an AttributeError in a later cell.
    raise ValueError('No <table> element found at ' + url)

### Extracting the Information from the tables Data
    
    The postal codes, boroughs, and neighborhoods are extracted into three separate lists.

In [3]:
# Walk every <td> cell of the scraped table and split its text into three
# parallel lists: postal code, borough, and neighborhood(s).
li_code, li_borough, li_neighborhood = [], [], []
for row in tables.find_all('tr'):                          # every table row
    for cell in row.find_all('td'):                        # every data cell in that row
        text = cell.get_text(separator='', strip=True)     # cell contents as one string
        li_code.append(text[0:3])                          # first 3 characters are the code
        try:
            # str.index raises ValueError when the character is absent.
            open_idx = text.index('(')
            close_idx = text.index(')')
        except ValueError:
            # No parenthesised part in this cell: record both fields as 'Not Assigned'.
            li_borough.append('Not Assigned')
            li_neighborhood.append('Not Assigned')
        else:
            # Text between the code and '(' is the borough; text inside the
            # parentheses is the neighborhood list.
            li_borough.append(text[3:open_idx])
            li_neighborhood.append(text[open_idx + 1:close_idx])

### Obtaining a Pandas Data Frame:
    
    1. For postal codes with multiple neighborhoods, the '/' separating the neighborhood names is replaced by ', '.
    2. A Dictionary is created with the help of lists.
    3. The Dictionary is converted to a Pandas DATAFRAME.

In [4]:
# Neighborhoods sharing a postal code are separated by '/'; replace that with ', '.
# str.replace leaves names without a '/' unchanged, so no membership check is needed.
li_neigh = [name.replace('/', ', ') for name in li_neighborhood]

# Map each column name to its list, then build the DataFrame from that mapping.
di = {
    'Postal Code': li_code,
    'Borough': li_borough,
    'Neighborhood': li_neigh,
}
toronto_df = pd.DataFrame(di)
toronto_df.head(15)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not Assigned,Not Assigned
1,M2A,Not Assigned,Not Assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
7,M8A,Not Assigned,Not Assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


In [5]:
# Description of the Data Frame: shape, summary statistics, and column info.
separator = '******************************************************************************************************'

print('The Shape of DF: ', toronto_df.shape)
print(separator)
# Print the label on its own line so the describe() table keeps its column alignment.
print('Description:')
print(toronto_df.describe())
print(separator)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() would emit a stray 'None'.
print('Info:')
toronto_df.info()

The Shape of DF:  (180, 3)
******************************************************************************************************
Description:         Postal Code       Borough  Neighborhood
count          180           180           180
unique         180            16            99
top            M8C  Not Assigned  Not Assigned
freq             1            77            77
******************************************************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal Code   180 non-null    object
 1   Borough       180 non-null    object
 2   Neighborhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB
Info:  None


### Cleaning the data frame:
    
    Delete the rows whose Borough is 'Not Assigned'.

In [6]:
# Keep only the rows with an assigned borough and renumber the index from 0.
has_borough = toronto_df.Borough != "Not Assigned"
toronto_df1 = toronto_df[has_borough].reset_index(drop=True)
toronto_df1.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


### Data Frame shape:

In [7]:
# Report the (rows, columns) of the frame left after dropping 'Not Assigned' boroughs.
shape_after_cleaning = toronto_df1.shape
print('The Shape of New Data Frame is: ', shape_after_cleaning)

The Shape of New Data Frame is:  (103, 3)
