# Segmenting and Clustering Neighborhoods in Toronto -Part 1

### Importing required libraries


In [1]:
!pip install BeautifulSoup4
!pip install geopy
!pip install -U scikit-learn scipy matplotlib
!conda install -c conda-forge folium=0.5.0 --yes 
!pip install lxml
print("INSTALLED ALL LIBRARIES")

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 18.6MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.1 soupsieve-2.0.1
Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/07/e1/9c72de674d5c2b8fcb0738a5ceeb5424941fefa080bfe4e240d0bacb5a38/geopy-2.0.0-py3-none-any.whl (111kB)
[K     |████████████████████████████████| 112kB 6.0MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e2

In [2]:
import numpy as np

import pandas as pd

import json

from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

import lxml.html as lh

print('All required libraries were imported!')

All required libraries were imported!


### Importing the wikipedia page

In [7]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Create a handle (page), to handle contents of the website
page = requests.get(url)

#Store contents of the website under doc
doc = lh.fromstring(page.content)

#Parse data that are stored between <tr>...</tr> of html
tr_elements = doc.xpath('//tr')

In [8]:
#Check the length of first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

#### This shows us that there are THREE columns per row 

In [9]:
#Parse first row as header
tr_elements = doc.xpath('//tr')

#Creat an empty list
col = []
i = 0

#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name = t.text_content()
    print('%d:"%s"'%(i, name))
    col.append((name, []))

1:"Postal Code
"
2:"Borough
"
3:"Neighborhood
"


## Creating pandas dataframe

#### Each header is appended to a tuple along with an empty list

In [10]:
#Since the first row is header, the data will be stored from the second row onwards
for j in range(1, len(tr_elements)):
    #T is our j'th row
    T = tr_elements[j]
    
    #If row is not of size 3, the <//tr> data is not from the table
    if len(T)!=3:
        break
    
    #i is the index of the column
    i = 0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data = t.text_content()
        #Check if any row is empty
        if i>0:
            #Convert any numerical value to integers
            try:
                data = int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for next column
        i+=1

In [11]:
#Checking the length of each column, which should ideally be the same
[len(C) for (title,C) in col]

[181, 181, 181]

#### This shows us that the three columns have exactly 181 rows

#### Creating pandas dataframe

In [29]:
Dict = {title:column for (title,column) in col}
df = pd.DataFrame(Dict)

In [30]:
#Accessing the dataframe
df.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


#### Cleaning the messy strings

In [31]:
df = df.replace('\n',' ',regex=True)
df.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Renaming the columns

In [32]:
df.columns = ['PostCode', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Dropping all cells with a borough that is 'Not assigned'

In [37]:
df.drop(df.index[df['Borough'] == 'Not assigned'], inplace = True)

#Rest the index and drop the previous index
df = df.reset_index(drop = True)

df.head(10)

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


#### Combining Neighborhood based on similar PostCode and Borough 

In [38]:
df = df.groupby(['PostCode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
df.columns = ['PostCode', 'Borough', 'Neighborhood']
df.head(10)

Unnamed: 0,PostCode,Borough,Neighborhood
0,,Canadian postal codes,
1,M1A,Not assigned,Not assigned
2,M1B,Scarborough,"Malvern, Rouge"
3,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
4,M1E,Scarborough,"Guildwood, Morningside, West Hill"
5,M1G,Scarborough,Woburn
6,M1H,Scarborough,Cedarbrae
7,M1J,Scarborough,Scarborough Village
8,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
9,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"


#### Removing any space at the start of the string

In [40]:
df['Neighborhood'] = df['Neighborhood'].str.strip()

#### Assigning Borough values to the Neighborhood where value is 'Not assigned'

In [41]:
df.loc[df['Neighborhood'] == 'Not assigned','Neighborhood'] = df['Borough']

In [43]:
df.shape

(181, 3)

In [None]:
df.to