# IBM Applied Data Science Capstone Course by Coursera

## Segmentation and Clustering Part 1



In [3]:
#import all libraries
from bs4 import BeautifulSoup
import urllib3.request
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import folium
import os
import requests
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Web Scraping

In [4]:
# Web scraping: download the HTML of the Wikipedia page listing the
# Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of handing an
# error page to the parser in the following cells.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
web_data = response.text

In [6]:
# Build a BeautifulSoup tree from the downloaded HTML text using Python's
# built-in 'html.parser' backend.
soup = BeautifulSoup(markup=web_data, features='html.parser')

#### Create empty lists for the table columns

In [7]:
# One empty list per output column; they are filled row by row while the
# scraped table is walked.
postalCodeList, boroughList, neighborhoodList = [], [], []

#### Utilize BeautifulSoup

In [8]:
# Use BeautifulSoup to locate the first <table> element on the page.
table = soup.find('table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("No <table> element found in the downloaded page")

# Collect all rows (<tr>) of that table once and reuse the result.
table_rows = table.find_all('tr')

# Walk the rows and pull out the data cells (<td>) of each one.
# After the loop, `row` and `cells` refer to the last row of the table.
for row in table_rows:
    cells = row.find_all('td')

#### Import data into lists

In [9]:
# Import the table data into the three column lists.

# Empty the lists first so that re-running this cell does not append the
# same rows a second time.
postalCodeList.clear()
boroughList.clear()
neighborhoodList.clear()

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Three cells are indexed below, so skip any row that has fewer than
    # three <td> cells (this includes rows with no <td> cells at all).
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace/newlines from every column, not only the
    # neighborhood, so later equality checks such as == "Not assigned" match.
    postalCodeList.append(cells[0].text.strip())
    boroughList.append(cells[1].text.strip())
    neighborhoodList.append(cells[2].text.strip())

#### Creating DataFrame

In [10]:
# Assemble the scraped columns into a single DataFrame; the dictionary keys
# become the column names.
column_data = {
    "PostalCode": postalCodeList,
    "Borough": boroughList,
    "Neighborhood": neighborhoodList,
}
tor_df = pd.DataFrame(data=column_data)

In [11]:
# Preview the first five rows of the scraped table.
tor_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Dropping rows with a 'Not assigned' borough

In [14]:
# Keep only the rows whose Borough is something other than "Not assigned",
# then renumber the index from 0.
has_borough = tor_df["Borough"] != "Not assigned"
tor_df_dropna = tor_df.loc[has_borough].reset_index(drop=True)
tor_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


#### Grouping neighborhoods by postal code

In [15]:
# Collapse rows that share the same PostalCode and Borough into one row,
# joining their neighborhood names into a single comma-separated string.
postal_groups = tor_df_dropna.groupby(["PostalCode", "Borough"], as_index=False)
tor_df_grouped = postal_groups.agg(lambda names: ", ".join(names))
tor_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Replacing 'Not assigned' neighborhoods with the borough name

In [19]:
# When Neighborhood is "Not assigned", it should take the value of Borough.
# Assigning to the row objects yielded by iterrows() is not guaranteed to
# write back to the DataFrame (they may be copies), so the replacement is
# done with a boolean mask and .loc, which updates the frame itself.
unassigned = tor_df_grouped["Neighborhood"] == "Not assigned"
tor_df_grouped.loc[unassigned, "Neighborhood"] = tor_df_grouped.loc[unassigned, "Borough"]

tor_df_grouped.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Dataframe shape

In [21]:
# Display the dimensions of the grouped DataFrame as (rows, columns).
tor_df_grouped.shape

(103, 3)