# IBM Applied Data Science Capstone Course by Coursera

## Segmentation and Clustering Part 1



In [1]:
#import all libraries
from bs4 import BeautifulSoup
import urllib3.request
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import folium
import os
import requests
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Web Scraping

In [2]:
# Web scraping: download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of
# silently handing an error page to the parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
web_data = response.text

In [3]:
# Turn the downloaded HTML text into a searchable parse tree,
# using the parser that ships with Python.
soup = BeautifulSoup(markup=web_data, features='html.parser')

#### Create empty lists for the table columns

In [4]:
# One empty list per output column; they are filled while walking the table rows.
postalCodeList, boroughList, neighborhoodList = [], [], []

#### Utilize BeautifulSoup

In [5]:
# Use BeautifulSoup to locate the first table on the page and collect its
# rows once, instead of re-searching the document for every step.
postal_table = soup.find('table')
if postal_table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("No <table> element found in the downloaded page")
table_rows = postal_table.find_all('tr')

# Collect the <td> cells of every row; rows without <td> cells give an empty list.
cells_per_row = [row.find_all('td') for row in table_rows]

# Show how many rows were found as a quick sanity check.
len(table_rows)

#### Import data into lists

In [6]:
# Fill the three column lists from the table rows.
# The lists are emptied first so that re-running this cell does not append
# the same rows a second time.
postalCodeList.clear()
boroughList.clear()
neighborhoodList.clear()

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Skip rows that do not carry the three expected data cells
    # (for example a row with no <td> elements at all).
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace/newlines from every cell so the values
    # compare cleanly later (Borough != "Not assigned", merge on PostalCode).
    postalCodeList.append(cells[0].text.strip())
    boroughList.append(cells[1].text.strip())
    neighborhoodList.append(cells[2].text.strip())

#### Creating DataFrame

In [7]:
# Assemble the scraped columns into a single DataFrame
# (column order: PostalCode, Borough, Neighborhood).
tor_df = pd.DataFrame(
    dict(
        PostalCode=postalCodeList,
        Borough=boroughList,
        Neighborhood=neighborhoodList,
    )
)

In [8]:
# preview the first rows of the scraped table (head() defaults to five rows)
tor_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Dropping rows whose borough is 'Not assigned'

In [9]:
# Keep only the rows whose Borough has been assigned, then renumber the index from 0.
tor_df_dropna = (
    tor_df
    .loc[tor_df["Borough"].ne("Not assigned")]
    .reset_index(drop=True)
)
tor_df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


#### Grouping rows that share a postal code

In [10]:
# One row per (PostalCode, Borough) pair: the neighborhoods that share a
# postal code are combined into a single comma-separated string.
tor_df_grouped = (
    tor_df_dropna
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg({"Neighborhood": ", ".join})
)
tor_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Replacing a 'Not assigned' neighborhood with its borough name

In [11]:
# When Neighborhood is "Not assigned", use the Borough name instead.
# Rows yielded by iterrows() are not reliable views of the DataFrame, so
# assigning to them may leave the frame unchanged; a boolean mask with .loc
# writes to the frame itself.
neighborhood_missing = tor_df_grouped["Neighborhood"] == "Not assigned"
tor_df_grouped.loc[neighborhood_missing, "Neighborhood"] = tor_df_grouped.loc[neighborhood_missing, "Borough"]

tor_df_grouped.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Dataframe shape

In [12]:
# display the (number of rows, number of columns) of the grouped table
tor_df_grouped.shape

(103, 3)

### Part 2 

#### Load Coordinates

In [13]:
# Read the postal code -> latitude/longitude lookup table from the CSV file
# (path is relative to the current working directory).
coordinates = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Rename "Postal Code" to "PostalCode" so the key column has the same name
# as in the scraped table.
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge two tables together

In [15]:
# Attach latitude/longitude to every postal code. A left join keeps all rows
# of the grouped table even when a postal code has no coordinates entry.
tor_df_new = pd.merge(
    tor_df_grouped,
    coordinates,
    how="left",
    on="PostalCode",
)
tor_df_new.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Part 3

#### Create Dataframe with only Toronto data

In [16]:
# Restrict the table to boroughs whose name contains the word "Toronto",
# then renumber the index from 0.
tor_df_new = (
    tor_df_new
    .loc[lambda frame: frame['Borough'].str.contains('Toronto')]
    .reset_index(drop=True)
)
print(tor_df_new.shape)
tor_df_new.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### Construct Toronto Map

In [17]:
# Use the geopy library to get the latitude and longitude values of Toronto.
# Nominatim is already imported in the notebook's first cell, so it is not
# re-imported here.

address = 'Toronto'
# Nominatim expects an identifying user agent: calling Nominatim() with no
# arguments triggers a deprecation warning in older geopy releases and is
# rejected by newer ones.
geolocator = Nominatim(user_agent="toronto_neighborhood_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} is {}, {}.'.format(address, latitude, longitude))

  """


The geograpical coordinate of Toronto is 43.653963, -79.387207.


#### Create map of Toronto using latitude and longitude

In [19]:
# Build a folium map centred on Toronto and mark every postal-code area on it.
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row; the popup shows "<neighborhood>, <borough>".
for place in tor_df_new[['Latitude', 'Longitude', 'Borough', 'Neighborhood']].itertuples(index=False):
    popup = folium.Popup(
        '{}, {}'.format(place.Neighborhood, place.Borough),
        parse_html=True,
    )
    marker = folium.CircleMarker(
        [place.Latitude, place.Longitude],
        radius=5,
        popup=popup,
        color='orange',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_tor)

# Last expression of the cell: renders the interactive map.
map_tor