# IBM Professional Certificate in Data Science (Capstone)

This notebook is focused on the capstone project of the IBM Professional Certificate in Data Science

### Import libraries

In [131]:
import pandas as pd  # Dataframes
import numpy as np  # Efficient arrays
import warnings  # Filter warnings
import folium  # maps
from geopy.geocoders import Nominatim  # Address --> lat & long
import requests  # API requests
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

# Filter out all annoying warnings
warnings.filterwarnings("ignore")

In [2]:
# Required print statement

print("Hello, Capstone Project Course!")

Hello, Capstone Project Course!


### Week 3 part 1: scrape Wikipedia into a dataframe

In [5]:
# Collect all tables from the page

wiki_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [32]:
# Select the first table as the dataframe

df = wiki_data[0]

In [33]:
# View the dataframe

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [34]:
# Drop all rows with no Borough

df = df[df["Borough"] != "Not assigned"]

In [35]:
# If a postcode has no Neighbourhood, assign the borough as the neighbourhood

df["Neighbourhood"].replace("Not assigned", df["Borough"], inplace=True)

In [36]:
# Groupby postcodes, and convert neighbourhood to a list

df = df.groupby(by=["Postcode", "Borough"])["Neighbourhood"].apply(list).reset_index()

In [37]:
# Combine the lists into strings

df["Neighbourhood"] = df["Neighbourhood"].apply(lambda x: ", ".join(x))

In [38]:
# View the dataframe

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [39]:
# Print out the shape of the current dataframe

df.shape

(103, 3)

### Week 3 part 2: attach location information 

In [40]:
# Load in the dataframe

geo_df = pd.read_csv("Geospatial_Coordinates.csv")

In [41]:
# View the geo dataframe

geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [42]:
# Rename the geo postcode column to match the borough df

geo_df.rename(columns={"Postal Code":"Postcode"}, inplace=True)

In [44]:
# Join the dataframes together

df = df.merge(geo_df, on="Postcode")

### Week 3 part 3: Plot the boroughs 

In [48]:
# Get the lat & long for Toronto

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode("Toronto, Ontario")

In [52]:
# Create a map

tor_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(tor_map)  
    
tor_map

### Week 3 part 4: Clustering postcodes

I wanted to cluster postcodes (not neighbourhoods or boroughs) by the types of venues found there.

In [55]:
# Create 4square credentials

CLIENT_ID = '4GLLROGJFFVYQ5CALGAYFYBLAN5Y50SROVY4ULAKVBPJSYXJ' # your Foursquare ID
CLIENT_SECRET = 'N2GZM4QEABD4IFWWT0WGPUE2DL45NAX5VU5ED0PXMGB4T4SM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

In [84]:
# Function to extract venues for each postcode

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [85]:
# Get venues for all postcodes

venues = getNearbyVenues(names=df['Postcode'],
                         latitudes=df['Latitude'],
                         longitudes=df['Longitude'])

In [151]:
# one-hot encode venue information

# one hot encoding
ven_df = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add postcode column back to dataframe
ven_df['Postcode'] = venues['Postcode']

In [152]:
# Groupby postcodes

ven_df = ven_df.groupby('Postcode').mean().reset_index()

In [154]:
# Add location data onto ven_df

ven_df = ven_df.merge(df[["Postcode", "Latitude", "Longitude"]], on="Postcode")

Once the venues df has been created and encoded appropriately, we can cluster the different postcodes.

In [153]:
# set number of clusters
kclusters = 7

clusters = ven_df.drop(['Postcode'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clusters)

# Add cluster labels onto the venue df
ven_df["cluster"] = kmeans.labels_

In [171]:
# List the most popular categories per cluster.

cluster_df = ven_df.groupby("cluster").mean()

for row in cluster_df.iterrows():
    cats = [x for x in row[1][:-2] if x > 0]
    print cats
    break

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(cats)? (<ipython-input-171-29ae53e54c0e>, line 7)

Display the postcode markers on a map, coloured by clusters.

In [155]:
# create map
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ven_df['Latitude'], ven_df['Longitude'], ven_df['Postcode'], ven_df['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters