### Web Scraping using pandas
#### I used pandas to read wiki html page as list

In [2]:
import pandas as pd
from urllib.request import urlopen

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = urlopen(url)
df_list = pd.read_html(r) # this parses all the tables in webpages to a list
df = df_list[0] #index 0 since it is the first table in the wiki page

### Data wrangling
#### Drop the unassigned borough
#### Replace / by commas

In [3]:
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df['Neighborhood'].replace(' / ',',',regex=True,inplace=True)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park,Ontario Provincial Government"


In [4]:
df.shape

(103, 3)

### Import csv Geospatial_coordinates to fetch latitude and longtitude of postal codes

In [5]:
df_coor = pd.read_csv('http://cocl.us/Geospatial_data')
df_coor.head()

#The below code was to fetch coordinates from geocoder but i rather prefer importing csv file
#_____________________________________________________________________________________________
#import geocoder as gc # import geocoder
#lat=[]
#long=[]
#for pc in df['Postal code']:
#    # initialize your variable to None
#    lat_lng_coords = None
#    # loop until you get the coordinates
#    while(lat_lng_coords is None):
#      g = gc.google('{}, Toronto, Ontario'.format(pc))
#      lat_lng_coords = g.latlng
#    lat.append(lat_lng_coords[0])
#    long.append(lat_lng_coords[1])
#df['Latitude'] =lat
#df['Longtitude'] =long   

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Join first dataframe with coordinates dataframe where postal codes are equal

In [6]:
# Rename columns so they are equal in order to apply join
df_coor.rename(columns={'Postal Code':'Postal code'}, inplace = True)
df_joined = pd.merge(df,df_coor)
df_joined.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park,Ontario Provincial Government",43.662301,-79.389494


### Visualize the dataframe
#### Install and Import Foluim for visulazing maps

In [7]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium 

print('Folium installed')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                       

#### Fetch latitude and longtitude of toronto

In [10]:
from  geopy.geocoders import Nominatim
geolocator = Nominatim()
city ="Toronto"
country ="Canada"
loc = geolocator.geocode(city+','+ country)
longitude = loc.longitude
latitude = loc.latitude
print("latitude is :-" ,loc.latitude,"\nlongtitude is:-" ,loc.longitude)

latitude is :- 43.6534817 
longtitude is:- -79.3839347


#### checking unique Borough values in dataframe

In [12]:
df_joined['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

#### colorize postal codes on map accroding to their unique  borough. each borough has a unique color pf postal codes

In [14]:
def colorFunc(b):
    if b == 'Mississauga':
        return 'black'
    elif b == 'York':
        return 'orange'
    elif b in 'Etobicoke':
        return 'green'
    elif b in 'Scarborough':
        return 'grey'
    elif b in'East York':
        return 'red'
    elif b in 'North York':
        return 'yellow'
    elif 'Toronto' in b:
        return 'blue'
    
Map1 = folium.Map(location=[latitude, longitude], zoom_start=10) # generate map centred to Toronto City
# add circle markers
for lat, lng, label,b in zip(df_joined['Latitude'], df_joined['Longitude'], df_joined['Postal code'],df_joined['Borough']):
        folium.features.CircleMarker(
            [lat, lng],
            radius=5,
            poup=label,
            fill=True,
            color=colorFunc(b),
            fill_color=colorFunc(b),
            fill_opacity=0.6).add_to(Map1)
Map1

# Part 3
## Analyzing and Clustering Toronto Neighborhoods
### First we should find unique neighborhoods in Toronto district and find their coordinates

In [44]:
#method to flatten a 2 dimensinal list to a 1 dimensinal list
def flatten(l):
    flatList = []
    for elem in l:
        # if an element of a list is a list
        # iterate over this list and add elements to flatList 
        if type(elem) == list:
            for e in elem:
                flatList.append(e)
        else:
            flatList.append(elem)
    return flatList

#From postal codes dataframe we return only postal codes found in toronta and their neghborhoods
toronto = df_joined[df_joined['Borough'].str.contains('Toronto')]
unique_neighborhoods = []

#split the comma seperated neighborhoods in each borough and insert them in a list
for i in toronto['Neighborhood']:
    unique_neighborhoods.append(i.split(','))  
# convert to a unique one dimensinal list
unique_neighborhoods = flatten(unique_neighborhoods)
unique_neighborhoods = list(set(unique_neighborhoods))
len(unique_neighborhoods)
#we have 74 unique neighborhoods in Toronto to be analyzed

74

### Now create a dataframe with all neighborhoods in toronto and their coordinates
##### We are looping through the unique neighborhoods and using geolocator to fetch latitude and longitude for every neighborhood

In [94]:
#create a dataframe with unique neighborhoods of toronto
toronto_neighborhoods = pd.DataFrame(unique_neighborhoods)
toronto_neighborhoods.rename(columns={0:'Neighborhoods'}, inplace=True)

# add coordinates of each neighborhood, we will use the geolocator library
city ="Toronto"
country ="Canada"
long=[]
lat=[]
for i in toronto_neighborhoods['Neighborhoods']:
    loc = geolocator.geocode(i+', '+city+', '+ country)
    if loc is None:
        long.append("nan")
        lat.append("nan")
        continue
    long.append(loc.longitude)
    lat.append(loc.latitude)
    
toronto_neighborhoods['Latitude'] = lat
toronto_neighborhoods['Longitude'] = long
toronto_neighborhoods = toronto_neighborhoods[toronto_neighborhoods['Latitude'] != 'nan']
toronto_neighborhoods.head()


Unnamed: 0,Neighborhoods,Latitude,Longitude
0,Christie,43.6641,-79.4184
1,Brockton,43.6509,-79.44
2,Dufferin,43.6602,-79.4357
3,Forest Hill SE,43.6936,-79.4139
5,Rosedale,43.6784,-79.3807


### Now create a map marking the neighborhoods of toronto

In [93]:

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10) # generate map centred to Toronto City

# add markers marking neighborhoods in toronto
for lat, lng, label in zip(toronto_neighborhoods['Latitude'], toronto_neighborhoods['Longitude'], toronto_neighborhoods['Neighborhoods']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Explore every neighborhood
#### First we create a function that fetches venues from each neighbohood in Toronto using Foursquare location data provider

In [100]:
#Define firts foursquare API calls credentials
CLIENT_ID = 'ZQYZUVJNNZNSIDV34CKP5Z34NGU2RI12TJDRGGF13LLI25TM'
CLIENT_SECRET = 'FCNT3KN50H454BNCDKR0EPMRPL2PXZVWBBGF1BTGPW0LVIM3'
VERSION = '20180605' 
#import Foursquare required libraries
import requests # library to handle requests


def getNearbyVenues(names, latitudes, longitudes, radius=500,LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [101]:
toronto_venues = getNearbyVenues(names=toronto_neighborhoods['Neighborhoods'],
                                   latitudes=toronto_neighborhoods['Latitude'],
                                   longitudes=toronto_neighborhoods['Longitude']
                                  )
print(toronto_venues.shape)
toronto_venues.head()

Christie
Brockton
Dufferin
Forest Hill SE
Rosedale
Davisville North
Church and Wellesley
Chinatown
Lawrence Park
Adelaide
Studio District
Kensington Market
Moore Park
Little Portugal
King
Harbourfront
Exhibition Place
King and Spadina
Roncesvalles
Union Station
St. James Town
The Annex
Riverdale
Design Exchange
Parkdale Village
Toronto Dominion Centre
High Park
Yorkville
Runnymede
Toronto Islands
Ryerson
Harbourfront West
Bathurst Quay
Trinity
Swansea
North Midtown
Deer Park
Roselawn
The Junction South
Harbord
Richmond
Summerhill West
First Canadian Place
Underground city
Cabbagetown
University of Toronto
Rathnelly
Davisville
Queen's Park
The Beaches West
North Toronto West
Regent Park
Commerce Court
Central Bay Street
Dovercourt Village
Victoria Hotel
Garden District
Forest Hill North & West
South Hill
South Niagara
Harbourfront East
CN Tower
Parkdale
The Beaches
Grange Park
Berczy Park
The Danforth West
India Bazaar
Summerhill East
(3539, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Christie,43.664111,-79.418405,Christie Pits Park,43.664177,-79.420466,Park
1,Christie,43.664111,-79.418405,Tacos El Asador,43.663751,-79.416954,Taco Place
2,Christie,43.664111,-79.418405,The Good Neighbour,43.663983,-79.41664,Coffee Shop
3,Christie,43.664111,-79.418405,Hodo Kwaja 호도과자,43.66424,-79.415579,Dessert Shop
4,Christie,43.664111,-79.418405,Sunrise House Korean Restaurant 해뜨는집,43.664183,-79.415746,Korean Restaurant


### Find how many unique categories of venues are found in toronto neighborhoods

In [107]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 289 uniques categories.


### Analyse each neighborhood

In [110]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,African Restaurant,Airport,Airport Service,American Restaurant,Animal Shelter,Antique Shop,Aquarium,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
1,Bathurst Quay,0.0,0.0,0.0,0.041667,0.041667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0
4,CN Tower,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0


### Return a dataframe that has the most common 10 veneus in every neighborhod

In [195]:
import numpy as np

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Restaurant,Gym,American Restaurant,Gastropub,Japanese Restaurant,Cosmetics Shop,Seafood Restaurant,Bookstore
1,Bathurst Quay,Coffee Shop,Café,Park,Sushi Restaurant,Bank,Garden,Sculpture Garden,Grocery Store,Gym,Ramen Restaurant
2,Berczy Park,Coffee Shop,Café,Restaurant,Japanese Restaurant,Hotel,Italian Restaurant,Bakery,Cocktail Bar,Beer Bar,Seafood Restaurant
3,Brockton,Pizza Place,Bar,Vietnamese Restaurant,Park,French Restaurant,Dive Bar,Boutique,Café,Gastropub,Korean Restaurant
4,CN Tower,Hotel,Coffee Shop,Pizza Place,Scenic Lookout,Baseball Stadium,Gym,Ice Cream Shop,Aquarium,Monument / Landmark,Italian Restaurant


### Clustering Neighborhoods using k-means

In [196]:
#import kmeans library sklearn
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 4
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0,n_init = 12).fit(toronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_neighborhoods

toronto_merged.rename(columns={'Neighborhoods':'Neighborhood'},inplace=True)
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.merge(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(20) # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Christie,43.6641,-79.4184,1,Korean Restaurant,Coffee Shop,Japanese Restaurant,Grocery Store,Café,Sandwich Place,Pub,Cocktail Bar,Indian Restaurant,Ice Cream Shop
1,Brockton,43.6509,-79.44,2,Pizza Place,Bar,Vietnamese Restaurant,Park,French Restaurant,Dive Bar,Boutique,Café,Gastropub,Korean Restaurant
2,Dufferin,43.6602,-79.4357,2,Bar,Bakery,Coffee Shop,Café,Thrift / Vintage Store,Sandwich Place,Beer Store,Restaurant,Cocktail Bar,Mexican Restaurant
3,Forest Hill SE,43.6936,-79.4139,3,Park,Playground,Bank,Arts & Crafts Store,Farmers Market,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
4,Rosedale,43.6784,-79.3807,3,Park,Playground,Bike Trail,Farmers Market,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
5,Davisville North,43.6979,-79.3973,1,Italian Restaurant,Sushi Restaurant,Coffee Shop,Trail,Furniture / Home Store,Pub,Convenience Store,Pharmacy,Sporting Goods Shop,Café
6,Church and Wellesley,43.6655,-79.3838,1,Sushi Restaurant,Coffee Shop,Japanese Restaurant,Grocery Store,Restaurant,Yoga Studio,Hotel,Café,Mediterranean Restaurant,Gastropub
7,Chinatown,43.6529,-79.398,2,Café,Dessert Shop,Coffee Shop,Mexican Restaurant,Vegetarian / Vegan Restaurant,Clothing Store,Bakery,Bar,Vietnamese Restaurant,Cheese Shop
8,Lawrence Park,43.7292,-79.4033,1,Sushi Restaurant,Bakery,Coffee Shop,Italian Restaurant,Bank,Fast Food Restaurant,Asian Restaurant,Pub,Deli / Bodega,Clothing Store
9,Adelaide,43.6505,-79.3795,1,Coffee Shop,Café,Restaurant,Gym,American Restaurant,Gastropub,Japanese Restaurant,Cosmetics Shop,Seafood Restaurant,Bookstore


### Visualize clusters

In [197]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters