# Coursera Data Science Capstone
<hr/>

## 1: Import packages

In [1]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

## 2: Data collection

### (a) Get table of Edinburgh postcodes

In [2]:
# Download the Wikipedia article that lists the EH postcode districts.
res = requests.get("https://en.wikipedia.org/wiki/EH_postcode_area", timeout=30)
res.raise_for_status()  # stop here instead of parsing an HTTP error page
soup = BeautifulSoup(res.content,'lxml')

# The district table is taken by position: the second <table> on the page.
# NOTE(review): this breaks silently if the article layout changes - confirm
# the columns shown below are still the expected ones after a re-run.
table = soup.find_all('table')[1]
postcode_tables = pd.read_html(str(table))

# JSON round-trip kept from the original notebook.
# NOTE(review): presumably only there to normalise the parsed table - confirm
# whether it is still needed.
postcodes_json = postcode_tables[0].to_json(orient='records')
postcodes = pd.read_json(postcodes_json)
postcodes

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area(s)
0,EH1,EDINBURGH,Mostly consists of Edinburgh's Old Town. Also ...,
1,EH2,EDINBURGH,The New Town and central commercial area of Ed...,
2,EH3,EDINBURGH,An odd shaped area surrounding EH1 and EH2 to ...,
3,EH4,EDINBURGH,Radiates from the older and more central areas...,
4,EH5,EDINBURGH,Based on a village formerly separate from the ...,
5,EH6,EDINBURGH,"Covers Leith, as well as Newhaven bordering it...",
6,EH7,EDINBURGH,The inner city area between central Edinburgh ...,
7,EH8,EDINBURGH,"The inner city Southside, Newington and Canong...",
8,EH9,EDINBURGH,"The inner city, Marchmont and Grange, Blackfor...",
9,EH10,EDINBURGH,"A corridor along the A702 from Bruntsfield, th...","City of Edinburgh, Midlothian"


### (b) Use Postcodes.io API to get postcode latitude and longitude

#### Define function to get series of locations for a series of postcode outcodes

In [3]:
def getOutcodesLocation(outcodes):
    """Look up every outcode on the Postcodes.io API.

    Parameters
    ----------
    outcodes : iterable of str
        Postcode outcodes (e.g. ``'EH1'``) to look up, one request each.

    Returns
    -------
    list
        The ``result`` field of each response, in input order. An entry is
        ``None`` when the API answers 404 or sends no ``result`` field, so the
        list always has one entry per input and can be assigned as a column.

    Raises
    ------
    requests.HTTPError
        For any HTTP error status other than 404.
    requests.Timeout
        If a request takes longer than 10 seconds.
    """
    resultlist = []
    for outcode in outcodes:
        url = 'https://api.postcodes.io/outcodes/{}'.format(outcode)
        response = requests.get(url, timeout=10)
        if response.status_code == 404:
            # Unknown outcode: keep a placeholder so positions stay aligned.
            resultlist.append(None)
            continue
        response.raise_for_status()
        resultlist.append(response.json().get('result'))
    return resultlist

#### Add column with outcode API result

In [4]:
# Query Postcodes.io once per district and keep the raw API result
# alongside the district it belongs to.
outcode_results = getOutcodesLocation(postcodes['Postcode district'])
postcodes['Result'] = outcode_results

### (c) Clean dataframe of postcodes and locations

In [5]:
# Split the 'Result' column (one API result per row) into separate columns
result_columns = postcodes['Result'].apply(pd.Series)
postcodes = pd.concat([postcodes.drop(['Result'], axis=1), result_columns], axis=1)

# Reformat column labels to snake_case. regex=False is passed explicitly so
# that '(' and ')' are always treated as literal characters, whatever the
# default of the installed pandas version is.
postcodes.columns = (
    postcodes.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Drop unnecessary columns
postcodes = postcodes.drop(['local_authority_areas', 'post_town', 'outcode', 'admin_district', 'parish', 'admin_county', 'admin_ward', 'country', 'northings', 'eastings'], axis = 1)

# Drop special postcodes. na=False keeps rows whose coverage text is missing
# instead of failing when the boolean mask is inverted.
postcodes = postcodes[~postcodes.coverage.str.contains("Special postcode", na=False)]

# Show postcodes
postcodes

Unnamed: 0,postcode_district,coverage,longitude,latitude
0,EH1,Mostly consists of Edinburgh's Old Town. Also ...,-3.193068,55.950601
1,EH2,The New Town and central commercial area of Ed...,-3.202204,55.952278
2,EH3,An odd shaped area surrounding EH1 and EH2 to ...,-3.206313,55.952185
3,EH4,Radiates from the older and more central areas...,-3.260869,55.962427
4,EH5,Based on a village formerly separate from the ...,-3.222157,55.975985
5,EH6,"Covers Leith, as well as Newhaven bordering it...",-3.175356,55.97244
6,EH7,The inner city area between central Edinburgh ...,-3.164642,55.960225
7,EH8,"The inner city Southside, Newington and Canong...",-3.164188,55.948708
8,EH9,"The inner city, Marchmont and Grange, Blackfor...",-3.185364,55.933011
9,EH10,"A corridor along the A702 from Bruntsfield, th...",-3.211108,55.919823


### (d) Get Venues from Foursquare

#### Define Foursquare credentials

In [6]:
# Credentials are read from the environment so they never have to be typed
# into (or saved with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; both fall back to ''.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether each credential is present: printing the values would
# store them in the notebook's saved output.
print('Your credentials:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'MISSING'))
print('CLIENT_SECRET:' + ('set' if CLIENT_SECRET else 'MISSING'))

Your credentials:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


#### Define function to get nearby venues from Foursquare

In [7]:
def getNearbyVenues(outcodes, latitudes, longitudes, radius = 1000, limit = 100):
    """Fetch venues around each postcode centre from the Foursquare explore API.

    Uses the module-level CLIENT_ID, CLIENT_SECRET and VERSION, and prints each
    outcode as it is processed.

    Parameters
    ----------
    outcodes, latitudes, longitudes : iterables of equal length
        Postcode district label and the coordinates to search around.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter. (Previously this argument
        was ignored in favour of a hard-coded 100.)

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns postcode_district,
        postcode_latitude, postcode_longitude, venue, venue_latitude,
        venue_longitude and venue_category. The frame has these columns even
        when no venue is returned. venue_category is None for a venue that
        carries no category.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    requests.Timeout
        If a request takes longer than 30 seconds.
    """
    column_names = ['postcode_district',
                    'postcode_latitude',
                    'postcode_longitude',
                    'venue',
                    'venue_latitude',
                    'venue_longitude',
                    'venue_category']
    venue_rows = []

    for outcode, lat, lng in zip(outcodes, latitudes, longitudes):
        print('{}'.format(outcode))

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            venue_categories = venue.get('categories') or []
            category_name = venue_categories[0]['name'] if venue_categories else None
            venue_rows.append((
                outcode,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the column names to the constructor keeps the schema intact even
    # when there are no rows at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return nearby_venues

#### Create Dataframe of Edinburgh venues

In [8]:
# Search around every remaining postcode district centre using the
# function's default radius and limit.
edinburgh_venues = getNearbyVenues(
    postcodes['postcode_district'],
    postcodes['latitude'],
    postcodes['longitude'],
)

EH1
EH2
EH3
EH4
EH5
EH6
EH7
EH8
EH9
EH10
EH11
EH12
EH13
EH14
EH15
EH16
EH17
EH28
EH29
EH30


In [9]:
# Display the venues table returned by getNearbyVenues.
edinburgh_venues

Unnamed: 0,postcode_district,postcode_latitude,postcode_longitude,venue,venue_latitude,venue_longitude,venue_category
0,EH1,55.950601,-3.193068,Old Town,55.949500,-3.192805,Neighborhood
1,EH1,55.950601,-3.193068,The Milkman,55.950650,-3.191010,Coffee Shop
2,EH1,55.950601,-3.193068,The Devil's Advocate,55.950309,-3.191643,Cocktail Bar
3,EH1,55.950601,-3.193068,The Scott Monument,55.952394,-3.193269,Monument / Landmark
4,EH1,55.950601,-3.193068,East Princes Street Gardens,55.951858,-3.194350,Park
...,...,...,...,...,...,...,...
780,EH30,55.985459,-3.392707,The Little Bakery,55.990988,-3.397183,Café
781,EH30,55.985459,-3.392707,South Queensferry Harbour,55.991136,-3.388842,Harbor / Marina
782,EH30,55.985459,-3.392707,43 Bus Stop Police Station,55.990845,-3.398845,Bus Stop
783,EH30,55.985459,-3.392707,Forth Boat Cruise,55.991053,-3.385704,Boat or Ferry


## 3: Explore data

### (a) Number of results per area

In [10]:
# Number of venue rows returned for each postcode district.
venue_counts_by_district = edinburgh_venues.groupby('postcode_district').count()
venue_counts_by_district['venue']

postcode_district
EH1     100
EH10     28
EH11     33
EH12     36
EH13     10
EH14      6
EH15     26
EH16     20
EH17     14
EH2     100
EH28      4
EH29      5
EH3     100
EH30     31
EH4       5
EH5      20
EH6      98
EH7      60
EH8      40
EH9      49
Name: venue, dtype: int64

### (b) Venue category

#### Number of unique venue categories

In [11]:
# Count the distinct category names across all returned venues.
unique_categories = edinburgh_venues['venue_category'].unique()
unique_category_count = len(unique_categories)
print('There are {} unique venue categories.'.format(unique_category_count))

There are 155 unique venue categories.


#### Unique venue category by postcode

In [12]:
# Collapse to one row per (district, category) pair, then count how many
# such pairs each district has.
district_category_pairs = edinburgh_venues.groupby(['postcode_district', 'venue_category']).count()
categories_per_district = district_category_pairs.groupby('postcode_district').count()
categories_per_district['venue']

postcode_district
EH1     53
EH10    19
EH11    24
EH12    24
EH13     9
EH14     6
EH15    24
EH16    15
EH17     8
EH2     61
EH28     4
EH29     5
EH3     56
EH30    19
EH4      5
EH5     16
EH6     51
EH7     29
EH8     28
EH9     27
Name: venue, dtype: int64

## 4a: Cluster Neighbourhoods

### (a) Gather venue category relative frequency

In [13]:
# One indicator column per venue category, named after the category itself
# (empty prefix and separator).
edinburgh_onehot = pd.get_dummies(edinburgh_venues[['venue_category']], prefix="", prefix_sep="")

# Put the district label in front of the indicator columns so each row can
# be traced back to its postcode district.
edinburgh_onehot.insert(0, 'postcode_district', edinburgh_venues['postcode_district'])

edinburgh_onehot

Unnamed: 0,postcode_district,African Restaurant,American Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Garage,...,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Zoo,Zoo Exhibit
0,EH1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,EH1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,EH1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,EH1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,EH1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,EH30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
781,EH30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
782,EH30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
783,EH30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# The mean of each indicator column within a district is the relative
# frequency of that category among the district's venues.
category_frequencies = edinburgh_onehot.groupby('postcode_district').mean()
edinburgh_grouped = category_frequencies.reset_index()
edinburgh_grouped

Unnamed: 0,postcode_district,African Restaurant,American Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Garage,...,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Zoo,Zoo Exhibit
0,EH1,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0
1,EH10,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0
2,EH11,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.030303,...,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EH12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.138889
4,EH13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,EH14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,EH15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,...,0.0,0.0,0.0,0.038462,0.038462,0.0,0.0,0.0,0.0,0.0
7,EH16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,EH17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,EH2,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0


### (b) k-Means clustering

In [15]:
# set number of clusters
kclusters = 5

# Features only: the district label is not an input to the clustering.
# (columns= replaces the positional axis argument, which newer pandas rejects.)
edinburgh_grouped_clustering = edinburgh_grouped.drop(columns='postcode_district')

# run k-means clustering; n_init is pinned so the result does not depend on
# the default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(edinburgh_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([1, 1, 2, 1, 4, 3, 1, 4, 2, 1])

### (c) Show most common venues

#### Function to return the most common venues

In [16]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped (callers pass rows whose first
    entry is the postcode district). The remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#### Get most common venues in each postcode

In [17]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return rank with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st)."""
    if rank % 100 in (11, 12, 13):
        suffix = 'th'
    elif 1 <= rank % 10 <= 3:
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    return '{}{}'.format(rank, suffix)


# create columns according to number of top venues
columns = ['postcode_district']
for rank in range(1, num_top_venues + 1):
    columns.append('{} Most Common Venue'.format(_ordinal(rank)))

# create a new dataframe with one row per postcode district
edinburgh_common_venues = pd.DataFrame(columns=columns)
edinburgh_common_venues['postcode_district'] = edinburgh_grouped['postcode_district']

# fill each row with that district's top categories, most frequent first
for ind in range(edinburgh_grouped.shape[0]):
    edinburgh_common_venues.iloc[ind, 1:] = return_most_common_venues(edinburgh_grouped.iloc[ind, :], num_top_venues)

# add clustering labels; kmeans was fitted on edinburgh_grouped, so the labels
# are in the same row order
edinburgh_common_venues.insert(0, 'cluster_labels', np.array(kmeans.labels_, dtype='int'))

edinburgh_common_venues

Unnamed: 0,cluster_labels,postcode_district,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,EH1,Pub,Hotel,Bar,Café,Cocktail Bar,Coffee Shop,Beer Bar,Indian Restaurant,Pizza Place,Plaza
1,1,EH10,Café,Park,Wine Shop,Fish & Chips Shop,Pub,Pizza Place,Pool Hall,Cheese Shop,Coffee Shop,Cosmetics Shop
2,2,EH11,Supermarket,Grocery Store,Fast Food Restaurant,Gym,Café,Bus Stop,Furniture / Home Store,Soccer Field,Park,Gas Station
3,1,EH12,Zoo Exhibit,Grocery Store,Coffee Shop,Chinese Restaurant,Pub,Bus Stop,Electronics Store,Shopping Plaza,Scenic Lookout,Sandwich Place
4,4,EH13,Coffee Shop,Rest Area,Hotel,Home Service,Supermarket,Locksmith,Forest,Beer Garden,Fast Food Restaurant,Fish & Chips Shop
5,3,EH14,Discount Store,Supermarket,Golf Course,Multiplex,Train Station,Bakery,Zoo Exhibit,Food Court,Garden,Furniture / Home Store
6,1,EH15,Park,Diner,Bakery,Furniture / Home Store,Gastropub,Pool,Fish & Chips Shop,Fast Food Restaurant,Grocery Store,Supermarket
7,4,EH16,Park,Fast Food Restaurant,Supermarket,Coffee Shop,Hotel,Bookstore,Castle,Pharmacy,Chocolate Shop,Grocery Store
8,2,EH17,Supermarket,Park,Fast Food Restaurant,Grocery Store,Pizza Place,Indian Restaurant,Hotel,Gym / Fitness Center,Electronics Store,English Restaurant
9,1,EH2,Bar,Cocktail Bar,Café,Coffee Shop,Pub,Restaurant,French Restaurant,Scenic Lookout,Sushi Restaurant,Farmers Market


### (d) Merge common venues data with postcode data

In [18]:
# Index the ranked venues by district so they can be joined onto the
# postcode table, which carries the latitude/longitude of each district.
venues_by_district = edinburgh_common_venues.set_index('postcode_district')
edinburgh_merged = postcodes.join(venues_by_district, on='postcode_district')

# Store the cluster label in the smallest integer dtype that fits.
edinburgh_merged['cluster_labels'] = pd.to_numeric(edinburgh_merged['cluster_labels'], downcast='integer')

# Show dataframe
edinburgh_merged

Unnamed: 0,postcode_district,coverage,longitude,latitude,cluster_labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,EH1,Mostly consists of Edinburgh's Old Town. Also ...,-3.193068,55.950601,1,Pub,Hotel,Bar,Café,Cocktail Bar,Coffee Shop,Beer Bar,Indian Restaurant,Pizza Place,Plaza
1,EH2,The New Town and central commercial area of Ed...,-3.202204,55.952278,1,Bar,Cocktail Bar,Café,Coffee Shop,Pub,Restaurant,French Restaurant,Scenic Lookout,Sushi Restaurant,Farmers Market
2,EH3,An odd shaped area surrounding EH1 and EH2 to ...,-3.206313,55.952185,1,Bar,Cocktail Bar,Coffee Shop,Park,Hotel,French Restaurant,Café,Italian Restaurant,Steakhouse,Farmers Market
3,EH4,Radiates from the older and more central areas...,-3.260869,55.962427,0,Convenience Store,Pub,Grocery Store,Café,Bakery,Zoo Exhibit,Food Court,Garden,Furniture / Home Store,French Restaurant
4,EH5,Based on a village formerly separate from the ...,-3.222157,55.975985,2,Harbor / Marina,Rugby Pitch,Supermarket,Indian Restaurant,Fast Food Restaurant,Park,Café,Soccer Field,Outdoor Supply Store,Grocery Store
5,EH6,"Covers Leith, as well as Newhaven bordering it...",-3.175356,55.97244,1,Bar,Pub,Hotel,Café,Italian Restaurant,Grocery Store,Seafood Restaurant,Pharmacy,Pizza Place,Coffee Shop
6,EH7,The inner city area between central Edinburgh ...,-3.164642,55.960225,1,Café,Bar,Grocery Store,Bakery,Hotel,Pub,Italian Restaurant,Park,Art Gallery,Supermarket
7,EH8,"The inner city Southside, Newington and Canong...",-3.164188,55.948708,1,Café,Hotel,Park,Scenic Lookout,Sandwich Place,Bar,History Museum,Music Venue,Chinese Restaurant,Science Museum
8,EH9,"The inner city, Marchmont and Grange, Blackfor...",-3.185364,55.933011,1,Pub,Bar,Coffee Shop,Bed & Breakfast,Café,Restaurant,Italian Restaurant,Bakery,Grocery Store,Pool
9,EH10,"A corridor along the A702 from Bruntsfield, th...",-3.211108,55.919823,1,Café,Park,Wine Shop,Fish & Chips Shop,Pub,Pizza Place,Pool Hall,Cheese Shop,Coffee Shop,Cosmetics Shop


### (e) Examine cluster sizes

In [19]:
# Number of postcode districts assigned to each cluster label.
edinburgh_merged['cluster_labels'].value_counts()

1    11
2     3
0     3
4     2
3     1
Name: cluster_labels, dtype: int64

### (f) Map clusters

In [20]:
address = 'Edinburgh, United Kingdom'

# Geocode the city name to get a centre point for the maps below.
geolocator = Nominatim(user_agent="ny_explorer")
edinburghlocation = geolocator.geocode(address)
if edinburghlocation is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError('Nominatim returned no match for address: {!r}'.format(address))
edinburghlatitude = edinburghlocation.latitude
edinburghlongitude = edinburghlocation.longitude
print('The geograpical coordinates of Edinburgh are {}, {}.'.format(edinburghlatitude, edinburghlongitude))

The geograpical coordinates of Edinburgh are 55.9533456, -3.1883749.


In [21]:
map_clusters = folium.Map(location=[edinburghlatitude, edinburghlongitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per postcode district, coloured by its cluster label
for lat, lon, poi, cluster in zip(edinburgh_merged['latitude'], edinburgh_merged['longitude'], edinburgh_merged['postcode_district'], edinburgh_merged['cluster_labels']):
    if pd.isna(cluster):
        # No label after the join (district missing from the clustered data).
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the colour list directly.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## 4b: Cluster on general category

### (a) Recode the categories to more general ones

In [22]:
# Folder holding the category mapping files. Defaults to the original
# author's location; set CAPSTONE_DATA_DIR to run the notebook elsewhere.
data_dir = os.environ.get(
    'CAPSTONE_DATA_DIR',
    r'C:\Users\Fuzzy P\Documents\Education and qualification\IBM - Data Science\9 - Capstone')

# Export the distinct venue categories found so far.
categories = pd.DataFrame({'categories': edinburgh_venues['venue_category'].unique()})
categories.to_csv(os.path.join(data_dir, 'categories.csv'), index = False)

# categories2.csv is read back with a 'categories' column to match on.
# NOTE(review): presumably a hand-edited copy of categories.csv with a
# 'general_categories' column added - confirm how it is produced.
general_categories = pd.read_csv(os.path.join(data_dir, 'categories2.csv'))

# Drop any mapping columns left by a previous run so the cell can be re-run
# without producing duplicated '_x'/'_y' columns.
# NOTE(review): this is an inner merge, so venues whose category is missing
# from categories2.csv are dropped silently - compare row counts before/after.
edinburgh_venues = (
    edinburgh_venues
    .drop(columns=['categories', 'general_categories'], errors='ignore')
    .merge(general_categories, left_on = 'venue_category', right_on = 'categories')
)
edinburgh_venues

Unnamed: 0,postcode_district,postcode_latitude,postcode_longitude,venue,venue_latitude,venue_longitude,venue_category,categories,general_categories
0,EH1,55.950601,-3.193068,Old Town,55.949500,-3.192805,Neighborhood,Neighborhood,Neighborhood
1,EH2,55.952278,-3.202204,Old Town,55.949500,-3.192805,Neighborhood,Neighborhood,Neighborhood
2,EH3,55.952185,-3.206313,Dean Village,55.952140,-3.217584,Neighborhood,Neighborhood,Neighborhood
3,EH1,55.950601,-3.193068,The Milkman,55.950650,-3.191010,Coffee Shop,Coffee Shop,Coffee Shop
4,EH1,55.950601,-3.193068,Wellington Coffee,55.953671,-3.197448,Coffee Shop,Coffee Shop,Coffee Shop
...,...,...,...,...,...,...,...,...,...
772,EH16,55.922467,-3.152970,Peffermill Playing Fields,55.930273,-3.156154,Athletics & Sports,Athletics & Sports,Sports field
773,EH16,55.922467,-3.152970,Game,55.926651,-3.164126,Video Game Store,Video Game Store,Highstreet shop
774,EH28,55.929531,-3.390593,Edinburgh International Climbing Arena,55.922033,-3.382739,Rock Climbing Spot,Rock Climbing Spot,Gym / Sports
775,EH30,55.985459,-3.392707,Forth Boat Cruise,55.991053,-3.385704,Boat or Ferry,Boat or Ferry,Harbor/Marina


### (b) Follow same process as in section 4a to cluster postcodes

In [23]:
# One indicator column per general category, named after the category
# itself (empty prefix and separator).
edinburgh_general_onehot = pd.get_dummies(edinburgh_venues[['general_categories']], prefix="", prefix_sep="")

# Put the district label in front of the indicator columns.
edinburgh_general_onehot.insert(0, 'postcode_district', edinburgh_venues['postcode_district'])

edinburgh_general_onehot

Unnamed: 0,postcode_district,Church,Cinema,Coffee Shop,Food shop,Grocery,Gym / Sports,Harbor/Marina,Highstreet shop,Hill,...,Restaurant,Scenic Lookout,Shopping Centre,Spa,Sports field,Takeaway,Transport,Venue,Water,Zoo
0,EH1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,EH2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,EH3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,EH1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,EH1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,EH16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
773,EH16,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
774,EH28,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
775,EH30,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# The mean of each indicator column within a district is the relative
# frequency of that general category among the district's venues.
general_category_frequencies = edinburgh_general_onehot.groupby('postcode_district').mean()
edinburgh_general_grouped = general_category_frequencies.reset_index()
edinburgh_general_grouped

Unnamed: 0,postcode_district,Church,Cinema,Coffee Shop,Food shop,Grocery,Gym / Sports,Harbor/Marina,Highstreet shop,Hill,...,Restaurant,Scenic Lookout,Shopping Centre,Spa,Sports field,Takeaway,Transport,Venue,Water,Zoo
0,EH1,0.010101,0.010101,0.151515,0.0,0.0,0.0,0.0,0.030303,0.010101,...,0.171717,0.010101,0.0,0.0,0.0,0.0,0.010101,0.020202,0.0,0.0
1,EH10,0.0,0.0,0.178571,0.035714,0.035714,0.071429,0.0,0.071429,0.035714,...,0.107143,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0
2,EH11,0.0,0.0,0.060606,0.030303,0.272727,0.121212,0.0,0.090909,0.0,...,0.090909,0.0,0.0,0.0,0.030303,0.060606,0.121212,0.030303,0.030303,0.0
3,EH12,0.0,0.0,0.111111,0.0,0.111111,0.027778,0.0,0.111111,0.0,...,0.111111,0.027778,0.027778,0.0,0.0,0.083333,0.055556,0.0,0.0,0.166667
4,EH13,0.0,0.0,0.222222,0.0,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0
5,EH14,0.0,0.166667,0.0,0.166667,0.166667,0.166667,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0
6,EH15,0.0,0.0,0.038462,0.076923,0.076923,0.038462,0.0,0.115385,0.0,...,0.115385,0.0,0.0,0.0,0.0,0.076923,0.115385,0.0,0.076923,0.0
7,EH16,0.0,0.0,0.1,0.05,0.15,0.0,0.0,0.15,0.0,...,0.05,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0
8,EH17,0.0,0.0,0.0,0.0,0.428571,0.071429,0.0,0.0,0.0,...,0.142857,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
9,EH2,0.0,0.010204,0.163265,0.020408,0.0,0.0,0.0,0.030612,0.0,...,0.244898,0.020408,0.0,0.0,0.0,0.010204,0.0,0.040816,0.0,0.0


In [25]:
# set number of clusters
kclusters = 5

# Features only: the district label is not an input to the clustering.
# (columns= replaces the positional axis argument, which newer pandas rejects.)
edinburgh_general_grouped_clustering = edinburgh_general_grouped.drop(columns='postcode_district')

# run k-means clustering; n_init is pinned so the result does not depend on
# the default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(edinburgh_general_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([1, 0, 3, 3, 0, 3, 3, 3, 2, 1])

In [26]:
# Rank the general categories of every postcode district, reusing the column
# labels (`columns`) and list length (`num_top_venues`) set up in section 4a.
edinburgh_venues_sorted_general = pd.DataFrame(columns=columns)
edinburgh_venues_sorted_general['postcode_district'] = edinburgh_general_grouped['postcode_district']

district_count = edinburgh_general_grouped.shape[0]
for row_number in range(district_count):
    district_row = edinburgh_general_grouped.iloc[row_number, :]
    edinburgh_venues_sorted_general.iloc[row_number, 1:] = return_most_common_venues(district_row, num_top_venues)

# Prepend the k-means label of each district as the first column.
general_cluster_ids = np.array(kmeans.labels_, dtype='int')
edinburgh_venues_sorted_general.insert(0, 'cluster_labels_general', general_cluster_ids)

edinburgh_venues_sorted_general

Unnamed: 0,cluster_labels_general,postcode_district,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,EH1,Pub / Bar,Restaurant,Coffee Shop,Hotel,Monument / Landmark,Park,Museum / Gallery,Highstreet shop,Venue,Neighborhood
1,0,EH10,Coffee Shop,Park,Restaurant,Pub / Bar,Highstreet shop,Off licence,Takeaway,Gym / Sports,Hill,Pharmacy
2,3,EH11,Grocery,Transport,Gym / Sports,Restaurant,Highstreet shop,Takeaway,Coffee Shop,Park,Venue,Sports field
3,3,EH12,Zoo,Coffee Shop,Highstreet shop,Restaurant,Grocery,Takeaway,Pub / Bar,Transport,Shopping Centre,Scenic Lookout
4,0,EH13,Coffee Shop,Pub / Bar,Takeaway,Hotel,Grocery,Park,Home Service,Rest area,Zoo,Market
5,3,EH14,Cinema,Transport,Food shop,Grocery,Gym / Sports,Highstreet shop,Zoo,Hostel,Monument / Landmark,Market
6,3,EH15,Park,Transport,Restaurant,Highstreet shop,Pub / Bar,Takeaway,Food shop,Grocery,Water,Market
7,3,EH16,Grocery,Park,Highstreet shop,Coffee Shop,Takeaway,Sports field,Restaurant,Hotel,Food shop,Monument / Landmark
8,2,EH17,Grocery,Restaurant,Park,Takeaway,Hotel,Gym / Sports,Hill,Market,Hostel,Home Service
9,1,EH2,Pub / Bar,Restaurant,Coffee Shop,Monument / Landmark,Park,Venue,Museum / Gallery,Highstreet shop,Market,Food shop


In [27]:
# Index the ranked general categories by district so they can be joined onto
# the postcode table, which carries the latitude/longitude of each district.
general_venues_by_district = edinburgh_venues_sorted_general.set_index('postcode_district')
edinburgh_merged_general = postcodes.join(general_venues_by_district, on='postcode_district')

# Store the cluster label in the smallest integer dtype that fits.
edinburgh_merged_general['cluster_labels_general'] = pd.to_numeric(edinburgh_merged_general['cluster_labels_general'], downcast='integer')

# Show dataframe
edinburgh_merged_general

Unnamed: 0,postcode_district,coverage,longitude,latitude,cluster_labels_general,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,EH1,Mostly consists of Edinburgh's Old Town. Also ...,-3.193068,55.950601,1,Pub / Bar,Restaurant,Coffee Shop,Hotel,Monument / Landmark,Park,Museum / Gallery,Highstreet shop,Venue,Neighborhood
1,EH2,The New Town and central commercial area of Ed...,-3.202204,55.952278,1,Pub / Bar,Restaurant,Coffee Shop,Monument / Landmark,Park,Venue,Museum / Gallery,Highstreet shop,Market,Food shop
2,EH3,An odd shaped area surrounding EH1 and EH2 to ...,-3.206313,55.952185,1,Pub / Bar,Restaurant,Coffee Shop,Park,Monument / Landmark,Venue,Food shop,Hotel,Museum / Gallery,Market
3,EH4,Radiates from the older and more central areas...,-3.260869,55.962427,4,Grocery,Coffee Shop,Food shop,Pub / Bar,Zoo,Home Service,Monument / Landmark,Market,Hotel,Hostel
4,EH5,Based on a village formerly separate from the ...,-3.222157,55.975985,3,Sports field,Grocery,Harbor/Marina,Highstreet shop,Restaurant,Transport,Takeaway,Coffee Shop,Food shop,Gym / Sports
5,EH6,"Covers Leith, as well as Newhaven bordering it...",-3.175356,55.97244,1,Restaurant,Pub / Bar,Coffee Shop,Hotel,Food shop,Grocery,Takeaway,Highstreet shop,Pharmacy,Park
6,EH7,The inner city area between central Edinburgh ...,-3.164642,55.960225,1,Pub / Bar,Coffee Shop,Grocery,Restaurant,Food shop,Takeaway,Park,Hotel,Highstreet shop,Monument / Landmark
7,EH8,"The inner city Southside, Newington and Canong...",-3.164188,55.948708,0,Coffee Shop,Park,Pub / Bar,Hotel,Takeaway,Scenic Lookout,Restaurant,Venue,Monument / Landmark,Museum
8,EH9,"The inner city, Marchmont and Grange, Blackfor...",-3.185364,55.933011,1,Pub / Bar,Restaurant,Coffee Shop,Gym / Sports,Hotel,Food shop,Grocery,Venue,Scenic Lookout,Transport
9,EH10,"A corridor along the A702 from Bruntsfield, th...",-3.211108,55.919823,0,Coffee Shop,Park,Restaurant,Pub / Bar,Highstreet shop,Off licence,Takeaway,Gym / Sports,Hill,Pharmacy


### (c) Examine cluster sizes

In [28]:
# Number of postcode districts assigned to each general-category cluster label.
edinburgh_merged_general['cluster_labels_general'].value_counts()

1    7
3    6
2    3
0    3
4    1
Name: cluster_labels_general, dtype: int64

### (d) Map clusters

In [29]:
map_clusters_general = folium.Map(location=[edinburghlatitude, edinburghlongitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per postcode district, coloured by its cluster label
for lat, lon, poi, cluster in zip(edinburgh_merged_general['latitude'], edinburgh_merged_general['longitude'], edinburgh_merged_general['postcode_district'], edinburgh_merged_general['cluster_labels_general']):
    if pd.isna(cluster):
        # No label after the join (district missing from the clustered data).
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the colour list directly.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters_general)

map_clusters_general

### (e) Examine clusters

In [30]:
# Cluster 1 (k-means label 0): district, coverage and the ranked venue
# columns; the columns at positions 2-4 are left out.
in_cluster = edinburgh_merged_general['cluster_labels_general'] == 0
shown_columns = edinburgh_merged_general.columns[[0, 1] + list(range(5, edinburgh_merged_general.shape[1]))]
edinburgh_merged_general.loc[in_cluster, shown_columns]

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,EH8,"The inner city Southside, Newington and Canong...",Coffee Shop,Park,Pub / Bar,Hotel,Takeaway,Scenic Lookout,Restaurant,Venue,Monument / Landmark,Museum
9,EH10,"A corridor along the A702 from Bruntsfield, th...",Coffee Shop,Park,Restaurant,Pub / Bar,Highstreet shop,Off licence,Takeaway,Gym / Sports,Hill,Pharmacy
12,EH13,Based on the previously separate village of Co...,Coffee Shop,Pub / Bar,Takeaway,Hotel,Grocery,Park,Home Service,Rest area,Zoo,Market


In [31]:
# Cluster 2 (k-means label 1): district, coverage and the ranked venue
# columns; the columns at positions 2-4 are left out.
in_cluster = edinburgh_merged_general['cluster_labels_general'] == 1
shown_columns = edinburgh_merged_general.columns[[0, 1] + list(range(5, edinburgh_merged_general.shape[1]))]
edinburgh_merged_general.loc[in_cluster, shown_columns]

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,EH1,Mostly consists of Edinburgh's Old Town. Also ...,Pub / Bar,Restaurant,Coffee Shop,Hotel,Monument / Landmark,Park,Museum / Gallery,Highstreet shop,Venue,Neighborhood
1,EH2,The New Town and central commercial area of Ed...,Pub / Bar,Restaurant,Coffee Shop,Monument / Landmark,Park,Venue,Museum / Gallery,Highstreet shop,Market,Food shop
2,EH3,An odd shaped area surrounding EH1 and EH2 to ...,Pub / Bar,Restaurant,Coffee Shop,Park,Monument / Landmark,Venue,Food shop,Hotel,Museum / Gallery,Market
5,EH6,"Covers Leith, as well as Newhaven bordering it...",Restaurant,Pub / Bar,Coffee Shop,Hotel,Food shop,Grocery,Takeaway,Highstreet shop,Pharmacy,Park
6,EH7,The inner city area between central Edinburgh ...,Pub / Bar,Coffee Shop,Grocery,Restaurant,Food shop,Takeaway,Park,Hotel,Highstreet shop,Monument / Landmark
8,EH9,"The inner city, Marchmont and Grange, Blackfor...",Pub / Bar,Restaurant,Coffee Shop,Gym / Sports,Hotel,Food shop,Grocery,Venue,Scenic Lookout,Transport
19,EH30,South Queensferry,Restaurant,Pub / Bar,Coffee Shop,Harbor/Marina,Hotel,Grocery,Transport,Takeaway,Pharmacy,Water


In [32]:
# Cluster 3 (k-means label 2): district, coverage and the ranked venue
# columns; the columns at positions 2-4 are left out.
in_cluster = edinburgh_merged_general['cluster_labels_general'] == 2
shown_columns = edinburgh_merged_general.columns[[0, 1] + list(range(5, edinburgh_merged_general.shape[1]))]
edinburgh_merged_general.loc[in_cluster, shown_columns]

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,EH17,Based on the formerly separate village of Gilm...,Grocery,Restaurant,Park,Takeaway,Hotel,Gym / Sports,Hill,Market,Hostel,Home Service
17,EH28,"Newbridge, Ratho",Hotel,Grocery,Gym / Sports,Pub / Bar,Zoo,Home Service,Monument / Landmark,Market,Hostel,Highstreet shop
18,EH29,Kirkliston,Park,Takeaway,Grocery,Gym / Sports,Pub / Bar,Zoo,Home Service,Market,Hotel,Hostel


In [33]:
# Cluster 4 (k-means label 3): district, coverage and the ranked venue
# columns; the columns at positions 2-4 are left out.
in_cluster = edinburgh_merged_general['cluster_labels_general'] == 3
shown_columns = edinburgh_merged_general.columns[[0, 1] + list(range(5, edinburgh_merged_general.shape[1]))]
edinburgh_merged_general.loc[in_cluster, shown_columns]

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,EH5,Based on a village formerly separate from the ...,Sports field,Grocery,Harbor/Marina,Highstreet shop,Restaurant,Transport,Takeaway,Coffee Shop,Food shop,Gym / Sports
10,EH11,A corridor (rather thin in shape) along the A7...,Grocery,Transport,Gym / Sports,Restaurant,Highstreet shop,Takeaway,Coffee Shop,Park,Venue,Sports field
11,EH12,A corridor along the A8 from Haymarket through...,Zoo,Coffee Shop,Highstreet shop,Restaurant,Grocery,Takeaway,Pub / Bar,Transport,Shopping Centre,Scenic Lookout
13,EH14,A corridor in south-west Edinburgh starting at...,Cinema,Transport,Food shop,Grocery,Gym / Sports,Highstreet shop,Zoo,Hostel,Monument / Landmark,Market
14,EH15,"Based on Portobello and Duddingston, formerly ...",Park,Transport,Restaurant,Highstreet shop,Pub / Bar,Takeaway,Food shop,Grocery,Water,Market
15,EH16,Based on the formerly separate village of Libe...,Grocery,Park,Highstreet shop,Coffee Shop,Takeaway,Sports field,Restaurant,Hotel,Food shop,Monument / Landmark


In [34]:
# Cluster 5 (k-means label 4): district, coverage and the ranked venue
# columns; the columns at positions 2-4 are left out.
in_cluster = edinburgh_merged_general['cluster_labels_general'] == 4
shown_columns = edinburgh_merged_general.columns[[0, 1] + list(range(5, edinburgh_merged_general.shape[1]))]
edinburgh_merged_general.loc[in_cluster, shown_columns]

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,EH4,Radiates from the older and more central areas...,Grocery,Coffee Shop,Food shop,Pub / Bar,Zoo,Home Service,Monument / Landmark,Market,Hotel,Hostel


## 4c: Cluster based on venue location not search location

### (a) Get outcodes closest to each venue

#### Define function to get outcodes for a series of longitudes and latitudes

In [35]:
def getLocationOutcodes(longitudes, latitudes):
    """Look up an outcode for each (longitude, latitude) pair via Postcodes.io.

    Parameters
    ----------
    longitudes, latitudes : iterables of numbers
        Paired element-wise with ``zip``; the shorter one determines how many
        lookups are made.

    Returns
    -------
    list
        One entry per coordinate pair, in input order: the ``outcode`` of the
        first match the API returns (presumably the nearest outcode - confirm
        against the Postcodes.io documentation), or ``None`` when the API
        reports no match for that location.
    """
    # Venues are often listed several times with identical coordinates, so
    # remember each answer and only hit the API once per distinct location.
    lookup_cache = {}
    resultlist = []
    for longitude, latitude in zip(longitudes, latitudes):
        location = (longitude, latitude)
        if location not in lookup_cache:
            url = 'https://api.postcodes.io/outcodes?lon={}&lat={}'.format(longitude, latitude)
            # A timeout keeps one stalled request from hanging the whole notebook.
            payload = requests.get(url, timeout=10).json()
            # 'result' is null/absent when nothing is found; indexing it
            # directly would raise instead of recording a missing outcode.
            matches = payload.get('result') or []
            lookup_cache[location] = matches[0]['outcode'] if matches else None
        resultlist.append(lookup_cache[location])
    return resultlist

#### Add column with the closest outcode to each venue

In [36]:
# Look up an outcode from each venue's own coordinates (rather than the
# coordinates of the postcode that was searched) and store it beside the venue.
edinburgh_venues['venue_outcode'] = getLocationOutcodes(
    longitudes=edinburgh_venues['venue_longitude'],
    latitudes=edinburgh_venues['venue_latitude'],
)
edinburgh_venues

Unnamed: 0,postcode_district,postcode_latitude,postcode_longitude,venue,venue_latitude,venue_longitude,venue_category,categories,general_categories,venue_outcode
0,EH1,55.950601,-3.193068,Old Town,55.949500,-3.192805,Neighborhood,Neighborhood,Neighborhood,EH1
1,EH2,55.952278,-3.202204,Old Town,55.949500,-3.192805,Neighborhood,Neighborhood,Neighborhood,EH1
2,EH3,55.952185,-3.206313,Dean Village,55.952140,-3.217584,Neighborhood,Neighborhood,Neighborhood,EH3
3,EH1,55.950601,-3.193068,The Milkman,55.950650,-3.191010,Coffee Shop,Coffee Shop,Coffee Shop,EH1
4,EH1,55.950601,-3.193068,Wellington Coffee,55.953671,-3.197448,Coffee Shop,Coffee Shop,Coffee Shop,EH2
...,...,...,...,...,...,...,...,...,...,...
772,EH16,55.922467,-3.152970,Peffermill Playing Fields,55.930273,-3.156154,Athletics & Sports,Athletics & Sports,Sports field,EH16
773,EH16,55.922467,-3.152970,Game,55.926651,-3.164126,Video Game Store,Video Game Store,Highstreet shop,EH16
774,EH28,55.929531,-3.390593,Edinburgh International Climbing Arena,55.922033,-3.382739,Rock Climbing Spot,Rock Climbing Spot,Gym / Sports,EH28
775,EH30,55.985459,-3.392707,Forth Boat Cruise,55.991053,-3.385704,Boat or Ferry,Boat or Ferry,Harbor/Marina,EH30


#### Drop duplicate venues

In [37]:
# A venue can be returned by searches around several postcodes. Keep only the
# first row for each distinct (name, latitude, longitude) so every venue is
# counted once. Reassigning (instead of inplace=True) leaves the previously
# built frame object untouched and avoids in-place mutation side effects.
edinburgh_venues = edinburgh_venues.drop_duplicates(
    subset=['venue', 'venue_latitude', 'venue_longitude'],
    keep='first',
)
edinburgh_venues

Unnamed: 0,postcode_district,postcode_latitude,postcode_longitude,venue,venue_latitude,venue_longitude,venue_category,categories,general_categories,venue_outcode
0,EH1,55.950601,-3.193068,Old Town,55.949500,-3.192805,Neighborhood,Neighborhood,Neighborhood,EH1
2,EH3,55.952185,-3.206313,Dean Village,55.952140,-3.217584,Neighborhood,Neighborhood,Neighborhood,EH3
3,EH1,55.950601,-3.193068,The Milkman,55.950650,-3.191010,Coffee Shop,Coffee Shop,Coffee Shop,EH1
4,EH1,55.950601,-3.193068,Wellington Coffee,55.953671,-3.197448,Coffee Shop,Coffee Shop,Coffee Shop,EH2
5,EH1,55.950601,-3.193068,Brew Lab Coffee,55.947132,-3.186685,Coffee Shop,Coffee Shop,Coffee Shop,EH1
...,...,...,...,...,...,...,...,...,...,...
772,EH16,55.922467,-3.152970,Peffermill Playing Fields,55.930273,-3.156154,Athletics & Sports,Athletics & Sports,Sports field,EH16
773,EH16,55.922467,-3.152970,Game,55.926651,-3.164126,Video Game Store,Video Game Store,Highstreet shop,EH16
774,EH28,55.929531,-3.390593,Edinburgh International Climbing Arena,55.922033,-3.382739,Rock Climbing Spot,Rock Climbing Spot,Gym / Sports,EH28
775,EH30,55.985459,-3.392707,Forth Boat Cruise,55.991053,-3.385704,Boat or Ferry,Boat or Ferry,Harbor/Marina,EH30


### (b) Follow process above to cluster postcodes

In [38]:
# One-hot encode the general venue category: one indicator column per category,
# named after the category itself (no prefix).
edinburgh_venuecode_onehot = pd.get_dummies(
    edinburgh_venues[['general_categories']],
    prefix="",
    prefix_sep="",
)

# Label each venue row with the outcode looked up from the venue's own location.
edinburgh_venuecode_onehot['postcode_district'] = edinburgh_venues['venue_outcode']

# Rotate the column order so the last column (the district label) comes first.
column_order = list(edinburgh_venuecode_onehot.columns)
fixed_columns_general = column_order[-1:] + column_order[:-1]
edinburgh_venuecode_onehot = edinburgh_venuecode_onehot[fixed_columns_general]

edinburgh_venuecode_onehot

Unnamed: 0,postcode_district,Church,Cinema,Coffee Shop,Food shop,Grocery,Gym / Sports,Harbor/Marina,Highstreet shop,Hill,...,Restaurant,Scenic Lookout,Shopping Centre,Spa,Sports field,Takeaway,Transport,Venue,Water,Zoo
0,EH1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,EH3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,EH1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,EH2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EH1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,EH16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
773,EH16,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
774,EH28,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
775,EH30,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Average the one-hot indicators within each postcode district: every value is
# the share of that district's venues falling into the given category.
edinburgh_venuecode_grouped = (
    edinburgh_venuecode_onehot
    .groupby('postcode_district', as_index=False)
    .mean()
)
edinburgh_venuecode_grouped

Unnamed: 0,postcode_district,Church,Cinema,Coffee Shop,Food shop,Grocery,Gym / Sports,Harbor/Marina,Highstreet shop,Hill,...,Restaurant,Scenic Lookout,Shopping Centre,Spa,Sports field,Takeaway,Transport,Venue,Water,Zoo
0,EH1,0.013889,0.0,0.125,0.0,0.0,0.0,0.0,0.013889,0.013889,...,0.222222,0.027778,0.0,0.0,0.0,0.013889,0.013889,0.013889,0.0,0.0
1,EH10,0.0,0.0,0.178571,0.035714,0.035714,0.071429,0.0,0.071429,0.035714,...,0.107143,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0
2,EH11,0.0,0.0,0.060606,0.030303,0.272727,0.121212,0.0,0.090909,0.0,...,0.090909,0.0,0.0,0.0,0.030303,0.060606,0.121212,0.030303,0.030303,0.0
3,EH12,0.0,0.0,0.111111,0.0,0.111111,0.027778,0.0,0.111111,0.0,...,0.111111,0.027778,0.027778,0.0,0.0,0.083333,0.055556,0.0,0.0,0.166667
4,EH13,0.0,0.0,0.222222,0.0,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0
5,EH14,0.0,0.166667,0.0,0.166667,0.166667,0.166667,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0
6,EH15,0.0,0.0,0.038462,0.076923,0.076923,0.038462,0.0,0.115385,0.0,...,0.115385,0.0,0.0,0.0,0.0,0.076923,0.115385,0.0,0.076923,0.0
7,EH16,0.0,0.0,0.1,0.05,0.15,0.0,0.0,0.15,0.0,...,0.05,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0
8,EH17,0.0,0.0,0.0,0.0,0.428571,0.071429,0.0,0.0,0.0,...,0.142857,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
9,EH2,0.0,0.0,0.235294,0.058824,0.0,0.0,0.0,0.029412,0.0,...,0.294118,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0


In [40]:
# Number of k-means clusters to partition the postcode districts into.
kclusters = 5

# Feature matrix: the per-district category frequencies without the label column.
# The axis is given by keyword (`columns=`) because pandas 2.0 no longer accepts
# it as a second positional argument to DataFrame.drop.
edinburgh_venuecode_grouped_clustering = edinburgh_venuecode_grouped.drop(columns='postcode_district')

# Run k-means clustering. n_init is pinned to 10 (the long-standing default) so
# the fit does not change with the scikit-learn version; random_state fixes the
# initialisation for reproducibility.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(edinburgh_venuecode_grouped_clustering)

# Peek at the cluster labels assigned to the first ten districts.
kmeans.labels_[0:10]

array([3, 0, 1, 0, 0, 1, 0, 1, 2, 3])

In [43]:
# Rank the venue categories of every postcode district.
# Start from an empty frame laid out according to `columns` (defined earlier in
# the notebook) and copy the district labels across.
edinburgh_venues_sorted_venuecode = pd.DataFrame(columns=columns)
edinburgh_venues_sorted_venuecode['postcode_district'] = edinburgh_venuecode_grouped['postcode_district']

# Fill every column after the first with the district's top categories.
for ind in range(len(edinburgh_venuecode_grouped)):
    district_frequencies = edinburgh_venuecode_grouped.iloc[ind, :]
    edinburgh_venues_sorted_venuecode.iloc[ind, 1:] = return_most_common_venues(district_frequencies, num_top_venues)

# Insert the k-means label of each district as the second column.
cluster_labels = np.array(kmeans.labels_, dtype='int')
edinburgh_venues_sorted_venuecode.insert(1, 'cluster_labels_venuecode', cluster_labels)

edinburgh_venues_sorted_venuecode

Unnamed: 0,postcode_district,cluster_labels_venuecode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,EH1,3,Restaurant,Pub / Bar,Coffee Shop,Hotel,Museum / Gallery,Park,Monument / Landmark,Scenic Lookout,Museum,Neighborhood
1,EH10,0,Coffee Shop,Park,Restaurant,Pub / Bar,Highstreet shop,Off licence,Takeaway,Gym / Sports,Hill,Pharmacy
2,EH11,1,Grocery,Transport,Gym / Sports,Restaurant,Highstreet shop,Takeaway,Coffee Shop,Park,Venue,Sports field
3,EH12,0,Zoo,Coffee Shop,Highstreet shop,Restaurant,Grocery,Takeaway,Pub / Bar,Transport,Shopping Centre,Scenic Lookout
4,EH13,0,Coffee Shop,Pub / Bar,Takeaway,Hotel,Grocery,Park,Home Service,Rest area,Zoo,Market
5,EH14,1,Cinema,Transport,Food shop,Grocery,Gym / Sports,Highstreet shop,Zoo,Hostel,Monument / Landmark,Market
6,EH15,0,Park,Transport,Restaurant,Highstreet shop,Pub / Bar,Takeaway,Food shop,Grocery,Water,Market
7,EH16,1,Grocery,Park,Highstreet shop,Coffee Shop,Takeaway,Sports field,Restaurant,Hotel,Food shop,Monument / Landmark
8,EH17,2,Grocery,Restaurant,Park,Takeaway,Hotel,Gym / Sports,Hill,Market,Hostel,Home Service
9,EH2,3,Restaurant,Pub / Bar,Coffee Shop,Park,Food shop,Highstreet shop,Monument / Landmark,Venue,Gym / Sports,Harbor/Marina


In [45]:
# Join the cluster labels and ranked venue categories onto the postcode table,
# matching rows on postcode district, so each district carries both its
# postcode details and its clustering result.
venuecode_summary = edinburgh_venues_sorted_venuecode.set_index('postcode_district')
edinburgh_merged_venuecode = postcodes.join(venuecode_summary, on='postcode_district')

# Downcast the cluster labels to the smallest integer dtype pandas can use.
edinburgh_merged_venuecode['cluster_labels_venuecode'] = pd.to_numeric(
    edinburgh_merged_venuecode['cluster_labels_venuecode'],
    downcast='integer',
)

# Show dataframe
edinburgh_merged_venuecode

Unnamed: 0,postcode_district,coverage,longitude,latitude,cluster_labels_venuecode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,EH1,Mostly consists of Edinburgh's Old Town. Also ...,-3.193068,55.950601,3,Restaurant,Pub / Bar,Coffee Shop,Hotel,Museum / Gallery,Park,Monument / Landmark,Scenic Lookout,Museum,Neighborhood
1,EH2,The New Town and central commercial area of Ed...,-3.202204,55.952278,3,Restaurant,Pub / Bar,Coffee Shop,Park,Food shop,Highstreet shop,Monument / Landmark,Venue,Gym / Sports,Harbor/Marina
2,EH3,An odd shaped area surrounding EH1 and EH2 to ...,-3.206313,55.952185,3,Pub / Bar,Restaurant,Hotel,Coffee Shop,Food shop,Park,Highstreet shop,Market,Neighborhood,Grocery
3,EH4,Radiates from the older and more central areas...,-3.260869,55.962427,2,Grocery,Coffee Shop,Food shop,Pub / Bar,Zoo,Home Service,Monument / Landmark,Market,Hotel,Hostel
4,EH5,Based on a village formerly separate from the ...,-3.222157,55.975985,1,Sports field,Grocery,Harbor/Marina,Highstreet shop,Restaurant,Transport,Takeaway,Coffee Shop,Food shop,Gym / Sports
5,EH6,"Covers Leith, as well as Newhaven bordering it...",-3.175356,55.97244,3,Restaurant,Pub / Bar,Coffee Shop,Hotel,Food shop,Grocery,Takeaway,Highstreet shop,Pharmacy,Park
6,EH7,The inner city area between central Edinburgh ...,-3.164642,55.960225,0,Coffee Shop,Pub / Bar,Grocery,Restaurant,Food shop,Highstreet shop,Hotel,Museum / Gallery,Sports field,Off licence
7,EH8,"The inner city Southside, Newington and Canong...",-3.164188,55.948708,0,Coffee Shop,Scenic Lookout,Park,Monument / Landmark,Takeaway,Pub / Bar,Venue,Museum,Grocery,Restaurant
8,EH9,"The inner city, Marchmont and Grange, Blackfor...",-3.185364,55.933011,3,Pub / Bar,Restaurant,Coffee Shop,Gym / Sports,Hotel,Food shop,Grocery,Venue,Scenic Lookout,Transport
9,EH10,"A corridor along the A702 from Bruntsfield, th...",-3.211108,55.919823,0,Coffee Shop,Park,Restaurant,Pub / Bar,Highstreet shop,Off licence,Takeaway,Gym / Sports,Hill,Pharmacy


### (c) Examine cluster sizes

In [46]:
# Number of postcode districts assigned to each cluster label.
edinburgh_merged_venuecode.loc[:, 'cluster_labels_venuecode'].value_counts()

0    7
3    6
1    4
2    2
4    1
Name: cluster_labels_venuecode, dtype: int64

### (d) Map clusters

In [47]:
# Base map centred on the Edinburgh coordinates defined earlier in the notebook.
map_clusters_venuecode = folium.Map(location=[edinburghlatitude, edinburghlongitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map: one circle per postcode district, coloured by cluster
for lat, lon, poi, cluster in zip(edinburgh_merged_venuecode['latitude'], edinburgh_merged_venuecode['longitude'], edinburgh_merged_venuecode['postcode_district'], edinburgh_merged_venuecode['cluster_labels_venuecode']):
    # A district without a cluster label (NaN) cannot be coloured; skip it
    # instead of failing on int(NaN).
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the palette directly; subtracting 1
    # would only work for label 0 through negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters_venuecode)

map_clusters_venuecode

### (e) Examine clusters

In [48]:
# Cluster 1 (k-means label 0): postcode district, coverage and ranked venue categories
(
    edinburgh_merged_venuecode
    .loc[edinburgh_merged_venuecode['cluster_labels_venuecode'] == 0]
    .iloc[:, [0, 1, *range(5, edinburgh_merged_venuecode.shape[1])]]
)

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,EH7,The inner city area between central Edinburgh ...,Coffee Shop,Pub / Bar,Grocery,Restaurant,Food shop,Highstreet shop,Hotel,Museum / Gallery,Sports field,Off licence
7,EH8,"The inner city Southside, Newington and Canong...",Coffee Shop,Scenic Lookout,Park,Monument / Landmark,Takeaway,Pub / Bar,Venue,Museum,Grocery,Restaurant
9,EH10,"A corridor along the A702 from Bruntsfield, th...",Coffee Shop,Park,Restaurant,Pub / Bar,Highstreet shop,Off licence,Takeaway,Gym / Sports,Hill,Pharmacy
11,EH12,A corridor along the A8 from Haymarket through...,Zoo,Coffee Shop,Highstreet shop,Restaurant,Grocery,Takeaway,Pub / Bar,Transport,Shopping Centre,Scenic Lookout
12,EH13,Based on the previously separate village of Co...,Coffee Shop,Pub / Bar,Takeaway,Hotel,Grocery,Park,Home Service,Rest area,Zoo,Market
14,EH15,"Based on Portobello and Duddingston, formerly ...",Park,Transport,Restaurant,Highstreet shop,Pub / Bar,Takeaway,Food shop,Grocery,Water,Market
18,EH29,Kirkliston,Park,Takeaway,Grocery,Gym / Sports,Pub / Bar,Zoo,Home Service,Market,Hotel,Hostel


In [49]:
# Cluster 2 (k-means label 1): postcode district, coverage and ranked venue categories
(
    edinburgh_merged_venuecode
    .loc[edinburgh_merged_venuecode['cluster_labels_venuecode'] == 1]
    .iloc[:, [0, 1, *range(5, edinburgh_merged_venuecode.shape[1])]]
)

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,EH5,Based on a village formerly separate from the ...,Sports field,Grocery,Harbor/Marina,Highstreet shop,Restaurant,Transport,Takeaway,Coffee Shop,Food shop,Gym / Sports
10,EH11,A corridor (rather thin in shape) along the A7...,Grocery,Transport,Gym / Sports,Restaurant,Highstreet shop,Takeaway,Coffee Shop,Park,Venue,Sports field
13,EH14,A corridor in south-west Edinburgh starting at...,Cinema,Transport,Food shop,Grocery,Gym / Sports,Highstreet shop,Zoo,Hostel,Monument / Landmark,Market
15,EH16,Based on the formerly separate village of Libe...,Grocery,Park,Highstreet shop,Coffee Shop,Takeaway,Sports field,Restaurant,Hotel,Food shop,Monument / Landmark


In [50]:
# Cluster 3 (k-means label 2): postcode district, coverage and ranked venue categories
(
    edinburgh_merged_venuecode
    .loc[edinburgh_merged_venuecode['cluster_labels_venuecode'] == 2]
    .iloc[:, [0, 1, *range(5, edinburgh_merged_venuecode.shape[1])]]
)

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,EH4,Radiates from the older and more central areas...,Grocery,Coffee Shop,Food shop,Pub / Bar,Zoo,Home Service,Monument / Landmark,Market,Hotel,Hostel
16,EH17,Based on the formerly separate village of Gilm...,Grocery,Restaurant,Park,Takeaway,Hotel,Gym / Sports,Hill,Market,Hostel,Home Service


In [51]:
# Cluster 4 (k-means label 3): postcode district, coverage and ranked venue categories
(
    edinburgh_merged_venuecode
    .loc[edinburgh_merged_venuecode['cluster_labels_venuecode'] == 3]
    .iloc[:, [0, 1, *range(5, edinburgh_merged_venuecode.shape[1])]]
)

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,EH1,Mostly consists of Edinburgh's Old Town. Also ...,Restaurant,Pub / Bar,Coffee Shop,Hotel,Museum / Gallery,Park,Monument / Landmark,Scenic Lookout,Museum,Neighborhood
1,EH2,The New Town and central commercial area of Ed...,Restaurant,Pub / Bar,Coffee Shop,Park,Food shop,Highstreet shop,Monument / Landmark,Venue,Gym / Sports,Harbor/Marina
2,EH3,An odd shaped area surrounding EH1 and EH2 to ...,Pub / Bar,Restaurant,Hotel,Coffee Shop,Food shop,Park,Highstreet shop,Market,Neighborhood,Grocery
5,EH6,"Covers Leith, as well as Newhaven bordering it...",Restaurant,Pub / Bar,Coffee Shop,Hotel,Food shop,Grocery,Takeaway,Highstreet shop,Pharmacy,Park
8,EH9,"The inner city, Marchmont and Grange, Blackfor...",Pub / Bar,Restaurant,Coffee Shop,Gym / Sports,Hotel,Food shop,Grocery,Venue,Scenic Lookout,Transport
19,EH30,South Queensferry,Restaurant,Pub / Bar,Coffee Shop,Harbor/Marina,Hotel,Grocery,Transport,Takeaway,Pharmacy,Water


In [52]:
# Cluster 5 (k-means label 4): postcode district, coverage and ranked venue categories
(
    edinburgh_merged_venuecode
    .loc[edinburgh_merged_venuecode['cluster_labels_venuecode'] == 4]
    .iloc[:, [0, 1, *range(5, edinburgh_merged_venuecode.shape[1])]]
)

Unnamed: 0,postcode_district,coverage,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,EH28,"Newbridge, Ratho",Hotel,Grocery,Gym / Sports,Pub / Bar,Zoo,Home Service,Monument / Landmark,Market,Hostel,Highstreet shop
