# Segmenting and Clustering Wards in Cambridge, UK

## Import all the libraries needed

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    scipy-1.3.2                |   py36h921218d_0        18.0 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    numpy-1.17.3               |   py36h95a1406_0       

## Download and explore the Dataset from https://mapit.mysociety.org/area/2263/children

In [2]:
!wget -q -O 'Cambridge_data.json' https://mapit.mysociety.org/area/2263/children
print('Data downloaded!')

Data downloaded!


## Load and explore the data

In [3]:
with open('Cambridge_data.json') as json_data:
    Cambridge_data = json.load(json_data)

In [4]:
Cambridge_data

{'2962': {'parent_area': 2263,
  'generation_high': 37,
  'all_names': {},
  'id': 2962,
  'codes': {'ons': '12UBFQ', 'gss': 'E05002702', 'unit_id': '1289'},
  'name': 'Abbey',
  'country': 'E',
  'type_name': 'District council ward',
  'generation_low': 1,
  'country_name': 'England',
  'type': 'DIW'},
 '2957': {'parent_area': 2263,
  'generation_high': 37,
  'all_names': {},
  'id': 2957,
  'codes': {'ons': '12UBFR', 'gss': 'E05002703', 'unit_id': '1330'},
  'name': 'Arbury',
  'country': 'E',
  'type_name': 'District council ward',
  'generation_low': 1,
  'country_name': 'England',
  'type': 'DIW'},
 '2953': {'parent_area': 2263,
  'generation_high': 37,
  'all_names': {},
  'id': 2953,
  'codes': {'ons': '12UBFS', 'gss': 'E05002704', 'unit_id': '1331'},
  'name': 'Castle',
  'country': 'E',
  'type_name': 'District council ward',
  'generation_low': 1,
  'country_name': 'England',
  'type': 'DIW'},
 '2961': {'parent_area': 2263,
  'generation_high': 37,
  'all_names': {},
  'id': 

## Tranform the data into a *pandas* dataframe

Transform the data of nested Python dictionaries into a _pandas_ dataframe

In [5]:
# define the dataframe columns
column_names = ['Ward', 'Latitude', 'Longitude'] 

# instantiate the dataframe
wards = pd.DataFrame(columns=column_names)

In [6]:
wards

Unnamed: 0,Ward,Latitude,Longitude


Loop through the dictionary and fill the dataframe _wards_ one row at a time

In [7]:
for key in Cambridge_data:
    ward = Cambridge_data[key]['name']
    address = '{}, Cambridge, UK'.format(ward)
    geolocator = Nominatim(user_agent="cambridge_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    wards=wards.append({'Ward':ward,
                        'Latitude':latitude,
                        'Longitude':longitude},ignore_index=True)

In [8]:
wards

Unnamed: 0,Ward,Latitude,Longitude
0,Abbey,52.204406,0.122944
1,Arbury,52.225036,0.128007
2,Castle,52.203482,0.123582
3,Cherry Hinton,52.187843,0.175241
4,Coleridge,52.192661,0.144213
5,East Chesterton,52.222288,0.144592
6,King's Hedges,52.229237,0.135074
7,Market,52.203482,0.123582
8,Newnham,52.196542,0.107044
9,Petersfield,52.19981,0.135933


In [9]:
wards.shape

(14, 3)

### Use geopy library to get the latitude and longitude values of Cambridge, UK

In [10]:
address = 'Cambridge, UK'
geolocator = Nominatim(user_agent="cambridge_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [11]:
print('The geograpical coordinate of Cambridge,UK are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Cambridge,UK are 52.2034823, 0.1235817.


## Create a map of Cambridge with wards superimposed on top

In [12]:
# create map of Cambridge using latitude and longitude values
map_cambridge = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, ward in zip(wards['Latitude'], wards['Longitude'], wards['Ward']):
    label = '{}'.format(ward)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_cambridge)  
    
map_cambridge

Next, start utilizing the Foursquare API to explore the wards and segment them.

## Define Foursquare Credentials and Version

In [13]:
CLIENT_ID = '1KJBD3JC00AXKBV4A3JLHEOYWLKL2BKHLFCHQB1IPWIAA4AI' # your Foursquare ID
CLIENT_SECRET = 'G1PQBEFQ2HENKMG0XW3EJ4NTJPBV24A2U0PKBAM3NTFRCW0J' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1KJBD3JC00AXKBV4A3JLHEOYWLKL2BKHLFCHQB1IPWIAA4AI
CLIENT_SECRET:G1PQBEFQ2HENKMG0XW3EJ4NTJPBV24A2U0PKBAM3NTFRCW0J


### Let's explore the first ward in the dataframe

In [14]:
LIMIT = 100

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Ward', 
                  'Ward Latitude', 
                  'Ward Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [16]:
#Cambridge venues
cambridge_venues = getNearbyVenues(names=wards['Ward'],
                                   latitudes=wards['Latitude'],
                                   longitudes=wards['Longitude']
                                  )

Abbey
Arbury
Castle
Cherry Hinton
Coleridge
East Chesterton
King's Hedges
Market
Newnham
Petersfield
Queen Edith's
Romsey
Trumpington
West Chesterton


In [17]:
print(cambridge_venues.shape)
cambridge_venues.head()

(398, 7)


Unnamed: 0,Ward,Ward Latitude,Ward Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Abbey,52.204406,0.122944,Arts Picturehouse,52.202927,0.123748,Indie Movie Theater
1,Abbey,52.204406,0.122944,Savino's,52.204327,0.123427,Café
2,Abbey,52.204406,0.122944,Hilton Cambridge City Centre,52.203379,0.121676,Hotel
3,Abbey,52.204406,0.122944,Pint Shop,52.204269,0.119238,Bar
4,Abbey,52.204406,0.122944,John Lewis & Partners,52.203671,0.122583,Department Store


Let's check how many venues were returned for each ward

In [18]:
cambridge_venues.groupby('Ward').count()

Unnamed: 0_level_0,Ward Latitude,Ward Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Ward,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abbey,100,100,100,100,100,100
Arbury,4,4,4,4,4,4
Castle,100,100,100,100,100,100
Cherry Hinton,6,6,6,6,6,6
Coleridge,8,8,8,8,8,8
East Chesterton,4,4,4,4,4,4
King's Hedges,7,7,7,7,7,7
Market,100,100,100,100,100,100
Newnham,4,4,4,4,4,4
Petersfield,35,35,35,35,35,35


#### Let's find out how many unique categories can be curated from all the returned venues

In [19]:
print('There are {} uniques categories.'.format(len(cambridge_venues['Venue Category'].unique())))

There are 80 uniques categories.


## Analyse each ward

In [20]:
# one hot encoding
cambridge_onehot = pd.get_dummies(cambridge_venues[['Venue Category']], prefix="", prefix_sep="")

# add ward column back to dataframe
cambridge_onehot['Ward'] = cambridge_venues['Ward'] 

# move neighborhood column to the first column
fixed_columns = [cambridge_onehot.columns[-1]] + list(cambridge_onehot.columns[:-1])
cambridge_onehot = cambridge_onehot[fixed_columns]

cambridge_onehot.head()

Unnamed: 0,Ward,African Restaurant,American Restaurant,Asian Restaurant,Bakery,Bar,Bed & Breakfast,Beer Bar,Bookstore,Breakfast Spot,Brewery,Burger Joint,Bus Station,Café,Candy Store,Castle,Cheese Shop,Chinese Restaurant,Chocolate Shop,Clothing Store,Cocktail Bar,Coffee Shop,Cosmetics Shop,Deli / Bodega,Department Store,Eastern European Restaurant,Electronics Store,English Restaurant,Fast Food Restaurant,Food & Drink Shop,Gastropub,Gift Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Historic Site,History Museum,Hookah Bar,Hostel,Hotel,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Italian Restaurant,Kitchen Supply Store,Korean Restaurant,Lounge,Market,Mexican Restaurant,Museum,Noodle House,Outdoor Supply Store,Park,Pastry Shop,Performing Arts Venue,Pharmacy,Pizza Place,Platform,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Record Shop,Restaurant,Salad Place,Sandwich Place,Science Museum,Seafood Restaurant,Shopping Mall,Soccer Field,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Tourist Information Center,Wings Joint
0,Abbey,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Abbey,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Abbey,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Abbey,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Abbey,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
cambridge_onehot.shape

(398, 81)

### Next, let's group rows by ward and by taking the mean of the frequency of occurrence of each category

In [22]:
cambridge_grouped = cambridge_onehot.groupby('Ward').mean().reset_index()
cambridge_grouped

Unnamed: 0,Ward,African Restaurant,American Restaurant,Asian Restaurant,Bakery,Bar,Bed & Breakfast,Beer Bar,Bookstore,Breakfast Spot,Brewery,Burger Joint,Bus Station,Café,Candy Store,Castle,Cheese Shop,Chinese Restaurant,Chocolate Shop,Clothing Store,Cocktail Bar,Coffee Shop,Cosmetics Shop,Deli / Bodega,Department Store,Eastern European Restaurant,Electronics Store,English Restaurant,Fast Food Restaurant,Food & Drink Shop,Gastropub,Gift Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Historic Site,History Museum,Hookah Bar,Hostel,Hotel,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Italian Restaurant,Kitchen Supply Store,Korean Restaurant,Lounge,Market,Mexican Restaurant,Museum,Noodle House,Outdoor Supply Store,Park,Pastry Shop,Performing Arts Venue,Pharmacy,Pizza Place,Platform,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Record Shop,Restaurant,Salad Place,Sandwich Place,Science Museum,Seafood Restaurant,Shopping Mall,Soccer Field,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Tourist Information Center,Wings Joint
0,Abbey,0.0,0.01,0.0,0.01,0.01,0.0,0.01,0.03,0.01,0.0,0.04,0.0,0.05,0.01,0.0,0.01,0.02,0.01,0.04,0.02,0.06,0.01,0.0,0.01,0.01,0.01,0.03,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.01,0.01,0.05,0.01,0.0,0.01,0.01,0.0,0.01,0.01,0.01,0.02,0.0,0.01,0.0,0.03,0.0,0.0,0.0,0.0,0.01,0.07,0.01,0.03,0.0,0.05,0.03,0.01,0.01,0.0,0.01,0.01,0.01,0.04,0.01,0.01,0.01,0.01,0.0
1,Arbury,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Castle,0.0,0.01,0.0,0.01,0.03,0.0,0.01,0.02,0.01,0.0,0.04,0.0,0.04,0.01,0.0,0.0,0.02,0.01,0.03,0.01,0.08,0.01,0.0,0.01,0.01,0.01,0.03,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.02,0.02,0.02,0.01,0.05,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.02,0.0,0.01,0.0,0.03,0.0,0.0,0.0,0.0,0.01,0.09,0.0,0.01,0.0,0.05,0.03,0.01,0.01,0.0,0.01,0.01,0.0,0.04,0.01,0.02,0.01,0.01,0.0
3,Cherry Hinton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Coleridge,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,East Chesterton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,King's Hedges,0.0,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Market,0.0,0.01,0.0,0.01,0.03,0.0,0.01,0.02,0.01,0.0,0.04,0.0,0.04,0.01,0.0,0.0,0.02,0.01,0.03,0.01,0.08,0.01,0.0,0.01,0.01,0.01,0.03,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.02,0.02,0.02,0.01,0.05,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.02,0.0,0.01,0.0,0.03,0.0,0.0,0.0,0.0,0.01,0.09,0.0,0.01,0.0,0.05,0.03,0.01,0.01,0.0,0.01,0.01,0.0,0.04,0.01,0.02,0.01,0.01,0.0
8,Newnham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Petersfield,0.057143,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.114286,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.057143,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.028571,0.0,0.0,0.057143,0.0,0.057143,0.0,0.0,0.028571,0.028571,0.028571,0.0,0.057143,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.228571,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.028571


In [23]:
cambridge_grouped.shape

(14, 81)

### Let's print each ward along with the top 5 most common venues

In [24]:
num_top_venues = 5

for ward in cambridge_grouped['Ward']:
    print("----"+ward+"----")
    temp = cambridge_grouped[cambridge_grouped['Ward'] == ward].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Abbey----
                venue  freq
0                 Pub  0.07
1         Coffee Shop  0.06
2  Italian Restaurant  0.05
3      Sandwich Place  0.05
4                Café  0.05


----Arbury----
                venue  freq
0     Bed & Breakfast  0.50
1         Bus Station  0.25
2  Chinese Restaurant  0.25
3                Park  0.00
4               Plaza  0.00


----Castle----
                venue  freq
0                 Pub  0.09
1         Coffee Shop  0.08
2      Sandwich Place  0.05
3  Italian Restaurant  0.05
4    Sushi Restaurant  0.04


----Cherry Hinton----
               venue  freq
0          Gastropub  0.17
1           Pharmacy  0.17
2  Indian Restaurant  0.17
3                Pub  0.17
4         Restaurant  0.17


----Coleridge----
         venue  freq
0  Coffee Shop  0.25
1          Gym  0.12
2   Playground  0.12
3          Bar  0.12
4     Platform  0.12


----East Chesterton----
                  venue  freq
0            Playground  0.25
1  Fast Food Restaurant  0.25


### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [25]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each ward.

In [26]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Ward']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
wards_venues_sorted = pd.DataFrame(columns=columns)
wards_venues_sorted['Ward'] = cambridge_grouped['Ward']

for ind in np.arange(cambridge_grouped.shape[0]):
    wards_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cambridge_grouped.iloc[ind, :], num_top_venues)

wards_venues_sorted.head()

Unnamed: 0,Ward,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abbey,Pub,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Clothing Store,Sushi Restaurant,Burger Joint,Science Museum,Restaurant
1,Arbury,Bed & Breakfast,Chinese Restaurant,Bus Station,Wings Joint,Gastropub,Department Store,Eastern European Restaurant,Electronics Store,English Restaurant,Fast Food Restaurant
2,Castle,Pub,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Burger Joint,Sushi Restaurant,Science Museum,Pizza Place,Bar
3,Cherry Hinton,Pharmacy,Plaza,Indian Restaurant,Pub,Gastropub,Restaurant,Electronics Store,Coffee Shop,Cosmetics Shop,Deli / Bodega
4,Coleridge,Coffee Shop,Playground,Gym,Bar,Bookstore,Park,Platform,English Restaurant,Cosmetics Shop,Deli / Bodega


## Cluster Wards

Run *k*-means to cluster the wards into 5 clusters.

In [27]:
# set number of clusters
kclusters = 5

cambridge_grouped_clustering = cambridge_grouped.drop('Ward', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cambridge_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 2, 2, 2, 3, 4, 2, 4, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 5 venues for each neighborhood.

In [28]:
# add clustering labels
wards_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

cambridge_merged = wards

# merge cambridge_grouped with toronto_data to add latitude/longitude for each neighborhood
cambridge_merged = cambridge_merged.join(wards_venues_sorted.set_index('Ward'), on='Ward')

cambridge_merged.head() # check the last columns!

Unnamed: 0,Ward,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abbey,52.204406,0.122944,2,Pub,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Clothing Store,Sushi Restaurant,Burger Joint,Science Museum,Restaurant
1,Arbury,52.225036,0.128007,0,Bed & Breakfast,Chinese Restaurant,Bus Station,Wings Joint,Gastropub,Department Store,Eastern European Restaurant,Electronics Store,English Restaurant,Fast Food Restaurant
2,Castle,52.203482,0.123582,2,Pub,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Burger Joint,Sushi Restaurant,Science Museum,Pizza Place,Bar
3,Cherry Hinton,52.187843,0.175241,2,Pharmacy,Plaza,Indian Restaurant,Pub,Gastropub,Restaurant,Electronics Store,Coffee Shop,Cosmetics Shop,Deli / Bodega
4,Coleridge,52.192661,0.144213,2,Coffee Shop,Playground,Gym,Bar,Bookstore,Park,Platform,English Restaurant,Cosmetics Shop,Deli / Bodega


Finally, let's visualize the resulting clusters

In [29]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cambridge_merged['Latitude'], cambridge_merged['Longitude'], cambridge_merged['Ward'], cambridge_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters