# IBM Data Science Capstone Project

## T Farrington

## Canberra Suburbs: Finding a place to live

##### 26th November 2019

### Part 0 - Initialise Notebook

In this section we import all the modules required for this project

#### Import modules

In [2]:
print('Importing modules ...')

import os

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize, scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

print('Modules imported')

Importing modules ...
Modules imported


### Part 1 - Data Preparation 

#### Read Canberra suburbs CSV file into DataFrame

In [3]:
# Load the suburb list and order it alphabetically by suburb name,
# renumbering the index so that row 0 is the first suburb in the alphabet.
canberra_csv = r'Canberra_Suburbs.csv'
df = (
    pd.read_csv(canberra_csv)
    .sort_values(by=['Suburb'])
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Suburb,Longitude,Latitude
0,Acton,149.112771,-35.281319
1,Ainslie,149.148232,-35.2636
2,Amaroo,149.127417,-35.168831
3,Aranda,149.080925,-35.257771
4,Banks,149.100662,-35.471861


##### Check dataframe

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 3 columns):
Suburb       116 non-null object
Longitude    116 non-null float64
Latitude     116 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.8+ KB


116 Suburbs  
no nulls  
Suburb name is type 'object'  
Longitude and Latitude are type 'float64'  
All as expected, no further cleaning required

#### Get Domain Suburb IDs

###### Domain API credentials

In [5]:
# Domain API key. Read from the DOMAIN_API_KEY environment variable so the real
# key never has to be saved in the notebook; the placeholder is kept as the
# fallback when the variable is not set.
domain_key = os.environ.get('DOMAIN_API_KEY', 'XXXXXXXXXXXX')

##### Make first set of API calls

In [6]:
# Look up the Domain suburb ID for every suburb name in `df`.
# One request per suburb is sent to the addressLocators endpoint; the query
# string is built by `requests` so suburb names are URL-encoded correctly.
ADDRESS_LOCATOR_URL = 'https://api.domain.com.au/v1/addressLocators'

S_names = df['Suburb'].tolist()
responses = []
for name in S_names:
    reply = requests.get(
        ADDRESS_LOCATOR_URL,
        params={
            'api_key': domain_key,
            'searchLevel': 'Suburb',
            'suburb': name,
            'state': 'ACT',
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Fail loudly on HTTP errors (bad key, rate limit, ...) instead of on a
    # confusing KeyError further down.
    reply.raise_for_status()
    locators = reply.json()
    try:
        # The ID is read from the first entry of the first locator's "ids" list.
        domain_id = locators[0]['ids'][0]['id']
    except (KeyError, IndexError, TypeError) as err:
        raise ValueError(f'Domain returned no suburb ID for {name!r}') from err
    responses.append({'Suburb': name, 'Domain ID': domain_id})

suburbs = pd.DataFrame(responses, columns=['Suburb', 'Domain ID'])
suburbs.head()

Unnamed: 0,Suburb,Domain ID
0,Acton,61
1,Ainslie,71
2,Amaroo,91
3,Aranda,101
4,Banks,131


##### Check Dataframe 

In [7]:
suburbs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 2 columns):
Suburb       116 non-null object
Domain ID    116 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.9+ KB


##### Merge base dataframes

In [8]:
# Attach each suburb's Domain ID to its coordinates (inner join on the suburb
# name) and put the columns in a fixed order.
canberra = df.merge(suburbs, on=['Suburb'])[['Suburb', 'Domain ID', 'Longitude', 'Latitude']]
canberra.head(20)

Unnamed: 0,Suburb,Domain ID,Longitude,Latitude
0,Acton,61,149.112771,-35.281319
1,Ainslie,71,149.148232,-35.2636
2,Amaroo,91,149.127417,-35.168831
3,Aranda,101,149.080925,-35.257771
4,Banks,131,149.100662,-35.471861
5,Barton,141,149.137112,-35.307921
6,Beard,6451,149.211188,-35.342198
7,Belconnen,171,149.068356,-35.235251
8,Bonner,3041,149.142229,-35.157298
9,Bonython,191,149.077786,-35.434445


### Get housing data

#### Get median house prices for last 5 years

In [9]:
# Fetch the median sold price series for houses in every suburb.
# The raw JSON replies are stored in `responses`, in the same order as
# `suburb_ids`; a failed request is stored as NaN so positions stay aligned.
STATS_URL = 'https://api.domain.com.au/v1/suburbPerformanceStatistics'

suburb_ids = canberra['Domain ID'].tolist()
responses = []
median_price_2014 = []
median_price_2019 = []
growth = []

# Query parameters shared by every request; only `suburbId` varies.
stats_params = {
    'api_key': domain_key,
    'state': 'ACT',
    'propertyCategory': 'house',
    'chronologicalSpan': 12,
    'tPlusFrom': 1,
    'tPlusTo': 6,
    'values': 'MedianSoldPrice',
}

for suburb_id in suburb_ids:
    try:
        reply = requests.get(
            STATS_URL,
            params={**stats_params, 'suburbId': suburb_id},
            timeout=30,
        )
        responses.append(reply.json())
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON: keep a placeholder.
        # Only these errors are caught, so Ctrl-C still interrupts the loop.
        responses.append(np.nan)

In [10]:
# Extract the first and last median sold price of each suburb's series and
# derive the growth figure.
#
# The lists are rebuilt from scratch here so that re-running this cell never
# appends a second copy of every suburb.
median_price_2014 = []
median_price_2019 = []
growth = []

for response in responses:
    try:
        series = response["series"]['seriesInfo']
        # First entry of the series is used as the "2014" price, last as "2019".
        price2014 = series[0]['values']['medianSoldPrice']
        price2019 = series[-1]['values']['medianSoldPrice']
    except (TypeError, KeyError, IndexError):
        # NaN placeholder from a failed request, or a payload without a
        # usable series: treat both prices as missing.
        price2014 = None
        price2019 = None

    median_price_2014.append(np.nan if price2014 is None else price2014)
    median_price_2019.append(np.nan if price2019 is None else price2019)

    # Growth is the later price expressed as a percentage of the earlier one,
    # so 100 means "no change" (e.g. 140.29 is a 40.29% rise).
    if price2014 is None or price2014 == 0 or price2019 is None:
        growth.append(np.nan)
    else:
        growth.append(round(float(price2019) / float(price2014) * 100, 2))

In [11]:
# Assemble one row per suburb: Domain ID, both median prices and the growth figure
house_data = list(zip(suburb_ids, median_price_2014, median_price_2019, growth))
canberra_houses = pd.DataFrame(
    house_data,
    columns=['Domain ID', 'Median Price 2014', 'Median Price 2019', '5 yr Growth (%)'],
)
canberra_houses.head(20)

Unnamed: 0,Domain ID,Median Price 2014,Median Price 2019,5 yr Growth (%)
0,61,,,
1,71,695000.0,975000.0,140.29
2,91,544000.0,687000.0,126.29
3,101,662000.0,960000.0,145.02
4,131,470000.0,485000.0,103.19
5,141,,,
6,6451,,,
7,171,,420000.0,
8,3041,533000.0,661000.0,124.02
9,191,528000.0,643000.0,121.78


#### Merge dataframes

In [12]:
# Add the house-price columns to the suburb table (join on Domain ID).
# Any price columns left over from an earlier run of this cell are dropped
# first; otherwise a re-run would produce '_x' / '_y' suffixed duplicates and
# later lookups such as canberra['Median Price 2019'] would fail.
house_columns = [col for col in canberra_houses.columns if col != 'Domain ID']
canberra = pd.merge(
    canberra.drop(columns=house_columns, errors='ignore'),
    canberra_houses,
    on=['Domain ID'],
)
canberra.head(20)

Unnamed: 0,Suburb,Domain ID,Longitude,Latitude,Median Price 2014,Median Price 2019,5 yr Growth (%)
0,Acton,61,149.112771,-35.281319,,,
1,Ainslie,71,149.148232,-35.2636,695000.0,975000.0,140.29
2,Amaroo,91,149.127417,-35.168831,544000.0,687000.0,126.29
3,Aranda,101,149.080925,-35.257771,662000.0,960000.0,145.02
4,Banks,131,149.100662,-35.471861,470000.0,485000.0,103.19
5,Barton,141,149.137112,-35.307921,,,
6,Beard,6451,149.211188,-35.342198,,,
7,Belconnen,171,149.068356,-35.235251,,420000.0,
8,Bonner,3041,149.142229,-35.157298,533000.0,661000.0,124.02
9,Bonython,191,149.077786,-35.434445,528000.0,643000.0,121.78


In [13]:
canberra.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116 entries, 0 to 115
Data columns (total 7 columns):
Suburb               116 non-null object
Domain ID            116 non-null int64
Longitude            116 non-null float64
Latitude             116 non-null float64
Median Price 2014    87 non-null float64
Median Price 2019    96 non-null float64
5 yr Growth (%)      86 non-null float64
dtypes: float64(5), int64(1), object(1)
memory usage: 7.2+ KB


In [14]:
canberra.describe()

Unnamed: 0,Domain ID,Longitude,Latitude,Median Price 2014,Median Price 2019,5 yr Growth (%)
count,116.0,116.0,116.0,87.0,96.0,86.0
mean,1575.568966,149.097883,-35.296502,605057.5,787916.7,128.451047
std,1905.070759,0.047611,0.083825,204264.2,286515.8,10.593759
min,61.0,148.925092,-35.508979,402000.0,420000.0,101.54
25%,618.5,149.064579,-35.352567,481500.0,600000.0,121.8175
50%,1036.0,149.100564,-35.297153,544000.0,714000.0,128.64
75%,1531.0,149.133244,-35.226085,655000.0,869750.0,134.62
max,7841.0,149.227164,-35.153357,1900000.0,2210000.0,154.39


In [31]:
# Shortlist suburbs whose 2019 median price lies between 80% and 120% of the
# mean 2019 median price and whose growth figure is at or above the mean
# growth, then rank them by growth (highest first) and price (lowest first).
price_2019 = canberra['Median Price 2019']
growth_pct = canberra['5 yr Growth (%)']
mean_price_2019 = price_2019.mean()

near_mean_price = price_2019.between(mean_price_2019 * 0.8, mean_price_2019 * 1.2)
at_least_mean_growth = growth_pct >= growth_pct.mean()

canberra_sorted = canberra[near_mean_price & at_least_mean_growth]
canberra_house = canberra_sorted.sort_values(
    ['5 yr Growth (%)', 'Median Price 2019'],
    ascending=[False, True],
)
canberra_house.head(10)

Unnamed: 0,Suburb,Domain ID,Longitude,Latitude,Median Price 2014,Median Price 2019,5 yr Growth (%)
114,Wright,7121,149.033242,-35.320674,580000.0,880000.0,151.72
23,Cook,441,149.066321,-35.260416,529000.0,765000.0,144.61
42,Fraser,661,149.045276,-35.191903,500000.0,723000.0,144.6
74,Mawson,1141,149.100467,-35.363007,545000.0,760000.0,139.45
81,Narrabundah,1211,149.148882,-35.335116,660000.0,920000.0,139.39
31,Duffy,551,149.033466,-35.334721,545000.0,759000.0,139.27
87,Oxley,1331,149.078932,-35.409159,476000.0,655000.0,137.61
41,Franklin,2301,149.143494,-35.197892,530000.0,725000.0,136.79
64,Kaleen,951,149.108439,-35.226296,530000.0,725000.0,136.79
52,Hackett,761,149.162325,-35.250558,663000.0,899000.0,135.6


### Suburb amenities

For this part the Foursquare venue API will be used.  

The returned venue data will then be used to build a profile of each suburb. 

##### Foursquare initialisation

FourSquare credentials

In [32]:
# Foursquare credentials. Read from the environment so the real values never
# have to be saved in the notebook; the placeholders are kept as the fallback
# when the variables are not set.
client_id = os.environ.get('FOURSQUARE_CLIENT_ID', 'XXXXXXXXXXXX')
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'XXXXXXXXXXXX')
# Sent as the `v` query parameter of every Foursquare request.
version = 20180605

##### Define venue function

In [38]:
def getVenues(suburbs, lats, longs, radius = 1000, limit = 100):
    '''
    Collect Foursquare "explore" venues around each suburb location.

    Parameters
    ----------
    suburbs, lats, longs : iterables
        Suburb names with their latitudes and longitudes; iterated in
        parallel, so they should have the same length.
    radius : number
        Search radius sent to the API (presumably metres - confirm against
        the Foursquare documentation).
    limit : int
        Maximum number of venues requested per suburb.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Suburb', 'Suburb Latitude',
        'Suburb Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. The frame is empty (but still has these
        columns) when nothing was found.

    Notes
    -----
    Uses the module-level `client_id`, `client_secret` and `version`.
    A suburb whose request fails is reported with a printed message and
    skipped. A venue without a category is skipped on its own instead of
    discarding every venue of that suburb.
    '''
    column_names = ['Suburb',
                    'Suburb Latitude',
                    'Suburb Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    rows = []

    for suburb, lat, long in zip(suburbs, lats, longs):

        print(f'Searching {suburb} ...', end="\r")

        # execute query; requests builds and encodes the query string
        try:
            reply = requests.get(
                'https://api.foursquare.com/v2/venues/explore',
                params={
                    'client_id': client_id,
                    'client_secret': client_secret,
                    'v': version,
                    'll': f'{lat},{long}',
                    'radius': radius,
                    'limit': limit,
                },
                timeout=30,
            )
            results = reply.json()["response"]['groups'][0]['items']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # network error, non-JSON body, or a payload without venue groups
            print(f'No results for {suburb}')
            continue

        # flatten each venue into one output row
        for item in results:
            try:
                venue = item['venue']
                rows.append((
                    suburb,
                    lat,
                    long,
                    venue['name'],
                    venue['location']['lat'],
                    venue['location']['lng'],
                    venue['categories'][0]['name']))
            except (KeyError, IndexError, TypeError):
                # malformed venue (e.g. empty category list): skip just this one
                continue

    # Passing the column names to the constructor keeps the result valid even
    # when `rows` is empty; assigning `.columns` afterwards raises ValueError.
    return pd.DataFrame(rows, columns=column_names)

#### Search Canberra by suburb

In [39]:
# Query venues around every suburb location using a search radius of 1000
canberra_venues = getVenues(
    suburbs=canberra['Suburb'],
    lats=canberra['Latitude'],
    longs=canberra['Longitude'],
    radius=1000,
)
print(canberra_venues.shape)
canberra_venues.head()

(1448, 7) Yarralumla ...e ....


Unnamed: 0,Suburb,Suburb Latitude,Suburb Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Acton,-35.281319,149.112771,Australian National Botanic Gardens,-35.27805,149.109428,Botanical Garden
1,Acton,-35.281319,149.112771,Llewellyn Hall,-35.280604,149.123442,Concert Hall
2,Acton,-35.281319,149.112771,National Film & Sound Archive,-35.283131,149.121143,Museum
3,Acton,-35.281319,149.112771,Monster Kitchen and Bar,-35.285122,149.122547,Hotel Bar
4,Acton,-35.281319,149.112771,BrodDogs,-35.278428,149.122443,Food Truck


#### Analysing each suburb

In [40]:
category_count = len(canberra_venues['Venue Category'].unique())
print('There are {} unique venue categories in Canberra.'.format(category_count))

# One indicator column per venue category (no prefix on the column names)
canberra_encoded = pd.get_dummies(canberra_venues[['Venue Category']], prefix="", prefix_sep="")
# Re-attach the suburb of each venue ...
canberra_encoded['Suburb'] = canberra_venues['Suburb']
# ... and move that last column to the front
fixed_columns = [*canberra_encoded.columns[-1:], *canberra_encoded.columns[:-1]]
canberra_encoded = canberra_encoded[fixed_columns]
print(f'Encoded dataframe shape: {canberra_encoded.shape}')

# Average the indicators per suburb: each value becomes the share of that
# suburb's venues falling in the category
per_suburb = canberra_encoded.groupby('Suburb')
canberra_grouped = per_suburb.mean().reset_index()
print(f'Grouped dataframe shape: {canberra_grouped.shape}')
canberra_grouped

There are 207 unique venue categories in Canberra.
Encoded dataframe shape: (1448, 208)
Grouped dataframe shape: (111, 208)


Unnamed: 0,Suburb,Airport,Airport Lounge,Airport Terminal,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,Auto Dealership,...,Vegetarian / Vegan Restaurant,Veterinarian,Video Store,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Yoga Studio
0,Acton,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
1,Ainslie,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.100000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
2,Amaroo,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
3,Aranda,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,Banks,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
5,Barton,0.000000,0.000000,0.0,0.050000,0.000000,0.033333,0.000000,0.000000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
6,Beard,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.2,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
7,Belconnen,0.000000,0.000000,0.0,0.014085,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00,0.014085,0.028169,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
8,Bonner,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
9,Bonython,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [47]:
# Discard a stale `suburb_venues` from an earlier run, if there is one.
# A bare `del` raises NameError on a fresh kernel because the name is only
# created further down the notebook.
globals().pop('suburb_venues', None)

#### Top 10 venues per suburb

In [49]:
def return_most_common_venues(row, n):
    '''
    Return the labels of the n largest entries of `row`, largest first.

    The first element of `row` is skipped (callers pass rows whose first
    entry is the suburb name); the remaining entries are ranked by value.
    Entries with a value of 0 still take part in the ranking, so they can
    appear in the result once the non-zero entries run out.
    '''
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:n]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: 'Suburb' followed by '1st Most Common Venue',
# '2nd Most Common Venue', ... Positions beyond the listed suffixes use 'th'.
columns = ['Suburb']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# One row per suburb, in the same order as canberra_grouped
suburb_venues = pd.DataFrame(columns=columns)
suburb_venues['Suburb'] = canberra_grouped['Suburb']

# Fill the venue columns of each row with that suburb's top categories
for ind in range(canberra_grouped.shape[0]):
    suburb_venues.iloc[ind, 1:] = return_most_common_venues(canberra_grouped.iloc[ind, :], num_top_venues)

# view shape and head of the suburb_venues dataframe
print(f'Suburb_venues shape: {suburb_venues.shape}')
suburb_venues.head()


Suburb_venues shape: (111, 11)


Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Acton,Food Truck,Café,River,Museum,Botanical Garden,Coffee Shop,Concert Hall,Science Museum,Plaza,Sandwich Place
1,Ainslie,Sports Club,Café,Shopping Plaza,Grocery Store,Bakery,Fish & Chips Shop,Australian Restaurant,Business Service,Hotel,Pub
2,Amaroo,Supermarket,Indian Restaurant,Lake,Grocery Store,Shopping Plaza,Filipino Restaurant,Fountain,Football Stadium,Food Truck,Food Court
3,Aranda,Café,Middle Eastern Restaurant,Nature Preserve,Chinese Restaurant,Thrift / Vintage Store,Yoga Studio,Fried Chicken Joint,Fountain,Football Stadium,Food Truck
4,Banks,Pizza Place,Gym / Fitness Center,Construction & Landscaping,Grocery Store,Trail,Yoga Studio,Fast Food Restaurant,Football Stadium,Food Truck,Food Court


#### Categorise suburbs into clusters

In [50]:
# really should optimise for k but going with 10 for now
# number of clusters
k = 10

# Features for clustering: every column except the suburb name.
# `columns=` is used because the positional axis argument of DataFrame.drop
# was removed in pandas 2.0.
canberra_clustering = canberra_grouped.drop(columns='Suburb')

# Fit k-means. n_init is pinned to 10 (the long-standing default) so the
# clustering does not change with the installed scikit-learn version.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(canberra_clustering)

In [51]:
# Add the cluster label of each suburb as the first column of suburb_venues.
# Any label column left from an earlier run is dropped first, because
# DataFrame.insert raises ValueError when the column already exists.
suburb_venues = suburb_venues.drop(columns='Cluster Labels', errors='ignore')
suburb_venues.insert(0, 'Cluster Labels', kmeans.labels_)

# Combine prices/coordinates with cluster label and top venues.
# This is an inner join on 'Suburb': suburbs that have no row in
# suburb_venues do not appear in canberra_suburbs.
canberra_suburbs = pd.merge(canberra, suburb_venues, on=['Suburb'])

# Remove any incomplete rows before suburb_venues is used again
suburb_venues = suburb_venues.dropna()

In [52]:
# check results
canberra_suburbs

Unnamed: 0,Suburb,Domain ID,Longitude,Latitude,Median Price 2014,Median Price 2019,5 yr Growth (%),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Acton,61,149.112771,-35.281319,,,,2,Food Truck,Café,River,Museum,Botanical Garden,Coffee Shop,Concert Hall,Science Museum,Plaza,Sandwich Place
1,Ainslie,71,149.148232,-35.263600,695000.0,975000.0,140.29,2,Sports Club,Café,Shopping Plaza,Grocery Store,Bakery,Fish & Chips Shop,Australian Restaurant,Business Service,Hotel,Pub
2,Amaroo,91,149.127417,-35.168831,544000.0,687000.0,126.29,8,Supermarket,Indian Restaurant,Lake,Grocery Store,Shopping Plaza,Filipino Restaurant,Fountain,Football Stadium,Food Truck,Food Court
3,Aranda,101,149.080925,-35.257771,662000.0,960000.0,145.02,2,Café,Middle Eastern Restaurant,Nature Preserve,Chinese Restaurant,Thrift / Vintage Store,Yoga Studio,Fried Chicken Joint,Fountain,Football Stadium,Food Truck
4,Banks,131,149.100662,-35.471861,470000.0,485000.0,103.19,8,Pizza Place,Gym / Fitness Center,Construction & Landscaping,Grocery Store,Trail,Yoga Studio,Fast Food Restaurant,Football Stadium,Food Truck,Food Court
5,Barton,141,149.137112,-35.307921,,,,2,Café,Thai Restaurant,Hotel,Coffee Shop,Art Gallery,Italian Restaurant,History Museum,Pizza Place,Bakery,Burger Joint
6,Beard,6451,149.211188,-35.342198,,,,9,Home Service,Sports Bar,Auto Dealership,Construction & Landscaping,Yoga Studio,Filipino Restaurant,Fountain,Football Stadium,Food Truck,Food Court
7,Belconnen,171,149.068356,-35.235251,,420000.0,,2,Café,Gym,Coffee Shop,Burrito Place,Electronics Store,Fast Food Restaurant,Dessert Shop,Bus Station,Vietnamese Restaurant,Park
8,Bonner,3041,149.142229,-35.157298,533000.0,661000.0,124.02,2,Supermarket,Movie Theater,Café,Shopping Mall,Film Studio,Fountain,Football Stadium,Food Truck,Food Court,Food & Drink Shop
9,Bonython,191,149.077786,-35.434445,528000.0,643000.0,121.78,1,Cupcake Shop,Gym,Yoga Studio,Fast Food Restaurant,Fountain,Football Stadium,Food Truck,Food Court,Food & Drink Shop,Flower Shop


In [58]:
# Hand-picked ranking of the cluster labels; it is used as the category order
# below, so label 2 sorts first and label 0 last.
order = [2, 1, 8, 6, 5, 4, 3, 7, 9, 0]

# Keep only shortlisted suburbs that also have venue data
canberra_afinal = canberra_house.merge(suburb_venues, on=['Suburb'], how='inner')
canberra_afinal['Cluster Labels'] = pd.Categorical(
    canberra_afinal['Cluster Labels'],
    categories=order,
)

# Sort by cluster rank, then growth (highest first), then price (lowest first)
canberra_final = canberra_afinal.sort_values(
    by=['Cluster Labels', '5 yr Growth (%)', 'Median Price 2019'],
    ascending=[True, False, True],
)
canberra_final.head(10)

Unnamed: 0,Suburb,Domain ID,Longitude,Latitude,Median Price 2014,Median Price 2019,5 yr Growth (%),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Cook,441,149.066321,-35.260416,529000.0,765000.0,144.61,2,Café,Gym,Bar,Sports Bar,Pizza Place,Supermarket,Fruit & Vegetable Store,Liquor Store,Grocery Store,Flea Market
4,Narrabundah,1211,149.148882,-35.335116,660000.0,920000.0,139.39,2,Hotel,Golf Course,Motel,Gym,German Restaurant,Thai Restaurant,Supermarket,Mediterranean Restaurant,Café,Baseball Stadium
5,Oxley,1331,149.078932,-35.409159,476000.0,655000.0,137.61,2,Locksmith,Dog Run,Thai Restaurant,Gastropub,Yoga Studio,Film Studio,Fountain,Football Stadium,Food Truck,Food Court
6,Franklin,2301,149.143494,-35.197892,530000.0,725000.0,136.79,2,Ice Cream Shop,Bus Station,Spa,Health & Beauty Service,Café,Park,Grocery Store,Thai Restaurant,Theme Park Ride / Attraction,Filipino Restaurant
8,Hackett,761,149.162325,-35.250558,663000.0,899000.0,135.6,2,Bus Station,Café,Thai Restaurant,Grocery Store,Trail,Yoga Studio,Filipino Restaurant,Football Stadium,Food Truck,Food Court
11,Downer,541,149.143269,-35.242505,605000.0,816000.0,134.88,2,Café,Coffee Shop,Sandwich Place,Thai Restaurant,Pool,Department Store,Hostel,Middle Eastern Restaurant,Vegetarian / Vegan Restaurant,Grocery Store
12,Dickson,531,149.140231,-35.254024,650000.0,875000.0,134.62,2,Café,Chinese Restaurant,Asian Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Hotel,Korean Restaurant,Vietnamese Restaurant,Japanese Restaurant,Sandwich Place
13,Farrer,611,149.102349,-35.376256,650000.0,875000.0,134.62,2,Burger Joint,Rugby Pitch,Café,Grocery Store,Film Studio,French Restaurant,Fountain,Football Stadium,Food Truck,Food Court
17,Lyons,1071,149.074145,-35.34009,615000.0,820000.0,133.33,2,Café,Sports Bar,Italian Restaurant,Skating Rink,Gym / Fitness Center,Grocery Store,Filipino Restaurant,Football Stadium,Food Truck,Food Court
20,Gungahlin,751,149.137635,-35.186028,542000.0,714000.0,131.73,2,Café,Supermarket,Coffee Shop,Gym,Ice Cream Shop,Thai Restaurant,Department Store,Pub,Dumpling Restaurant,Fast Food Restaurant


In [297]:
# Cluster labels are not integers so something's wrong
#print('Cluster labels: {}'.format(canberra_suburbs['Cluster Labels'].unique()))
# Labels include NaN values for postcodes where no 4square results were returned
# therefore could either drop these postcodes or
# set NaN values to another integer value, e.g. 99, to include
# all postcodes in output but clearly indicating no venue results
# dropping rows with no venues
#canberra_suburbs.dropna(axis = 0, how ='any', inplace = True)
#canberra_suburbs=canberra_suburbs.astype({'Cluster Labels': int})
#print('Cleaned cluster labels: {}'.format(canberra_suburbs['Cluster Labels'].unique()))
#canberra_suburbs.head()

In [73]:
canberra_final.loc[canberra_final['Suburb'] == 'Kingston']

Unnamed: 0,Suburb,Domain ID,Longitude,Latitude,Median Price 2014,Median Price 2019,5 yr Growth (%),Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


### Create map

In [56]:
# Canberra map initial centre
c_lat, c_long = -35.2930556 , 149.126944

# create map
map_clusters = folium.Map(width=1000,height=1000,location=[c_lat, c_long], zoom_start=10)

# one hex colour per cluster, evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per suburb, coloured by its cluster
for lat, lon, poi, cluster in zip(canberra_suburbs['Latitude'], canberra_suburbs['Longitude'], canberra_suburbs['Suburb'], canberra_suburbs['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # NOTE: the colour is looked up at `cluster - 1`, so label 0 wraps around
    # to the last colour. Every label still gets its own colour, and the
    # lookup is kept as-is so both maps in this notebook stay consistent.
    marker_colour = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Create final map

In [74]:
# create map (uses the centre c_lat, c_long defined for the cluster map)
map_final = folium.Map(width=970,height=590,location=[c_lat, c_long], zoom_start=11)

# one hex colour per cluster, evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per shortlisted suburb, coloured by its cluster
for lat, lon, poi, cluster in zip(canberra_final['Latitude'], canberra_final['Longitude'], canberra_final['Suburb'], canberra_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # NOTE: the colour is looked up at `cluster - 1`, so label 0 wraps around
    # to the last colour. Every label still gets its own colour, and the
    # lookup is kept as-is so both maps in this notebook stay consistent.
    marker_colour = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7).add_to(map_final)

map_final