# Coursera Capstone Project Week 3

## Import necessary libraries

In [1]:
from pandas.io.html import read_html
import pandas as pd 
import numpy as np

## Reading files as a dataframe

In [2]:
page='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table=read_html(page, index_col=0, attrs={"class":"wikitable"})
df=table[0].reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
df.shape

(288, 3)

## Dropping "Not Assigned" Values

In [4]:
df['Borough'].replace('Not assigned',np.nan,inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [5]:
df.shape

(211, 3)

## Setting Not Assigned Neighbourhoods to its respective Boroughs

In [6]:
for bor,neigh in zip(df['Borough'],df['Neighbourhood']):
    if neigh=='Not assigned':
        df['Neighbourhood'].replace(neigh,bor,inplace=True)

## Grouping Neighbourhoods with the same Postcode

In [7]:
#Grouping
df=df.groupby(['Postcode','Borough'])['Neighbourhood'].unique().reset_index()

#Removing brackets
neighs=[] 
df_neighd= df['Neighbourhood']
for neigh in df_neighd:
    neighs=np.append(neighs,', '.join(neigh)) 

df['Neighbourhood']=pd.DataFrame(neighs) 
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
df.shape

(103, 3)

## Getting Location coordiantes

In [9]:
path= 'http://cocl.us/Geospatial_data'
df_loc=pd.read_csv(path) 
df_loc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
df['Latitude']=df_loc['Latitude']
df['Longitude']=df_loc['Longitude']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [11]:
# Installing folium
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata: done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - anaconda/linux-64::conda-build==3.17.8=py36_0
  - anaconda/linux-64::grpcio==1.16.1=py36hf8bcb03_1
  - anaconda/linux-64::keras==2.1.5=py36_0
  - anaconda/linux-64::libarchive==3.3.3=h5d8350f_5
  - anaconda/linux-64::python-libarchive-c==2.8=py36_6
  - anaconda/linux-64::tensorboard==1.8.0=py36hf484d3e_0
  - anaconda/linux-64::tensorflow==1.8.0=h57681fa_0
  - anaconda/linux-64::tensorflow-base==1.8.0=py36h5f64886_0
  - defaults/linux-64::anaconda==5.3.1=py37_0
  - defaults/linux-64::astropy==3.0.4=py37h14c3975_0
  - defaults/linux-64::bkcharts==0.2=py37_0
  - defaults/linux-64::blaze==0.11.3=py37_0
  - defaults/linux-64::bokeh==0.13.0=py37_0
  - defaults/linux-64::bottleneck==1.2.1=py37h035aef0_1
  - defaults/linux-64::dask==0.19.1=py37_0
  - defaults/linux-64::datashape==0.5.4=py37_1
  - defaults/l

## Creating a map for Toronto

In [12]:
from geopy.geocoders import Nominatim

In [13]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [14]:
df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [15]:
# Creating a new dataframe of only Boroughs containing the word Toronto
df_Tor=df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)

# drop the postcode from the dataframe 
df_Tor.drop('Postcode',axis=1,inplace=True)
df_Tor.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


In [16]:
df_Tor.shape

(38, 4)

In [17]:
Toronto_map=folium.Map(location=[latitude,longitude],zoom_start=10)
for lat, long, bor, neigh in zip(df_Tor['Latitude'],df_Tor['Longitude'],df_Tor['Borough'],df_Tor['Neighbourhood']):
    
    label='{},{}'.format(bor,neigh)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_map)

Toronto_map

## Exploring venues at each location

In [18]:
import json
import requests
from pandas.io.json import json_normalize

In [19]:
def categories_fix(row):
    
    try:
        categories_list = row['categories']
    except:
        categories_list=row['venue.categories']
    
    if len(categories_list)==0:
        return None
    else:
        return categories_list[0]['name']

In [20]:
radius=500
limit=20
raw_venues=[]
Neighs=[]
Boroughs=[]
# Looping through all locations
for lat, long, neigh,bor  in zip(df_Tor['Latitude'],df_Tor['Longitude'],df_Tor['Neighbourhood'],df_Tor['Borough']):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    long, 
    radius, 
    limit)
    
    results = requests.get(url).json()['response']['groups'][0]['items']
    
    raw_venues=np.append(raw_venues,results)
    
    for i in enumerate(results):
        Neighs=np.append(Neighs,neigh)
        Boroughs=np.append(Boroughs,bor)

# Converting the json to dataframe        
raw_venues=json_normalize(raw_venues)
raw_venues['Neighbourhood']=pd.DataFrame(Neighs)
raw_venues['Borough']=pd.DataFrame(Boroughs)

# Selecting relevant columns
select_columns=['Borough','Neighbourhood','venue.name','venue.categories','venue.location.lat','venue.location.lng']    
venues=raw_venues.loc[:,select_columns]
venues['venue.categories']=venues.apply(categories_fix,axis=1)    

# Cleaning columns names
venues.columns=[col.replace('venue.','')for col in venues.columns]
venues.columns=[col.replace('location.','')for col in venues.columns]

print(venues.shape)
venues.head()

NameError: name 'CLIENT_ID' is not defined

In [21]:
venues.groupby('Neighbourhood').count()

NameError: name 'venues' is not defined

In [22]:
venues_cat=pd.get_dummies(venues['categories'],prefix="", prefix_sep="")
venues_cat.head()

NameError: name 'venues' is not defined

In [23]:
venues_cat['Borough']=venues['Borough']
venues_cat['Neighbourhood']=venues['Neighbourhood']

new_columns=[venues_cat.columns[-2]]+[venues_cat.columns[-1]] +list(venues_cat.columns[:-2])
venues_cat=venues_cat[new_columns]
venues_cat

NameError: name 'venues' is not defined

In [26]:
venues_freq.shape

NameError: name 'venues_freq' is not defined

## Clustering the Neighbourhoods

In [27]:
from sklearn.cluster import KMeans

In [28]:
kclusters = 5

venues_clusters = venues_freq.drop(['Neighbourhood','Borough'],axis=1)

kmeans=KMeans(n_clusters=kclusters, random_state=0).fit(venues_clusters)

kmeans.labels_

NameError: name 'venues_freq' is not defined

In [29]:
venues_freq['Cluster label'] = pd.DataFrame(kmeans.labels_)

select_columns=list(venues_freq.columns[0:2])+[venues_freq.columns[-1]]+list(venues_freq.columns[2:-1])

venues_freq=venues_freq[select_columns]
venues_freq

NameError: name 'kmeans' is not defined

## Creating the Cluster Map

In [30]:
Toronto_venues=df_Tor
Toronto_venues= Toronto_venues.merge(venues_freq.set_index(['Borough','Neighbourhood']),on=['Borough','Neighbourhood'])
Toronto_venues

NameError: name 'venues_freq' is not defined

In [31]:
map_clusters=folium.Map(location=[latitude,longitude],zoom_start=10)

x=np.arange(kclusters)

colors_array = ['red','blue','yellow','green', 'black']

for lat, long, clust, bor, neigh in zip(Toronto_venues['Latitude'],Toronto_venues['Longitude'],Toronto_venues['Cluster label'],Toronto_venues['Borough'],Toronto_venues['Neighbourhood']):
    
    label='{},{},Cluster {}'.format(bor,neigh,clust)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color=colors_array[clust],
        fill=True,
        fill_color=colors_array[clust],
        fill_opacity=0.7
        ).add_to(map_clusters)

map_clusters

KeyError: 'Cluster label'

ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name more can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name less can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name man can't be aliased because it is another magic command.
