# Coursera Capstone Project Week 3

## Part 1: Create a dataframe

### Import necessary libraries

In [1]:
from pandas.io.html import read_html
import pandas as pd 
import numpy as np

### Reading files as a dataframe

In [2]:
page='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table=read_html(page, index_col=0, attrs={"class":"wikitable"})
df=table[0].reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
df.shape

(288, 3)

### Dropping "Not Assigned" Values

In [4]:
df['Borough'].replace('Not assigned',np.nan,inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [5]:
for bor,neigh in zip(df['Borough'],df['Neighbourhood']):
    if neigh=='Not assigned':
        df['Neighbourhood'].replace(neigh,bor,inplace=True)

### Grouping Neighbourhoods with same Postcodes

In [6]:
#Grouping
df=df.groupby(['Postcode','Borough'])['Neighbourhood'].unique().reset_index()

#Removing brackets
neighs=[] 
df_neighd= df['Neighbourhood']
for neigh in df_neighd:
    neighs=np.append(neighs,', '.join(neigh)) 

df['Neighbourhood']=pd.DataFrame(neighs) 
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
df.shape

(103, 3)

### Save the table in csv file

In [8]:
df.to_csv('postcode of Toronto.csv')

## Part 2: Locations coordinates

In [9]:
!wget --quiet http://cocl.us/Geospatial_data -O lldata.csv

In [10]:
df=pd.read_csv('lldata.csv')
df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
pc=pd.read_csv('postcode of Toronto.csv')
pc.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood
0,0,M1B,Scarborough,"Rouge, Malvern"
1,1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,3,M1G,Scarborough,Woburn
4,4,M1H,Scarborough,Cedarbrae


In [12]:
new_data=pc.merge(df, left_on='Postcode',right_on='Postal Code')
new_data.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [13]:
geoinfo=new_data.drop(columns=['Unnamed: 0', 'Postal Code'], axis=1)
geoinfo

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Part 3: Clustering

In [35]:
new_data.to_csv('geoinfo.csv')

In [16]:
import folium
from geopy.geocoders import Nominatim

In [17]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [36]:
file_path='geoinfo.csv'

In [37]:
torontogeo=pd.read_csv(file_path)
torontogeo.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,0,0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,1,1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,3,3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,4,4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [38]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(torontogeo['Latitude'], torontogeo['Longitude'], torontogeo['Borough'], torontogeo['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

In [39]:
dft = torontogeo[torontogeo['Borough'].str.contains('Toronto', regex=False, case=False, na=False)]
dft.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [40]:
t_borough=dft.reset_index()
t_borough.drop(columns=['index'], axis=1, inplace=True)
print(t_borough.shape)
t_borough.head()

(38, 7)


Unnamed: 0,Unnamed: 0.1,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,M4E,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West, Riverdale",M4K,43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",M4L,43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,M4M,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879


In [42]:
#save the dataframe
t_borough.to_csv('t_borough.csv')

In [43]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(t_borough['Latitude'], t_borough['Longitude'], t_borough['Borough'], t_borough['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

### Fetch the information of activities around neighborhoods

In [44]:
CLIENT_ID = 'DQLRPDS3VFILKM1YBTFC2RDVKZBTNILT3NIAMY5CUYZMCE2R' #Foursquare ID
CLIENT_SECRET = '12F5OXQO2JMPQRKOHBGKYF4DPLY2AQA02KFF0YWKSGRX2HKX' #Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: DQLRPDS3VFILKM1YBTFC2RDVKZBTNILT3NIAMY5CUYZMCE2R
CLIENT_SECRET:12F5OXQO2JMPQRKOHBGKYF4DPLY2AQA02KFF0YWKSGRX2HKX


In [45]:
import requests

In [46]:
def getNearbyVenues(names, latitudes, longitudes, LIMIT=100, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [47]:
tborough_venues=getNearbyVenues(names=t_borough['Borough'], 
                                latitudes=t_borough['Latitude'], 
                                longitudes=t_borough['Longitude'], 
                                LIMIT=100,
                                radius=500)

East Toronto
East Toronto
East Toronto
East Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
West Toronto
West Toronto
West Toronto
West Toronto
West Toronto
West Toronto
East Toronto


In [49]:
print(tborough_venues.shape)
tborough_venues.head()

(1707, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,East Toronto,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,East Toronto,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,East Toronto,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,East Toronto,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,East Toronto,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [50]:
#save the dataframe
tborough_venues.to_csv('tborough_venues.csv')

In [51]:
tborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,109,109,109,109,109,109
Downtown Toronto,1290,1290,1290,1290,1290,1290
East Toronto,127,127,127,127,127,127
West Toronto,181,181,181,181,181,181


### Analyze each neighborhood

In [52]:
# one hot encoding
toronto_onehot = pd.get_dummies(tborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Borough'] = tborough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(1707, 233)


Unnamed: 0,Borough,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
tborough_grouped=toronto_onehot.groupby('Borough').mean().reset_index()
tborough_grouped

Unnamed: 0,Borough,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018349,0.0,...,0.0,0.009174,0.009174,0.0,0.009174,0.0,0.009174,0.0,0.0,0.009174
1,Downtown Toronto,0.000775,0.000775,0.000775,0.000775,0.00155,0.002326,0.00155,0.014729,0.00155,...,0.000775,0.000775,0.000775,0.002326,0.013178,0.002326,0.004651,0.006202,0.000775,0.002326
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023622,0.0,...,0.007874,0.0,0.015748,0.0,0.0,0.0,0.0,0.0,0.0,0.023622
3,West Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01105,0.0,0.01105,0.01105,0.0,0.005525
