## Segmentation and Clusturing in Toronto

## Task 1

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_html = requests.get(url).text
soup = BeautifulSoup(wiki_html, 'html.parser')

In [4]:
data = []
for tr in soup.tbody.find_all('tr'):
    data.append([ td.get_text().strip() for td in tr.find_all('td')])

In [5]:
df = pd.DataFrame(data, columns=['PostalCode','Borough','Neighborhood'])

In [6]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [7]:
df.shape

(181, 3)

In [8]:
not_assigned = 'Not assigned'
not_assigned_row = df[ (df.Borough == not_assigned) & (df.Neighborhood == not_assigned) ]
not_assigned_row.head(), not_assigned_row.shape

(Empty DataFrame
 Columns: [PostalCode, Borough, Neighborhood]
 Index: [], (0, 3))

In [9]:
df.drop(not_assigned_row.index, inplace=True)

In [10]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront


In [11]:
df.shape

(180, 3)

In [12]:
def neighborhood_list(grouped):    
    if( len(grouped) == 1 ):
        # only one line under the postal code assign the Borough as Neighborhood
        borough = grouped['Borough'].tolist()[0] 
        neighborhood = grouped['Neighborhood'].tolist()[0] 
        if( neighborhood) == not_assigned:
            return borough
        else:
            return neighborhood
    else:
        # transform grouped Neighborhood as single value separated with commas
        return ', '.join(sorted(grouped['Neighborhood'].tolist())) 
                    
grp = df.groupby(['PostalCode', 'Borough'])
df2 = grp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [13]:
df2[df2.PostalCode == 'M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood
120,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [14]:
df2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M1B,Scarborough,Malvern / Rouge
2,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
3,M1E,Scarborough,Guildwood / Morningside / West Hill
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
8,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
9,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West


## Task 2

In [15]:
#!pip install geocoder
import geocoder

In [16]:
df=pd.read_csv('task1.csv')

In [17]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [18]:
def get_latlng(postal_code):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    return lat_lng_coords
    
    
get_latlng('M4G')

[43.70941386000004, -79.36309957799995]

## Get PostalCode Co-ordinates

In [20]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

In [21]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.808626,-79.189913
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.785779,-79.157368
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.765806,-79.185284
3,M1G,Scarborough,Woburn,43.771545,-79.218135
4,M1H,Scarborough,Cedarbrae,43.768791,-79.238813


## Task 3

In [22]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


In [23]:
toronto_coords = get_latlng('')
toronto_coords

[43.648690000000045, -79.38543999999996]

In [24]:
df3 = df2[ df2.Borough.str.contains('Toronto') ]

In [25]:
df3

Unnamed: 0,PostalCode,Borough,Neighborhood
63,M4E,East Toronto,The Beaches
67,M4K,East Toronto,The Danforth West / Riverdale
68,M4L,East Toronto,India Bazaar / The Beaches West
69,M4M,East Toronto,Studio District
70,M4N,Central Toronto,Lawrence Park
71,M4P,Central Toronto,Davisville North
72,M4R,Central Toronto,North Toronto West
73,M4S,Central Toronto,Davisville
74,M4T,Central Toronto,Moore Park / Summerhill East
75,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...


## Create Data on map

In [28]:
toronto_data = pd.DataFrame(df3)

In [29]:
toronto_data = toronto_data.reset_index().drop('index', axis=1)

In [30]:
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4E,East Toronto,The Beaches
1,M4K,East Toronto,The Danforth West / Riverdale
2,M4L,East Toronto,India Bazaar / The Beaches West
3,M4M,East Toronto,Studio District
4,M4N,Central Toronto,Lawrence Park


In [31]:
toronto_data.loc[0, 'Neighborhood']

'The Beaches'

In [33]:
# type your answer here
VERSION = '20180605'
CLIENT_ID = fs_act['client_id']
CLIENT_SECRET = fs_act['client_secret']
latitude = neighborhood_latitude
longitude = neighborhood_longitude
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION,radius, LIMIT)


TypeError: 'NoneType' object is not subscriptable