# Question 1

#### * Explore and cluster the neighborhoods in Toronto.


In [1]:
import requests
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup

In [2]:
# Scraping data from Wikipedia

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, `url` holds the page's HTML text, not the address.
url = response.text

soup = BeautifulSoup(url, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":900271985,"wgRevisionId":900271985,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June",

In [3]:
# Build a DataFrame from the first "wikitable sortable" table on the page

table = soup.find('table', {'class': 'wikitable sortable'})

# One list of right-stripped cell texts per <tr>; header (<th>) and body (<td>)
# cells are collected the same way.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# The first row carries the column names; everything after it is data.
columns = rows[0] if rows else []
data = rows[1:]

ca_df = pd.DataFrame(data=data, columns=columns)
ca_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### * The dataframe consists of three columns: PostalCode, Borough, and Neighbourhood

In [4]:
# Rename 'Postcode' to 'PostalCode'; all other columns are left untouched
ca_df = ca_df.rename(columns={'Postcode': 'PostalCode'})
ca_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### * Process only the cells that have an assigned borough; ignore cells whose borough is "Not assigned".

In [5]:
# Keep only rows whose borough is assigned, then renumber the index from 0
has_borough = ca_df['Borough'] != 'Not assigned'
ca_df = ca_df[has_borough].reset_index(drop=True)

ca_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### * Rows that share a postal code (e.g. the two M5A rows above) are combined into one row, with the neighbourhoods separated by a comma

In [6]:
# Collapse rows sharing a (PostalCode, Borough) pair into a single row whose
# Neighbourhood is the comma-separated list of the group's neighbourhoods.
neighbourhoods_by_code = ca_df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
c_df = neighbourhoods_by_code.apply(lambda names: ', '.join(names)).reset_index()
c_df.columns = ['PostalCode', 'Borough', 'Neighbourhood']
c_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# Show the (rows, columns) dimensions of the grouped frame
c_df.shape

(103, 3)

# Question 2

#### Build a data frame that includes the latitude and longitude of each neighbourhood

In [8]:
# Load the postal-code coordinates from an existing hosted CSV file
GEO_CSV_URL = "https://cocl.us/Geospatial_data"
geo_data = pd.read_csv(GEO_CSV_URL)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Rename column name
# DataFrame.rename returns a NEW frame; without binding the result the rename
# is silently discarded. The renamed copy gets its own name so that `geo_data`
# keeps its original 'Postal Code' column, which a later cell still reads.
geo_renamed = geo_data.rename(columns={'Postal Code': 'PostalCode'})
geo_renamed.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach Latitude/Longitude to each postal code by joining on the code itself,
# so the result is correct regardless of the row order of either frame.

# Normalise the key column name; the rename is a no-op if the column is
# already called 'PostalCode'.
geo_lookup = geo_data.rename(columns={'Postal Code': 'PostalCode'})
geo_lookup = geo_lookup[['PostalCode', 'Latitude', 'Longitude']]

c_df = (
    c_df
    # Drop coordinates left over from a previous run so the cell can be re-run
    # without producing Latitude_x / Latitude_y columns.
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    # Left join keeps every row of c_df; `validate` raises if the lookup has
    # duplicate postal codes, which would otherwise duplicate rows.
    .merge(geo_lookup, on='PostalCode', how='left', validate='many_to_one')
)

In [11]:
c_df.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Question 3

#### Visualize the above data on a map

In [12]:
# Install and import library for visualizing

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium=0.9.0
import folium

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.



In [14]:

# Look up the coordinates of the city used to centre the map.
add = 'Toronto'
geoloc = Nominatim(user_agent="foursquare_agent")
loc = geoloc.geocode(add)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if loc is None:
    raise ValueError("Geocoder returned no result for address: {!r}".format(add))

latitude = loc.latitude
longitude = loc.longitude
print(latitude, longitude)

43.653963 -79.387207


In [15]:
# Base map centred on the geocoded city coordinates
venue_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# Red marker for the city centre itself
centre_marker = folium.Marker(
    [latitude, longitude],
    popup=add,
    icon=folium.Icon(color='red'),
)
centre_marker.add_to(venue_map)

# One blue marker per postal-code row, with the neighbourhood names as popup
for lat, lng, label in zip(c_df['Latitude'], c_df['Longitude'], c_df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    neighbourhood_marker = folium.Marker(
        [lat, lng],
        popup=label,
        icon=folium.Icon(color='blue'),
    )
    neighbourhood_marker.add_to(venue_map)

venue_map