# Segmenting and Clustering Neighborhoods in Toronto: part 2 (from line 15).
## Applied Data Science Capstone Week 3 Peer-Graded Assignment.

### Let us install some of the libraries which we will need later on.

In [1]:
import sys
!{sys.executable} -m pip install lxml
print("lxml installed")
!{sys.executable} -m pip install beautifulsoup4
print("beautifulsoup4 installed")
!{sys.executable} -m pip install geopy
print("geopy installed")
!{sys.executable} -m pip install folium
print("folium installed")

lxml installed
beautifulsoup4 installed
geopy installed
folium installed


### Importing required libraries

In [2]:
import lxml
import lxml.html as lh

import requests # library to handle requests
from requests import get
import urllib.request

from bs4 import BeautifulSoup # for webscraping import Beautiful Soup

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import csv



### Scraping the wikipedia page.

In [3]:
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"   # link of the destination wikipedia page.
page  = urllib.request.urlopen(link).read()   # source code of the destination page

In [4]:
# step by step scraping of the above source code of the destination page to get the table data we require

# parsing the data in the variable 'page' by using lxml
full_page = BeautifulSoup(page, 'lxml')

# finding the required table 
table_data = full_page.find('table', class_ = 'wikitable')

# finding the required data in the above identified table
trs = table_data.find_all('tr')

### Now let us move forward with making a pandas dataframe with these data.
#### We will do that by first aggregating these data into a variable; 

In [5]:
data = ""                                # its an empty string in which we will aggregate our whole data.
for x in trs:                            # 'trs' is the variable in which the data is stored from web-scrapping the wikipedia page.
    row = ""                             # its another empty string in which we will aggregate row wise data.
    for y in x.find_all('td'): 
        row = row + "," + y.text         # using ',' as a delimiter we are aggregating the data according to each row. 
    data = data + row                    # then we are appending the variable 'data' by adding each row to it.
print(data)

,M1A,Not assigned,Not assigned
,M2A,Not assigned,Not assigned
,M3A,North York,Parkwoods
,M4A,North York,Victoria Village
,M5A,Downtown Toronto,Harbourfront
,M6A,North York,Lawrence Heights
,M6A,North York,Lawrence Manor
,M7A,Downtown Toronto,Queen's Park
,M8A,Not assigned,Not assigned
,M9A,Queen's Park,Not assigned
,M1B,Scarborough,Rouge
,M1B,Scarborough,Malvern
,M2B,Not assigned,Not assigned
,M3B,North York,Don Mills North
,M4B,East York,Woodbine Gardens
,M4B,East York,Parkview Hill
,M5B,Downtown Toronto,Ryerson
,M5B,Downtown Toronto,Garden District
,M6B,North York,Glencairn
,M7B,Not assigned,Not assigned
,M8B,Not assigned,Not assigned
,M9B,Etobicoke,Cloverdale
,M9B,Etobicoke,Islington
,M9B,Etobicoke,Martin Grove
,M9B,Etobicoke,Princess Gardens
,M9B,Etobicoke,West Deane Park
,M1C,Scarborough,Highland Creek
,M1C,Scarborough,Rouge Hill
,M1C,Scarborough,Port Union
,M2C,Not assigned,Not assigned
,M3C,North York,Flemingdon Park
,M3C,North York,Don Mills South
,M4C,East York,Woodbine Height

#### Then making a CSV file and writing the data in that CSV file; 

In [6]:
data_file = open("toronto_data.csv",'w')   
data_file.write(data)
data_file.close()

# so we have a variable now called 'data_file' with all the data in it and ',' as a delimiter.

#### And lastly converting that into a pandas dataframe.

In [7]:
data_file_df = pd.read_csv('toronto_data.csv',header= None)            # converting the CSV fie into a pandas dataframe

data_file_df.head()

Unnamed: 0,0,1,2,3
0,,M1A,Not assigned,Not assigned
1,,M2A,Not assigned,Not assigned
2,,M3A,North York,Parkwoods
3,,M4A,North York,Victoria Village
4,,M5A,Downtown Toronto,Harbourfront


#### Okay so now let's clean our dataset.

#### The first step towards cleaning the dataset is making sure that we have only three columns here : 'PostalCode', 'Borough', 'Neighborhood'.
#### So we have to delete the first column in our dataset.
#### And rename the columns.

In [8]:
data_file_df.drop(columns = [0], axis= 1, inplace = True)   # droping the first column in the dataframe as it has only NaN values which are not required
data_file_df.head()

Unnamed: 0,1,2,3
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
data_file_df.columns = ['PostalCode', 'Borough', 'Neighborhood']       # naming the columns
data_file_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


####  Now that we have our dataset fairly clean; the second step is to pre-process our dataset.
#### In order to do that we have to do three things :
##### 1. Only process the cells which have assigned boroughs, ignoring the rest of the cells with a borough that is **Not assigned**.
##### 2. Grouping the dataset by postal codes, because more that one neighborhood can exist in a postal code.
##### 3. If a cell has a borough but a **Not assigned** neighborhood, then the neighborhood will be the **same** as the borough.

In [10]:
data_file_df.drop(data_file_df.loc[data_file_df['Borough']== 'Not assigned'].index, inplace=True)  # Dropping cells with a Borough that is 'Not assigned'.
data_file_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [11]:
data_file_df.loc[data_file_df['Neighborhood'] =='Not assigned' , 'Neighborhood'] = data_file_df['Borough']   # asigning the neighborhood = borough for a not assigned neighborhood
data_file_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [12]:
data_file_df = data_file_df.groupby(['PostalCode',], sort= False,).agg( ', '.join)   # grouping the dataset by the postal code
data_file_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,"North York, North York","Lawrence Heights, Lawrence Manor"
M7A,Downtown Toronto,Queen's Park


In [13]:
data_file_df.reset_index(inplace = True) # resetting the index of the dataset to 0 initial.
data_file_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,"North York, North York","Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [14]:
data_file_df.shape

(103, 3)

#### Nest step is we need to get the latitude and the longitude coordinates of each neighborhood; in order to utilize the Foursquare location data.
##### For that we will be using the given link in the course submission page in order to get the latitude and longitude data that is required.

In [15]:
lat_long_data_url='http://cocl.us/Geospatial_data' # our destination URL consisting of the CSV file of our data.

# sending a HTTP request, opening and saving the contents of the response as 'tf', and opening a csv file to write the contens of the response to it.
with urllib.request.urlopen(lat_long_data_url) as tf, open('lat_long_data.csv', 'w') as f:       
    f.write(tf.read().decode())                                                             # writing the contents of the response to the opened csv file.
    f.close()                                                                               # finally closing the csv file.
    

In [16]:
df_lat_long = pd.read_csv('lat_long_data.csv')
df_lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# renming the coluomn name from 'Postal Code' to 'PostalCode' ; so that it matches the column name from the 'data_file_df'.
df_lat_long = df_lat_long.rename(columns = {"Postal Code" : "PostalCode"})
# df_lat_long["Postal Code"] = "PostalCode"
df_lat_long.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Okay now we need to merge the two datasets.

In [18]:
final_data = pd.merge(data_file_df, df_lat_long, on='PostalCode')
final_data.head(25)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,"North York, North York","Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,"Scarborough, Scarborough","Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,"East York, East York","Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,"Downtown Toronto, Downtown Toronto","Ryerson, Garden District",43.657162,-79.378937


In [19]:
final_data.shape

(103, 5)