## Week3: IBM Data Capstone Project - Exploring Toronto Neighborhoods

### Submitted by Samy Palaniappan

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

## Importing the postal code table from Wikipedia

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'


def data_import(url):
    '''
    Download a web page and parse its first HTML table into a DataFrame.

    The first three cells of each data row are read as postal code, borough
    and neighborhood, in that order. Rows that have fewer than three <td>
    cells (for example the header row, which has none) are skipped.

    Parameters
    ----------
    url : str
        Address of the page that contains the table.

    Returns
    -------
    pandas.DataFrame
        Columns "postalCode", "borough" and "neighborhood", one row per
        parsed table row, with surrounding whitespace removed from each value.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    ValueError
        If the page contains no <table> element.
    '''
    # A timeout keeps the notebook from hanging forever on a stalled connection,
    # and raise_for_status stops an error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise ValueError('No <table> element found at ' + str(url))

    records = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            # Header rows (only <th>) and malformed rows cannot be unpacked.
            continue
        # strip() removes the trailing newline as well as any other padding,
        # so the postal codes compare equal to keys from other data sources.
        postal_code, borough, neighborhood = (cell.text.strip() for cell in cells[:3])
        records.append({
            "postalCode": postal_code,
            "borough": borough,
            "neighborhood": neighborhood,
        })

    # Passing `columns` keeps the three columns present even when no row matched.
    return pd.DataFrame(records, columns=["postalCode", "borough", "neighborhood"])

# Scrape the postal-code table and preview its first ten rows.
df = data_import(url)
df.head(10)

Unnamed: 0,postalCode,borough,neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


## Exploring the Data

In [3]:
# Summarize the column dtypes and non-null counts of the scraped table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   postalCode    180 non-null    object
 1   borough       180 non-null    object
 2   neighborhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB


In [4]:
# (rows, columns) of the scraped table.
df.shape

(180, 3)

In [5]:
# Number of distinct values in each column.
df.nunique()

postalCode      180
borough          11
neighborhood    100
dtype: int64

In [6]:
# Count missing values per column. The 'Not assigned' placeholder is an
# ordinary string, so it is not counted here.
df.isnull().sum()

postalCode      0
borough         0
neighborhood    0
dtype: int64

In [7]:
# Rows per borough, including the 'Not assigned' placeholder.
df.borough.value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
York                 5
East York            5
Mississauga          1
Name: borough, dtype: int64

## Cleaning the Data

### Dropping rows whose borough is 'Not assigned'

In [8]:
# Keep only the rows that have a real borough, then renumber the index from 0.
has_borough = df['borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,postalCode,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# (rows, columns) after removing the rows with an unassigned borough.
df.shape

(103, 3)

### Grouping boroughs

In [10]:
# Collapse the table to one row per borough: the postal codes and the
# neighborhoods of each borough are joined into comma-separated strings.
df_grouped = (
    df.groupby('borough')
      .agg(lambda values: ', '.join(values))
)
df_grouped.head(10)

Unnamed: 0_level_0,postalCode,neighborhood
borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,"M4N, M5N, M4P, M5P, M4R, M5R, M4S, M4T, M4V","Lawrence Park, Roselawn, Davisville North, For..."
Downtown Toronto,"M5A, M7A, M5B, M5C, M5E, M5G, M6G, M5H, M5J, M...","Regent Park, Harbourfront, Queen's Park, Ontar..."
East Toronto,"M4E, M4K, M4L, M4M, M7Y","The Beaches, The Danforth West, Riverdale, Ind..."
East York,"M4B, M4C, M4G, M4H, M4J","Parkview Hill, Woodbine Gardens, Woodbine Heig..."
Etobicoke,"M9A, M9B, M9C, M9P, M9R, M8V, M9V, M8W, M9W, M...","Islington Avenue, Humber Valley Village, West ..."
Mississauga,M7R,Canada Post Gateway Processing Centre
North York,"M3A, M4A, M6A, M3B, M6B, M3C, M2H, M3H, M2J, M...","Parkwoods, Victoria Village, Lawrence Manor, L..."
Scarborough,"M1B, M1C, M1E, M1G, M1H, M1J, M1K, M1L, M1M, M...","Malvern, Rouge, Rouge Hill, Port Union, Highla..."
West Toronto,"M6H, M6J, M6K, M6P, M6R, M6S","Dufferin, Dovercourt Village, Little Portugal,..."
York,"M6C, M6E, M6M, M6N, M9N","Humewood-Cedarvale, Caledonia-Fairbanks, Del R..."


In [11]:
# (rows, columns) of the per-borough summary.
df_grouped.shape

(10, 2)

### Replacing 'Not assigned' neighborhoods with the name of their borough

In [12]:
# A neighborhood marked 'Not assigned' takes the name of its borough.
# The value must be stored with `=`; `==` would only compare the two cells
# and leave the table unchanged.
unassigned = df['neighborhood'] == 'Not assigned'
df.loc[unassigned, 'neighborhood'] = df.loc[unassigned, 'borough']

In [13]:
# Sanity check: rows whose neighborhood still carries the placeholder value.
still_unassigned = df['neighborhood'] == 'Not assigned'
df_neigh = df.loc[still_unassigned]
df_neigh.shape

(0, 3)

In [14]:
# Re-check the table dimensions after the neighborhood clean-up.
df.shape

(103, 3)

In [15]:
# df.shape[0] is the row count. It equals the number of unique postal codes
# only while postalCode has no duplicates (df.nunique() reported 180 distinct
# codes for the 180 scraped rows).
print('The number of datapoints with uniqe postal_codes after cleaning is ', df.shape[0])

The number of datapoints with uniqe postal_codes after cleaning is  103


## Part-II

### Adding latitude and longitude to the corresponding postal codes

In [16]:
# Load the postal code -> coordinates lookup from a CSV file located in the
# notebook's working directory, and preview it.
lat_long_df = pd.read_csv('Geospatial_Coordinates.csv')
lat_long_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Left-join the coordinates onto the cleaned table by postal code, then drop
# the lookup's key column, which duplicates `postalCode`.
full_df = (
    df.merge(lat_long_df, how='left', left_on='postalCode', right_on='Postal Code')
      .drop(columns='Postal Code')
)
full_df.head()

Unnamed: 0,postalCode,borough,neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part-III

### Map Visualization of Toronto Neighborhoods