# IBM Applied Data science Capstone

# Part 1: Dataset Creation 

## Installing Required Libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:
pip install lxml

Collecting lxml
  Downloading lxml-4.5.1-cp37-cp37m-win_amd64.whl (3.5 MB)
Installing collected packages: lxml
Successfully installed lxml-4.5.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

html_data = requests.get(url).text

wiki_data = BeautifulSoup(html_data, 'lxml')

In [5]:
#wiki_data

## Converting the data to pandas dataframe

In [63]:
column_names = ['Postalcode','Borough','Neighborhood']
toronto = pd.DataFrame(columns = column_names)

In [64]:
content = wiki_data.find('div', class_='mw-parser-output')
table = content.table.tbody

In [65]:
#table

In [66]:
postcode = None
borough = None
neighborhood = None
for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            postcode = td.text.strip('\n').replace(']','')
            i = i + 1
        elif i == 1:
            borough = td.text
            borough = td.text.strip('\n').replace(']','')
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
    toronto = toronto.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

In [67]:
toronto

Unnamed: 0,Postalcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
...,...,...,...
176,M5Z,Not assigned,Not assigned
177,M6Z,Not assigned,Not assigned
178,M7Z,Not assigned,Not assigned
179,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


'Not assigned'

In [58]:
toronto.iloc[4][1]

'North York'

In [21]:
## Data Cleaning

In [75]:
toronto = toronto[toronto.Borough!='Not assigned']
toronto.reset_index(drop = True, inplace = True)
i = 0
for i in range(0,toronto.shape[0]):
    if toronto.iloc[i][2] == 'Not assigned':
        toronto.iloc[i][2] = toronto.iloc[i][1]
        i = i+1

In [80]:
toronto = toronto.dropna()

In [82]:
toronto

Unnamed: 0,Postalcode,Borough,Neighborhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"
5,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
99,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
100,M4Y,Downtown Toronto,Church and Wellesley
101,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
102,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [86]:
df = toronto.groupby(['Postalcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [87]:
df.describe()


Unnamed: 0,Postalcode,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M6R,North York,Downsview
freq,1,24,4


In [88]:
df = df.dropna()
empty = 'Not assigned'
df = df[(df.Postalcode != empty ) & (df.Borough != empty) & (df.Neighborhood != empty)]

In [89]:
df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [105]:
def neighborhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighborhood'].tolist()))
                    
grp = df.groupby(['Postalcode', 'Borough'])
df_2 = grp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [109]:
df_2

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [110]:
print(df_2.shape)
df_2.head()

(103, 3)


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [111]:
df_2.to_csv('toronto.csv', index=False)