# Python Notebook for web scrapping and structuring neighborhood data in Toronto

## Installing beautifulsoap library

In [1]:
!conda install -c conda-forge beautifulsoup4 --yes
print('BS4 library installed')

Collecting package metadata: done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::anaconda==5.3.1=py37_0
  - defaults/linux-64::astropy==3.0.4=py37h14c3975_0
  - defaults/linux-64::bkcharts==0.2=py37_0
  - defaults/linux-64::blaze==0.11.3=py37_0
  - defaults/linux-64::bokeh==0.13.0=py37_0
  - defaults/linux-64::bottleneck==1.2.1=py37h035aef0_1
  - defaults/linux-64::dask==0.19.1=py37_0
  - defaults/linux-64::datashape==0.5.4=py37_1
  - defaults/linux-64::mkl-service==1.1.2=py37h90e4bf4_5
  - defaults/linux-64::numba==0.39.0=py37h04863e7_0
  - defaults/linux-64::numexpr==2.6.8=py37hd89afb7_0
  - defaults/linux-64::odo==0.5.1=py37_0
  - defaults/linux-64::pytables==3.4.4=py37ha205bf6_0
  - defaults/linux-64::pytest-arraydiff==0.2=py37h39e3cac_0
  - defaults/linux-64::pytest-astropy==0.4.0=py37_0
  - defaults/linux-64::pytest-doctestplus==0.1.3=py37_0
  - defaults

In [2]:
from bs4 import BeautifulSoup
import requests #to handle urls
import pandas as pd
print('imports successful')

imports successful


## Webscrapping using BeautifulSoup

In [3]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source,'lxml')

table = soup.find('div',class_='mw-parser-output').table.tbody

#### Creating headers

In [4]:
headers = []

for tr in table.find_all('tr'):
    for th in tr.find_all('th'):
        header_item = th.text
        headers.append(header_item)
headers[2] = headers[2].replace('\n','')
headers[0] = headers[0].replace('c','C') 

print(headers)

['PostCode', 'Borough', 'Neighbourhood']


#### Inserting rows & columns to DataFrame

In [5]:
data = []
for tr in table.find_all('tr'):
    for td in tr.find_all('td'):
        data_item = td.text
        data.append(data_item)
data = [d_str.replace('\n', '') for d_str in data]
#print(data)


In [6]:
#Converting list to list of lists...
def divide_chunks(l, n):  
    for i in range(0, len(l), n):  
        yield l[i:i + n]
        
data = list(divide_chunks(data,3))
#print(data)

In [29]:
df_toronto = pd.DataFrame(data=data,columns=headers)
df_toronto

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Manipulating dataframe on assignment conditions

In [30]:
df_toronto = df_toronto[df_toronto.Borough != 'Not assigned']
df_toronto.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [34]:
#df_toronto.reset_index(inplace=True)
#df_toronto.drop(columns='index',inplace=True)
df_toronto.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [35]:
for i in range(df_toronto.shape[0]):
    if df_toronto['Neighbourhood'][i]=='Not assigned':
        df_toronto['Neighbourhood'][i] = df_toronto['Borough'][i]
df_toronto.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,PostCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [56]:
df_toronto_ser = df_toronto.groupby(['PostCode','Borough'],as_index=True)['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x))
df_toronto_grp = pd.DataFrame(df_toronto_ser)
#df_toronto_grp.head()

In [57]:
df_toronto_grp.reset_index(inplace=True)
#df_toronto_grp.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [61]:
#First 10 rows
df_toronto_grp.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [62]:
#Last 10 rows
df_toronto_grp.tail(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
93,M9A,Etobicoke,Islington Avenue
94,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
95,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Emery, Humberlea"
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


#### Shape of the Dataframe

In [63]:
df_toronto_grp.shape

(103, 3)