# Toronto Neighbourhood Population and Demographics Data

In [250]:
import pandas as pd
import numpy as np

In [251]:
#!conda install --yes beautifulsoup4

In [252]:
#!conda install --yes lxml

In [253]:
import urllib.request as urlreq
from io import StringIO

import lxml
from bs4 import BeautifulSoup
print('done')

done


# Downloading data from Wikipedia

In [276]:
# Fetch the raw HTML of the Wikipedia page on Toronto neighbourhood demographics.
# An explicit timeout stops "Run All" from hanging forever if the server never answers;
# without it urlopen blocks indefinitely on a stalled connection.
with urlreq.urlopen(
    "https://en.wikipedia.org/wiki/Demographics_of_Toronto_neighbourhoods",
    timeout=30,
) as response:
    TorontoPopulation_html = response.read()

In [277]:
# Parse the downloaded bytes into a navigable tree with the 'html.parser' backend.
# (soup.prettify() gives a readable dump of the markup when debugging.)
soup = BeautifulSoup(markup=TorontoPopulation_html, features='html.parser')

In [278]:
# Extract the demographics table, which is the second <table> element on the page (index 1).
# NOTE(review): selecting by position is fragile if the page layout changes — confirm the
# columns shown by head() below still match the expected demographics schema.
table = soup.find_all('table')[1]

# read_html returns a list of DataFrames; this markup contains exactly one table.
# The markup is wrapped in StringIO because passing a literal HTML string to read_html
# is deprecated since pandas 2.1.
Popu_df = pd.read_html(StringIO(str(table)))[0]
Popu_df.head(3)

Unnamed: 0,Name,FM,Census Tracts,Population,Land area (km2),Density (people/km2),% Change in Population since 2001,Average Income,Transit Commuting %,% Renters,Second most common language (after English) by name,Second most common language (after English) by percentage,Map
0,Toronto CMA Average,,All,5113149,5903.63,866,9.0,40704,10.6,11.4,,,
1,Agincourt,S,"0377.01, 0377.02, 0377.03, 0377.04, 0378.02, 0...",44577,12.45,3580,4.6,25750,11.1,5.9,Cantonese (19.3%),19.3% Cantonese,
2,Alderwood,E,"0211.00, 0212.00",11656,4.94,2360,-4.0,35239,8.8,8.5,Polish (6.2%),06.2% Polish,


In [279]:
# Keep only the columns of interest (chosen by position in the scraped table) and give
# them short, consistent names for easier handling downstream.
Popu_Multi = Popu_df.iloc[:, [0, 3, 5, 7, 8, 11]].copy()
Popu_Multi.columns = [
    'Neighbourhood',
    'Population',
    'Density',
    'Avg Income',
    'Transit_Commuting_Percent',
    'Second_Language',
]
Popu_Multi.head()

Unnamed: 0,Neighbourhood,Population,Density,Avg Income,Transit_Commuting_Percent,Second_Language
0,Toronto CMA Average,5113149,866,40704,10.6,
1,Agincourt,44577,3580,25750,11.1,19.3% Cantonese
2,Alderwood,11656,2360,35239,8.8,06.2% Polish
3,Alexandra Park,4355,13609,19687,13.8,17.9% Cantonese
4,Allenby,2513,4333,245592,5.2,01.4% Russian


In [280]:
# The first row holds the "Toronto CMA Average" aggregate rather than a neighbourhood,
# so keep everything from the second row onwards.
Popu_Multi = Popu_Multi.iloc[1:, :]
Popu_Multi.head()

Unnamed: 0,Neighbourhood,Population,Density,Avg Income,Transit_Commuting_Percent,Second_Language
1,Agincourt,44577,3580,25750,11.1,19.3% Cantonese
2,Alderwood,11656,2360,35239,8.8,06.2% Polish
3,Alexandra Park,4355,13609,19687,13.8,17.9% Cantonese
4,Allenby,2513,4333,245592,5.2,01.4% Russian
5,Amesbury,17318,4934,27546,16.4,06.1% Spanish


In [281]:
# Renumber the rows from 0 now that the aggregate row is gone; the old index is discarded.
Popu_Multi = Popu_Multi.reset_index(drop=True)

In [282]:
# Entries look like "19.3% Cantonese"; splitting on the % symbol separates the
# percentage from the language name, giving a [percentage, language] list per row.
pp = Popu_Multi['Second_Language'].str.split(pat='%')
pp

0       [19.3,  Cantonese]
1          [06.2,  Polish]
2       [17.9,  Cantonese]
3         [01.4,  Russian]
4         [06.1,  Spanish]
              ...         
169      [09.1,  Gujarati]
170    [02.7,  Portuguese]
171        [04.0,  Korean]
172       [06.6,  Italian]
173        [01.9,  French]
Name: Second_Language, Length: 174, dtype: object

In [283]:
# Remove the combined "percent + language" column; it is rebuilt as two separate columns next.
Popu_Multi = Popu_Multi.drop(columns=['Second_Language'])

In [284]:
# Rebuild the percentage and language columns from the split pieces.
# The vectorised .str accessor is used instead of list comprehensions so that a row with
# a missing or malformed second-language entry becomes NaN rather than raising
# TypeError/IndexError and aborting the cell.
Popu_Multi['Second_Language_Percent'] = pp.str[0]
Popu_Multi['Second_Language'] = pp.str[1].str.strip()
Popu_Multi.head()

Unnamed: 0,Neighbourhood,Population,Density,Avg Income,Transit_Commuting_Percent,Second_Language_Percent,Second_Language
0,Agincourt,44577,3580,25750,11.1,19.3,Cantonese
1,Alderwood,11656,2360,35239,8.8,6.2,Polish
2,Alexandra Park,4355,13609,19687,13.8,17.9,Cantonese
3,Allenby,2513,4333,245592,5.2,1.4,Russian
4,Amesbury,17318,4934,27546,16.4,6.1,Spanish


In [285]:
# Separate the primary neighbourhood name from alternate and/or old names: keep only the
# text before the first hyphen, en dash, slash or opening parenthesis.
# The pattern is a raw string so that "\(" reaches the regex engine as an escaped
# parenthesis instead of being an invalid Python string escape.
firstname = Popu_Multi['Neighbourhood'].str.split(r"-|–|/|\(")

# Drop and re-add the column so that 'Neighbourhood' ends up as the LAST column;
# the column reorder further down the notebook moves it back to the front.
Popu_Multi.drop('Neighbourhood', axis=1, inplace=True)
# .str[0] takes the first piece of each split list and leaves missing names as NaN.
Popu_Multi['Neighbourhood'] = firstname.str[0].str.strip()


In [286]:
# Strip the article "The" and the compass qualifiers from neighbourhood names.
# \b word boundaries restrict the match to whole words: a plain substring replace also
# mangles names that merely contain these letters (e.g. "Weston" -> "on",
# "Northwood" -> "wood").
# Removing a word from the middle of a name leaves two adjacent spaces, so runs of
# whitespace are collapsed before the final strip.
Popu_Multi['Neighbourhood'] = (
    Popu_Multi['Neighbourhood']
    .str.replace(r'\b(?:The|North|South|East|West)\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [287]:
# Preview the cleaned names; 'Neighbourhood' now sits in the last column because it was
# dropped and re-added above.
Popu_Multi.head()

Unnamed: 0,Population,Density,Avg Income,Transit_Commuting_Percent,Second_Language_Percent,Second_Language,Neighbourhood
0,44577,3580,25750,11.1,19.3,Cantonese,Agincourt
1,11656,2360,35239,8.8,6.2,Polish,Alderwood
2,4355,13609,19687,13.8,17.9,Cantonese,Alexandra Park
3,2513,4333,245592,5.2,1.4,Russian,Allenby
4,17318,4934,27546,16.4,6.1,Spanish,Amesbury


In [288]:
# Check the (rows, columns) counts before the columns are reordered.
Popu_Multi.shape

(174, 7)

In [289]:
# Move 'Neighbourhood' to the first position, keeping the other columns in their
# existing order. Reordering by name (rather than by hard-coded positions) makes the
# cell safe to re-run — a positional shuffle would rotate the columns again each time —
# and independent of the exact number of columns.
Popu_Multi = Popu_Multi[
    ['Neighbourhood'] + [col for col in Popu_Multi.columns if col != 'Neighbourhood']
]

In [290]:
# Verify that 'Neighbourhood' is back in the first column.
Popu_Multi.head()

Unnamed: 0,Neighbourhood,Population,Density,Avg Income,Transit_Commuting_Percent,Second_Language_Percent,Second_Language
0,Agincourt,44577,3580,25750,11.1,19.3,Cantonese
1,Alderwood,11656,2360,35239,8.8,6.2,Polish
2,Alexandra Park,4355,13609,19687,13.8,17.9,Cantonese
3,Allenby,2513,4333,245592,5.2,1.4,Russian
4,Amesbury,17318,4934,27546,16.4,6.1,Spanish


In [291]:
# Inspect column types; Second_Language_Percent was built from split strings, so it is
# expected to still be 'object' at this point.
Popu_Multi.dtypes

Neighbourhood                 object
Population                     int64
Density                        int64
Avg Income                     int64
Transit_Commuting_Percent    float64
Second_Language_Percent       object
Second_Language               object
dtype: object

In [292]:
# The second-language share is a percentage, so store it as a float rather than as the
# string left over from splitting the original column.
Popu_Multi = Popu_Multi.astype({'Second_Language_Percent': float})

In [293]:
# Preview the first rows after the float conversion.
Popu_Multi.head(3)

Unnamed: 0,Neighbourhood,Population,Density,Avg Income,Transit_Commuting_Percent,Second_Language_Percent,Second_Language
0,Agincourt,44577,3580,25750,11.1,19.3,Cantonese
1,Alderwood,11656,2360,35239,8.8,6.2,Polish
2,Alexandra Park,4355,13609,19687,13.8,17.9,Cantonese


In [294]:
# Confirm that Second_Language_Percent is now a float column.
Popu_Multi.dtypes

Neighbourhood                 object
Population                     int64
Density                        int64
Avg Income                     int64
Transit_Commuting_Percent    float64
Second_Language_Percent      float64
Second_Language               object
dtype: object

In [295]:
# Focus on the Italian community: keep the second-language percentage only where that
# language is Italian and set it to 0 for every other neighbourhood.
#
# A single .loc[mask, column] assignment writes to the frame itself. The chained form
# df[column].loc[i] = 0 assigns through an intermediate Series, which triggers
# SettingWithCopyWarning and has no effect on the frame under pandas Copy-on-Write.
Popu_Multi.loc[Popu_Multi['Second_Language'] != 'Italian', 'Second_Language_Percent'] = 0

# The language name is no longer needed once the percentage encodes "Italian or not".
Popu_Multi = Popu_Multi.drop('Second_Language', axis=1)


In [297]:
# Preview the final frame before it is written to disk.
Popu_Multi.head()

Unnamed: 0,Neighbourhood,Population,Density,Avg Income,Transit_Commuting_Percent,Second_Language_Percent
0,Agincourt,44577,3580,25750,11.1,0.0
1,Alderwood,11656,2360,35239,8.8,0.0
2,Alexandra Park,4355,13609,19687,13.8,0.0
3,Allenby,2513,4333,245592,5.2,0.0
4,Amesbury,17318,4934,27546,16.4,0.0


In [298]:
# Persist the cleaned table to the working directory; the row index is omitted because
# it carries no information.
Popu_Multi.to_csv(path_or_buf='CS_Population_Multiple_Data.csv', index=False)