In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os


# Retrieve the web page.
url = "https://en.wikipedia.org/wiki/List_of_Canadian_provinces_and_territories_by_historical_population"
# Send a GET request to the url and store the response.
# The timeout keeps the notebook from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently handing an error page to the parser.
response.raise_for_status()


In [9]:
# Verify the HTTP status before announcing success; raise_for_status()
# raises requests.HTTPError when the server answered with a 4xx/5xx code.
response.raise_for_status()
raw_html = response.text
print("Webpage retrieved successfully")

# Decode the raw HTML using BeautifulSoup.
soup = BeautifulSoup(raw_html, 'html.parser')
# NOTE(review): this prints the entire page; consider removing it once the
# structure of the document is known, since it bloats the saved notebook.
print(soup.prettify())  # formats the HTML and prints it

# Extract the relevant tables (those carrying the 'wikitable' class).
tables = soup.find_all('table', {'class': 'wikitable'})

# Print the number of tables found.
print(len(tables))
# Display the first table, if there is one.
if tables:
    print(tables[0].prettify())

# Dictionary that will hold the merged table data, keyed by column header.
tables_dict = {}

Webpage retrieved successfully
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-enabled vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Canadian provinces and territories by historical population - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-di

In [10]:
# Start from an empty mapping so that re-running this cell does not append
# every table a second time onto lists left over from the previous run.
tables_dict = {}

# Bracketed lowercase markers such as "[a]" are stripped from header and cell text.
footnote_marker = re.compile(r"\[([a-z])*\]")

for table in tables:  # iterate through each table found on the page
    # Text of every <th> in the table, stripped and with the markers removed.
    headers = [footnote_marker.sub("", th.get_text().strip()) for th in table.find_all('th')]

    for row in table.find_all('tr')[1:]:  # every row except the first (header) row
        cells = row.find_all(['td', 'th'])  # data cells and header cells of this row
        # Only rows with exactly one cell per header are processed.
        if len(cells) != len(headers):
            continue
        for header, cell in zip(headers, cells):
            value = footnote_marker.sub("", cell.get_text().strip())
            # Create the list for a header the first time it is seen, then append.
            tables_dict.setdefault(header, []).append(value)

# Drop repeated names while keeping first-seen order. Guarded so the cell does
# not raise KeyError when no table contributed a 'Name' column.
# NOTE(review): after this de-duplication 'Name' no longer has the same length
# as the other columns, so it cannot be lined up with them row by row - confirm
# whether a per-table structure is what is actually wanted.
if 'Name' in tables_dict:
    tables_dict['Name'] = list(dict.fromkeys(tables_dict['Name']))
# Print the complete dictionary.
print(tables_dict)

{'Name': ['Lower Canada', 'New Brunswick', 'Newfoundland', 'Nova Scotia', 'Prince Edward Island', 'Upper Canada', 'Total', 'Alberta', 'British Columbia', 'Manitoba', 'Newfoundland and Labrador', 'Northwest Territories', 'Nunavut', 'Ontario', 'Quebec', 'Saskatchewan', 'Yukon', 'Canada'], '1700': ['14,000', '', '500', '1,300', '', '', '15,800'], '1725': ['29,000', '', '5,000', '5,000', '300', '', '39,300'], '1750': ['54,500', '', '10,000', '14,000', '2,500', '', '81,000'], '1775': ['96,000', '', '16,000', '20,000', '10,000', '8,000', '150,000'], '1800': ['225,000', '10,000', '10,000', '57,000', '20,000', '50,000', '382,000'], '1825': ['450,000', '75,000', '45,759', '150,000', '28,600', '158,027', '907,386'], 'Confederated': ['1905', '1871', '1870', '1867', '1949', '1870', '1867', '1999', '1867', '1873', '1867', '1905', '1898', ''], '1841': ['', '62,100', '4,704', '156,162', '96,296', '', '202,575', '', '466,831', '47,042', '716,670', '', '', '1,752,380'], '1851': ['', '55,000', '5,391', 

In [11]:


# Print how many values were collected under each header, in insertion order.
for column_values in tables_dict.values():
    print(len(column_values))

18
7
7
7
7
7
7
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14


In [12]:
# Show which column headers ended up as keys of the merged dictionary.
column_names = tables_dict.keys()
print(column_names)

dict_keys(['Name', '1700', '1725', '1750', '1775', '1800', '1825', 'Confederated', '1841', '1851', '1861', '1871', '1881', '1891', '1901', '1911', '1921', '1931', '1941', '1951', '1956', '1961', '1966', '1971', '1976', '1981', '1986', '1991', '1996', '2001', '2006', '2011', '2016', '2021'])


In [13]:
# Inspect the values collected under the '1941' header.
values_1941 = tables_dict['1941']
print(values_1941)

['796,169', '817,861', '729,744', '457,401', '316,294', '12,028', '577,962', '', '3,787,655', '95,047', '3,331,882', '895,992', '4,914', '11,828,474']


In [14]:

# Collect every <h2> element of the parsed page.
h2_list = soup.find_all(name="h2")

In [15]:
# Text content of the first <h2> element.
h2_list[0].get_text()

'Contents'

In [16]:
# Print the text of each <h2> element, one per line.
for heading in h2_list:
    print(heading.get_text())

Contents
1700 to 1825[edit]
1841 to 1931[edit]
1941 to 1991[edit]
1996 to 2021[edit]
Notes[edit]


In [None]:
# Debug dump of every table element collected by soup.find_all(); the output can be very long.
print(tables)

In [28]:
# Collect the href of every anchor inside the extracted tables.
# href=True restricts the search to <a> tags that actually carry an href
# attribute, so the list never contains None for anchors without a target
# (a None entry would later break string operations such as startswith).
hyperlinks = [
    link['href']
    for table in tables
    for link in table.find_all('a', href=True)
]
print(hyperlinks)
    

['/wiki/Lower_Canada', '/wiki/Nova_Scotia', '/wiki/New_Brunswick', '#cite_note-6', '/wiki/Newfoundland_Colony', '/wiki/Nova_Scotia', '#cite_note-7', '/wiki/Nova_Scotia', '/wiki/Prince_Edward_Island', '#cite_note-8', '/wiki/Upper_Canada', '#cite_note-date-9', '#cite_note-10', '/wiki/Alberta', '#cite_note-NWT2-11', '/wiki/British_Columbia', '/wiki/Manitoba', '#cite_note-12', '#cite_note-13', '/wiki/New_Brunswick', '#cite_note-14', '/wiki/Newfoundland_and_Labrador', '#cite_note-NFL-15', '#cite_note-16', '#cite_note-17', '#cite_note-18', '#cite_note-19', '#cite_note-20', '#cite_note-21', '#cite_note-22', '#cite_note-23', '#cite_note-24', '/wiki/Northwest_Territories', '/wiki/Nova_Scotia', '#cite_note-25', '/wiki/Nunavut', '#cite_note-NWT-26', '/wiki/Ontario', '#cite_note-27', '#cite_note-CW-28', '#cite_note-CW-28', '/wiki/Prince_Edward_Island', '#cite_note-29', '/wiki/Quebec', '#cite_note-30', '#cite_note-CE-31', '#cite_note-CE-31', '/wiki/Saskatchewan', '#cite_note-NWT2-11', '/wiki/Yukon'

In [None]:
# Alternative extraction: pull the href values out of the serialised table HTML.
# regex = href="([^"]*)"
# The capture group ends at the closing quote, so each match is the bare
# attribute value; no manual slicing of the 'href="' prefix and trailing quote
# is needed, and values are delimited by the quotes rather than by spaces.
# NOTE(review): values come from serialised HTML, so characters such as '&'
# presumably appear in escaped form - confirm before using them as URLs.
href_pattern = re.compile(r'href="([^"]*)"')
hyperlinks = []
for table in tables:
    hyperlinks.extend(href_pattern.findall(table.prettify()))

In [None]:
# Display the hrefs gathered by the extraction cell above.
hyperlinks

In [None]:
# Folder name that the download loop joins with each numbered file name.
# NOTE(review): this rebinds the built-in dir() for the rest of the session.
dir = "webpages"

In [None]:
# Drop duplicate links while keeping first-seen order.
hyperlinks = list(dict.fromkeys(hyperlinks))

# Make sure the output folder exists before any file is written into it.
os.makedirs(dir, exist_ok=True)

fn = 0  # running index used as the file name of the next saved page
# NOTE(review): the loop rebinds url, response, raw_html and soup, so earlier
# cells that read them will see the last downloaded page if re-run afterwards.
for link in hyperlinks:
    # Skip missing hrefs and in-page anchors (links that start with '#').
    if not link or link.startswith('#'):
        continue

    fp = os.path.join(dir, str(fn) + ".html")
    url = "https://en.wikipedia.org" + link
    # The timeout keeps one stalled request from blocking the whole crawl.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to retrieve webpage: {response.status_code}")
        break

    raw_html = response.text
    soup = BeautifulSoup(raw_html, 'html.parser')
    # soup.encode() returns bytes, so the file is opened in binary mode and the
    # bytes are written as-is (wrapping them in str() would store the b'...'
    # repr instead of the HTML). The with-block closes the file even on error,
    # and the page goes to fp inside the output folder rather than the cwd.
    with open(fp, "wb") as f:
        f.write(soup.encode("utf8"))
    fn += 1
    
    