In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Retrieve the web page
url = "https://en.wikipedia.org/wiki/List_of_Canadian_provinces_and_territories_by_historical_population"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:  # status code of 200 indicates a successful request
    raw_html = response.text
    print("Webpage retrieved successfully")
    
    # Decode the raw HTML using BeautifulSoup
    soup = BeautifulSoup(raw_html, 'html.parser')
    
    # Extract relevant tables
    tables = soup.find_all('table', {'class': 'wikitable'})
    
    # Print number of tables
    print(f"Number of tables found: {len(tables)}")
    
    # Function to sanitize text
    def sanitize(text):
        return text.strip()
    
    # Initialize an empty dictionary to hold all data
    all_data = {}
    all_periods = []
    
    for table in tables:
        headers = [(th.get_text()).strip() for th in table.find_all('th')]
        all_periods.extend(headers[1:])  # Collect all period headers excluding the 'Name' header
        
        for row in table.find_all('tr')[1:]:  # skip header row
            cells = row.find_all(['td', 'th'])
            if len(cells) == len(headers):
                name = sanitize(cells[0].get_text())
                if name not in all_data:
                    all_data[name] = {header: "N/A" for header in all_periods}
                
                for i in range(1, len(headers)):
                    period = headers[i]
                    value = sanitize(cells[i].get_text())
                    all_data[name][period] = value
    
    # Remove duplicates from all_periods while maintaining order
    seen = set()
    ordered_periods = []
    for period in all_periods:
        if period not in seen:
            ordered_periods.append(period)
            seen.add(period)
    
    # Convert the dictionary to a format suitable for pandas DataFrame
    formatted_data = {'Name': [], **{period: [] for period in ordered_periods}}
    
    for name, values in all_data.items():
        formatted_data['Name'].append(name)
        for period in ordered_periods:
            formatted_data[period].append(values.get(period, "N/A"))
    
    # Convert the final dictionary to a pandas DataFrame
    df = pd.DataFrame(formatted_data)
    
    # Display the DataFrame
    print(df.head(60))

else:
    print(f"Failed to retrieve webpage: {response.status_code}")

df.head(60)

Webpage retrieved successfully
Number of tables found: 4
                            Name    1700    1725    1750     1775     1800  \
0                   Lower Canada  14,000  29,000  54,500   96,000  225,000   
1               New Brunswick[a]                                    10,000   
2                   Newfoundland     500   5,000  10,000   16,000   10,000   
3                 Nova Scotia[b]   1,300   5,000  14,000   20,000   57,000   
4        Prince Edward Island[c]             300   2,500   10,000   20,000   
5                   Upper Canada                            8,000   50,000   
6                          Total  15,800  39,300  81,000  150,000  382,000   
7                        Alberta     N/A     N/A     N/A      N/A      N/A   
8               British Columbia     N/A     N/A     N/A      N/A      N/A   
9                       Manitoba     N/A     N/A     N/A      N/A      N/A   
10                 New Brunswick     N/A     N/A     N/A      N/A      N/A   
11  New