In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Send a request to the webpage
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
response = requests.get(url)
response.raise_for_status()  # Check if the request was successful

# Step 2: Parse the webpage content
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: Locate the table and extract data
table = soup.find('table', {'class': 'wikitable'})

# Define the headers manually based on the provided image
headers = [
    "Rank", "Country/Territory", "IMF Forecast", "IMF Year",
    "World Bank Estimate", "World Bank Year",
    "United Nations Estimate", "United Nations Year"
]
print(f"Headers: {headers}")

# Extract rows
rows = []
for row in table.find_all('tr')[1:]:  # Skip the header row
    cells = row.find_all('td')
    if len(cells) > 0:  # Ensure the row has cells
        row_data = [cell.text.strip() for cell in cells]
        rows.append(row_data)

# Debugging: Check the number of rows and their lengths
print(f"Number of rows extracted: {len(rows)}")
for index, row in enumerate(rows):
    print(f"Row {index} has {len(row)} columns: {row}")

# Adjust headers if they don't match the number of columns in the data rows
# Check the number of columns in the rows
num_columns = len(rows[0])
if len(headers) > num_columns:
    # If there are more headers than columns, adjust the headers
    headers = headers[:num_columns]

# Step 4: Create a DataFrame and display the first 10 rows
if rows:
    df = pd.DataFrame(rows, columns=headers)  # Match length of headers to data
    print("First 10 rows of the data:")
    print(df.head(10))  # Print the DataFrame to check the content
else:
    print("No data rows were found.")

# Step 5: Save the data to a CSV file
csv_file_path = 'CSC221_webscrape_data.csv'
df.to_csv(csv_file_path, index=False)

print(f"Data has been saved to {csv_file_path}")


Headers: ['Rank', 'Country/Territory', 'IMF Forecast', 'IMF Year', 'World Bank Estimate', 'World Bank Year', 'United Nations Estimate', 'United Nations Year']
Number of rows extracted: 210
Row 0 has 7 columns: ['World', '109,529,216', '2024', '105,435,540', '2023', '100,834,796', '2022']
Row 1 has 7 columns: ['United States', '28,781,083', '2024', '27,360,935', '2023', '25,744,100', '2022']
Row 2 has 7 columns: ['China', '18,532,633', '[n 1]2024', '17,794,782', '[n 3]2023', '17,963,170', '[n 1]2022']
Row 3 has 7 columns: ['Germany', '4,591,100', '2024', '4,456,081', '2023', '4,076,923', '2022']
Row 4 has 7 columns: ['Japan', '4,110,452', '2024', '4,212,945', '2023', '4,232,173', '2022']
Row 5 has 7 columns: ['India', '3,937,011', '2024', '3,549,919', '2023', '3,465,541', '2022']
Row 6 has 7 columns: ['United Kingdom', '3,495,261', '2024', '3,340,032', '2023', '3,089,072', '2022']
Row 7 has 7 columns: ['France', '3,130,014', '2024', '3,030,904', '2023', '2,775,316', '2022']
Row 8 has 7 