<a href="https://colab.research.google.com/github/ReemFarah/ReemFarah.github.io/blob/main/CC5_Scraper_Implement_a_data_scraper_of_your_own.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the website
url = "http://books.toscrape.com/"

# Send a GET request to fetch the raw HTML content
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Extract book details
books = soup.find_all('article', class_='product_pod')

# Parse data into a list of dictionaries
data = []
for book in books:
    title = book.h3.a['title']  # Book title
    price = book.find('p', class_='price_color').text.strip('£')  # Price
    availability = book.find('p', class_='instock availability').text.strip()  # Availability
    data.append({'Title': title, 'Price (£)': float(price), 'Availability': availability})

# Convert to DataFrame
books_df = pd.DataFrame(data)

# Preview the DataFrame
print(books_df.head())


                                   Title  Price (£) Availability
0                   A Light in the Attic      51.77     In stock
1                     Tipping the Velvet      53.74     In stock
2                             Soumission      50.10     In stock
3                          Sharp Objects      47.82     In stock
4  Sapiens: A Brief History of Humankind      54.23     In stock


In [4]:
# Convert DataFrame to TIDY format
tidy_books = books_df.melt(id_vars=['Title'], var_name='Metric', value_name='Value')

# Preview the normalized data
print(tidy_books.head())


                                   Title     Metric  Value
0                   A Light in the Attic  Price (£)  51.77
1                     Tipping the Velvet  Price (£)  53.74
2                             Soumission  Price (£)   50.1
3                          Sharp Objects  Price (£)  47.82
4  Sapiens: A Brief History of Humankind  Price (£)  54.23


In [5]:
# Export to CSV
tidy_books.to_csv("tidy_books_data.csv", index=False)


In [6]:
import altair as alt

# Create a bar chart for book prices
chart = alt.Chart(books_df).mark_bar(color='skyblue').encode(
    x=alt.X('Title:N', title='Book Title', sort='-y'),
    y=alt.Y('Price (£):Q', title='Price (£)'),
    tooltip=['Title', 'Price (£)', 'Availability']
).properties(
    title='Book Prices from Books to Scrape',
    width=800,
    height=400
)

# Display the chart
chart.show()


I chose "Books to Scrape" because it’s a beginner-friendly website designed for web scraping, with well-structured and accessible data for easy analysis and visualisation.

In [7]:
import pandas as pd

# Scrape data from Wikipedia
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
tables = pd.read_html(url)

# Select the relevant table (assume the first table is the one needed)
population_table = tables[0]

# Preview the scraped data
print(population_table.head())


  Unnamed: 0       Location  Population % of world         Date  \
0          –          World  8119000000       100%   1 Jul 2024   
1    1/2 [b]          China  1409670000      17.3%  31 Dec 2023   
2    1/2 [b]          India  1402737000      17.2%   1 Jul 2024   
3          3  United States   340110988       4.2%   1 Jul 2024   
4          4      Indonesia   282477584       3.5%  30 Jun 2024   

  Source (official or from the United Nations) Notes  
0                          UN projection[1][3]   NaN  
1                         Official estimate[5]   [c]  
2                       Official projection[6]   [d]  
3                         Official estimate[7]   [e]  
4                National annual projection[8]   NaN  


In [11]:
# Check the number of columns and preview the data
print(population_table.columns)
print(population_table.head())




Index(['Unnamed: 0', 'Location', 'Population', '% of world', 'Date',
       'Source (official or from the United Nations)', 'Notes'],
      dtype='object')
  Unnamed: 0       Location  Population % of world         Date  \
0          –          World  8119000000       100%   1 Jul 2024   
1    1/2 [b]          China  1409670000      17.3%  31 Dec 2023   
2    1/2 [b]          India  1402737000      17.2%   1 Jul 2024   
3          3  United States   340110988       4.2%   1 Jul 2024   
4          4      Indonesia   282477584       3.5%  30 Jun 2024   

  Source (official or from the United Nations) Notes  
0                          UN projection[1][3]   NaN  
1                         Official estimate[5]   [c]  
2                       Official projection[6]   [d]  
3                         Official estimate[7]   [e]  
4                National annual projection[8]   NaN  


In [12]:
import pandas as pd

# Scrape the table from Wikipedia
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
tables = pd.read_html(url)

# Select the first table and check its structure
population_table = tables[0]

# Rename columns based on your output
population_table.columns = [
    "Rank",
    "Location",
    "Population",
    "Percentage_of_World",
    "Date",
    "Source",
    "Notes",
]

# Drop unnecessary columns
population_table = population_table[["Location", "Population", "Percentage_of_World", "Date"]]

# Clean and normalize the Population column (remove commas and convert to numeric)
population_table["Population"] = (
    population_table["Population"]
    .astype(str)
    .str.replace(",", "")
    .str.extract(r"(\d+)")
    .astype(float)
)

# Clean the Percentage_of_World column (remove % and convert to float)
population_table["Percentage_of_World"] = (
    population_table["Percentage_of_World"]
    .str.replace("%", "")
    .astype(float)
)

# Preview the cleaned data
print(population_table.head())


        Location    Population  Percentage_of_World         Date
0          World  8.119000e+09                100.0   1 Jul 2024
1          China  1.409670e+09                 17.3  31 Dec 2023
2          India  1.402737e+09                 17.2   1 Jul 2024
3  United States  3.401110e+08                  4.2   1 Jul 2024
4      Indonesia  2.824776e+08                  3.5  30 Jun 2024


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_table["Population"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_table["Percentage_of_World"] = (


In [13]:
# Convert the table to TIDY format (long form)
tidy_population = population_table.melt(
    id_vars=["Location", "Date"],
    var_name="Metric",
    value_name="Value"
)

# Preview the TIDY data
print(tidy_population.head())



        Location         Date      Metric         Value
0          World   1 Jul 2024  Population  8.119000e+09
1          China  31 Dec 2023  Population  1.409670e+09
2          India   1 Jul 2024  Population  1.402737e+09
3  United States   1 Jul 2024  Population  3.401110e+08
4      Indonesia  30 Jun 2024  Population  2.824776e+08


In [14]:
# Save the TIDY data to a CSV file
tidy_population.to_csv("tidy_population_data.csv", index=False)


In [20]:
import altair as alt

# Select the top 10 most populous countries
top_10 = population_table.nlargest(10, "Population")

# Create a bar chart
chart = alt.Chart(top_10).mark_bar(color='dodgerblue').encode(
    x=alt.X("Location:N", title="Country/Territory", sort='-y', axis=alt.Axis(labelAngle=0)),  # Set labelAngle to 0
    y=alt.Y("Population:Q", title="Population"),
    tooltip=["Location", "Population", "Percentage_of_World"]
).properties(
    title="Top 10 Most Populous Countries in Comparison with the World's Population",
    width=800,
    height=400
)

# Display the chart
chart.show()


In [19]:
import altair as alt

# Filter out the "World" entry
filtered_data = population_table[population_table["Location"] != "World"]

# Select the top 10 most populous countries (excluding "World")
top_10_filtered = filtered_data.nlargest(10, "Population")

# Create a bar chart
chart = alt.Chart(top_10_filtered).mark_bar(color='dodgerblue').encode(
    x=alt.X("Location:N", title="Country/Territory", sort='-y', axis=alt.Axis(labelAngle=0)),  # Horizontal labels
    y=alt.Y("Population:Q", title="Population"),
    tooltip=["Location", "Population", "Percentage_of_World"]
).properties(
    title="Top 10 Most Populous Countries",
    width=800,
    height=400
)

# Display the chart
chart.show()


These charts highlight the population distribution of countries globally, providing insights into demographic patterns with the top ten populous nations clearly visualised.