In [None]:
# ---------------------
# Step 1: Import Libraries
# ---------------------
# Import necessary libraries: requests, BeautifulSoup, pandas, matplotlib/seaborn.

# ---------------------
# Step 2: Request Data
# ---------------------
# Make an HTTP GET request to the Wikipedia URL containing the age-wise literacy data.

# ---------------------
# Step 3: Parse the HTML
# ---------------------
# Use BeautifulSoup to parse the page.
# Locate the specific table that includes age-wise literacy rates by country.

# ---------------------
# Step 4: Extract Data from Table
# ---------------------
# For each country, extract columns such as:
# - Country Name
# - Year
# - Total Literacy Rate
# - Youth Literacy Rate
# - Adult Literacy Rate
# - Elderly Literacy Rate (if present)
# Ensure column consistency and handle missing values appropriately.

# ---------------------
# Step 5: Save Data to CSV
# ---------------------
# Store the structured data in a CSV file with proper headers.

# ---------------------
# Step 6: Load CSV for Analysis
# ---------------------
# Read the CSV using pandas and verify data integrity (data types, null values).

# ---------------------
# Step 7: Data Cleaning
# ---------------------
# Clean the data:
# - Convert percentages to numeric format.
# - Handle null or inconsistent values.
# - Filter out rows with missing 'Total Literacy Rate' or Year.

# ---------------------
# Step 8: Univariate Analysis
# ---------------------
# Analyze individual literacy metrics:
# - Distribution of total literacy rates
# - Compare youth vs adult vs elderly literacy rates globally
# - Use histograms, boxplots

# ---------------------
# Step 9: Bivariate Analysis
# ---------------------
# Explore relationships such as:
# - Youth vs Adult literacy rate (scatterplot)
# - Literacy rate vs year (line plots or trends if historical)
# - Total vs elderly literacy rate to explore generational shifts

# ---------------------
# Step 10: Grouped Analysis
# ---------------------
# Optional: Map countries to continents/regions manually or via a lookup.
# - Compare regional averages for each age group.
# - Highlight global trends by region.

# ---------------------
# Step 11: Descriptive Statistics
# ---------------------
# Calculate mean, median, std. deviation for each literacy category (youth, adult, elderly).
# Identify countries with literacy gaps between age groups.

# ---------------------
# Step 12: Define & Answer Key Questions
# ---------------------
# Use visualizations and analysis to answer:

# 1. Which countries have the highest and lowest total literacy rates?
# 2. Which countries have the largest gaps between youth and elderly literacy rates?
# 3. What is the global average youth literacy rate?
# 4. Are countries improving literacy over time? (if historical data exists)
# 5. Which countries have high youth literacy but low elderly literacy?
# 6. Are there countries with uniformly high literacy across all age groups?
# 7. What age group has the most variation in literacy across countries?

# ---------------------
# Step 13: Visualizations
# ---------------------
# Create plots:
# - Bar chart: Top 10 countries by youth literacy
# - Boxplot: Literacy rate by age group
# - Scatter plot: Youth vs Elderly literacy
# - Heatmap (optional): Literacy rates across age groups by region

# ---------------------
# Step 14: Summary of Insights
# ---------------------
# Write your interpretations based on the visualizations.
# Discuss countries doing well, those lagging behind, and age groups needing focus.

# ---------------------
# Step 15: Save Results
# ---------------------
# Save your CSV and visualizations for final submission.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_literacy_rate"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [4]:
tables = soup.find_all("table", {"class": "wikitable"})

# Try to find the correct table by looking for keywords
target_table = None
for table in tables:
    if "Literacy rate" in table.text and "Country" in table.text:
        target_table = table
        break

In [5]:
data = []
if target_table:
    rows = target_table.find_all("tr")
    headers = [header.text.strip() for header in rows[0].find_all(["th", "td"])]
    for row in rows[1:]:
        cols = [col.text.strip() for col in row.find_all(["td", "th"])]
        if len(cols) >= 4:
            country = cols[0]
            year = cols[1]
            total = cols[2]
            male = cols[3] if len(cols) > 3 else None
            female = cols[4] if len(cols) > 4 else None
            data.append([country, year, total, male, female])

df = pd.DataFrame(data, columns=["Country", "Year", "Total Literacy Rate", "Male Literacy Rate", "Female Literacy Rate"])

In [6]:
df.to_csv("literacy_data_raw.csv", index=False)

In [7]:
df = pd.read_csv("literacy_data_raw.csv")

In [8]:
for col in ["Total Literacy Rate", "Male Literacy Rate", "Female Literacy Rate"]:
    df[col] = df[col].str.replace("%", "").str.replace("—", "").str.strip()
    df[col] = pd.to_numeric(df[col], errors="coerce")

df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df.dropna(subset=["Total Literacy Rate", "Year"], inplace=True)

In [12]:
plt.figure(figsize=(10, 6))
sns.histplot(df["Total Literacy Rate"], bins=20, kde=True)
plt.title("Distribution of Total Literacy Rate")
plt.xlabel("Literacy Rate (%)")
plt.ylabel("Frequency")
plt.savefig("total_literacy_distribution.png")
plt.close()

In [13]:
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x="Male Literacy Rate", y="Female Literacy Rate")
plt.title("Male vs Female Literacy Rate")
plt.xlabel("Male Literacy Rate (%)")
plt.ylabel("Female Literacy Rate (%)")
plt.savefig("male_vs_female_literacy.png")
plt.close()

In [14]:
stats = df.describe()
stats.to_csv("literacy_statistics.csv")

In [15]:
highest = df[df["Total Literacy Rate"] == df["Total Literacy Rate"].max()]
lowest = df[df["Total Literacy Rate"] == df["Total Literacy Rate"].min()]
global_avg_youth = df["Total Literacy Rate"].mean()

print("Highest Literacy Rate Country:\n", highest)
print("Lowest Literacy Rate Country:\n", lowest)
print("Global Average Literacy Rate: {:.2f}%".format(global_avg_youth))

Highest Literacy Rate Country:
 Empty DataFrame
Columns: [Country, Year, Total Literacy Rate, Male Literacy Rate, Female Literacy Rate]
Index: []
Lowest Literacy Rate Country:
 Empty DataFrame
Columns: [Country, Year, Total Literacy Rate, Male Literacy Rate, Female Literacy Rate]
Index: []
Global Average Literacy Rate: nan%


In [16]:
top_10 = df.sort_values("Total Literacy Rate", ascending=False).head(10)
plt.figure(figsize=(12, 6))
sns.barplot(data=top_10, x="Country", y="Total Literacy Rate")
plt.xticks(rotation=45)
plt.title("Top 10 Countries by Total Literacy Rate")
plt.savefig("top_10_total_literacy.png")
plt.close()

In [19]:
summary = ""

if not highest.empty:
    top_country = f"{highest.iloc[0]['Country']} - {highest.iloc[0]['Total Literacy Rate']}%"
else:
    top_country = "No data available"

if not lowest.empty:
    bottom_country = f"{lowest.iloc[0]['Country']} - {lowest.iloc[0]['Total Literacy Rate']}%"
else:
    bottom_country = "No data available"

summary = f"""
Top Country: {top_country}
Bottom Country: {bottom_country}
Global Average Literacy: {global_avg_youth:.2f}%
"""

with open("literacy_summary.txt", "w") as f:
    f.write(summary)

In [22]:
df.to_csv("literacy_data_cleaned.csv", index=False)

In [24]:
cleaned = pd.read_csv("literacy_data_cleaned.csv")
cleaned["_Section_"] = "Cleaned Data"
stats = pd.read_csv("literacy_statistics.csv")
stats["_Section_"] = "Statistics"
summary_df = pd.DataFrame({
    "Country": ["Summary"],
    "Year": [""],
    "Total Literacy Rate": [global_avg_youth],
    "Male Literacy Rate": [""],
    "Female Literacy Rate": [""],
    "_Section_": ["Summary"]
})
combined = pd.concat([
    cleaned[["Country", "Year", "Total Literacy Rate", "Male Literacy Rate", "Female Literacy Rate", "_Section_"]],
    stats.rename(columns={"Unnamed: 0": "Country"}).reindex(columns=["Country", "Year", "Total Literacy Rate", "Male Literacy Rate", "Female Literacy Rate", "_Section_"]),
    summary_df
], ignore_index=True)
combined.to_csv("literacy_full_combined.csv", index=False)

  combined = pd.concat([
