## Import data and libraries

In [2]:
# Import the libraries we need
import pandas as pd

# Locations of the two input files, relative to this notebook.
worldbank_path = "../additional_material/week_05_world_bank_development_indicators.csv"
income_path = "../additional_material/week_05_income.xlsx"

# Load both datasets up front: the development indicators come from a CSV
# file, the income data from an Excel workbook.
worldbank = pd.read_csv(worldbank_path)
income = pd.read_excel(income_path)

# Sanity check: preview the first rows of each frame under its own heading
# (sep="\n" puts the preview on the line below the heading).
print("World Bank indicators dataset:", worldbank.head(), sep="\n")
print("\nIncome dataset:", income.head(), sep="\n")


World Bank indicators dataset:
       country        date  agricultural_land%  forest_land%  land_area  \
0  Afghanistan  1960-01-01                 NaN           NaN        NaN   
1  Afghanistan  1961-01-01           57.878356           NaN   652230.0   
2  Afghanistan  1962-01-01           57.955016           NaN   652230.0   
3  Afghanistan  1963-01-01           58.031676           NaN   652230.0   
4  Afghanistan  1964-01-01           58.116002           NaN   652230.0   

   avg_precipitation  trade_in_services%  control_of_corruption_estimate  \
0                NaN                 NaN                             NaN   
1              327.0                 NaN                             NaN   
2              327.0                 NaN                             NaN   
3              327.0                 NaN                             NaN   
4              327.0                 NaN                             NaN   

   control_of_corruption_std  access_to_electricity%  ...  \


## Are the countries equivalent in both datasets?

In [3]:
# Step 1: restrict the income table to rows that carry an "Income group".
# Working assumption (carried over from the analysis, not verified here):
# real countries have an income group, while regional rows usually do not.
income_countries_df = income.loc[income["Income group"].notna()]

# Step 2: distinct, non-missing names found in each dataset.
income_countries = income_countries_df["Economy"].dropna().unique()
worldbank_countries = worldbank["country"].dropna().unique()

# Step 3: set versions of both name lists, so the comparison below is a
# plain set difference.
income_countries_set = set(income_countries)
worldbank_countries_set = set(worldbank_countries)

# Step 4: names present in the income dataset but absent from the
# World Bank indicators.
only_in_income = income_countries_set.difference(worldbank_countries_set)

# Step 5: names present in the World Bank indicators but absent from the
# income dataset.
only_in_worldbank = worldbank_countries_set.difference(income_countries_set)

# Report: sizes first, then the two one-sided differences in sorted order.
print("Number of countries in income dataset:", len(income_countries))
print("Number of countries in World Bank dataset:", len(worldbank_countries))

print(
    "\nCountries only in income dataset (not in World Bank):",
    sorted(only_in_income),
    sep="\n",
)

print(
    "\nCountries only in World Bank dataset (not in income dataset):",
    sorted(only_in_worldbank),
    sep="\n",
)


Number of countries in income dataset: 216
Number of countries in World Bank dataset: 274

Countries only in income dataset (not in World Bank):
['Curaçao', 'Côte d’Ivoire', 'Puerto Rico (U.S.)', 'Somalia, Fed. Rep.', 'São Tomé and Príncipe', 'Taiwan, China', 'Türkiye']

Countries only in World Bank dataset (not in income dataset):
['Africa Eastern and Southern', 'Africa Western and Central', 'Arab World', 'Caribbean small states', 'Central Europe and the Baltics', "Cote d'Ivoire", 'Curacao', 'Czech Republic', 'Early-demographic dividend', 'East Asia & Pacific', 'East Asia & Pacific (IDA & IBRD countries)', 'East Asia & Pacific (IDA & IBRD)', 'East Asia & Pacific (excluding high income)', 'Ethiopia', 'Euro area', 'Europe & Central Asia', 'Europe & Central Asia (IDA & IBRD countries)', 'Europe & Central Asia (IDA & IBRD)', 'Europe & Central Asia (excluding high income)', 'European Union', 'Fragile and conflict affected situations', 'Heavily indebted poor countries (HIPC)', 'High income'