# In [3]:
# Package and Environment Configuration:
# Python 3.8+ environment with the following libraries installed:
#   pandas, openpyxl
# Data Source:
# Download the WDI archive files from:
# https://datatopics.worldbank.org/world-development-indicators/wdi-archives.html

import pandas as pd  # data manipulation
import zipfile        # handle zip archives
import io             # in-memory byte streams
import openpyxl       # Excel reading

# ---------- Parameters ----------
# Mapping of release-date tags to the local ZIP filename of that WDI release.
# The tag is also used in the output CSV name.
zip_files = {
    '2020-04-09': 'WDI_excel_2020_04_09.zip',
    '2023-06-29': 'WDI_excel_2023_06_29.zip',
}

# List of ISO alpha-3 country codes to extract
countries = [
    'AUS', 'BRA', 'CAN', 'CHN', 'FRA', 'DEU', 'IND', 'IDN', 'IRN', 'ITA',
    'JPN', 'MEX', 'POL', 'RUS', 'SAU', 'ZAF', 'KOR', 'ESP', 'TUR', 'GBR', 'USA',
]

# WDI indicator codes of interest.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which would merge two codes into one
# non-existent code and drop both series from the output.
indicators = [
    'EN.ATM.CO2E.PC',        # CO₂ emissions per capita
    'BX.KLT.DINV.WD.GD.ZS',  # FDI net inflows (% of GDP)
    'NY.GDP.PCAP.CD',        # GDP per capita (current US$)
    'EG.USE.PCAP.KG.OE',     # Energy use per capita
    'NE.TRD.GNFS.ZS',        # Exports and imports of goods and services (% of GDP)
    # NOTE(review): WDI appears to label FS.AST.DOMS.GD.ZS as "Domestic credit
    # provided by financial sector"; the private-sector series is
    # FS.AST.PRVT.GD.ZS — confirm which one is intended.
    'FS.AST.DOMS.GD.ZS',     # Domestic credit to private sector (% of GDP)
    'NE.GDI.TOTL.ZS',        # Gross capital formation (% of GDP)
    'GB.XPD.RSDV.GD.ZS',     # presumably R&D expenditure (% of GDP) — confirm
]

# Years of interest (1990–2016), as strings to match the header labels
years = [str(y) for y in range(1990, 2017)]


def tidy_from_zip(tag: str, zfile_path: str) -> None:
    """
    Extract selected WDI series from a ZIP archive and save them as a tidy CSV.

    - Reads the first .xls/.xlsx member found inside the ZIP
    - Streams the 'Data' sheet and locates the header row by its
      'Country Code' / 'Indicator Code' cells
    - Keeps rows whose country code is in the module-level ``countries`` and
      whose indicator code is in ``indicators``, for the columns in ``years``
    - Converts wide format (years as columns) to long format
    - Pivots to have one row per country-year, one column per indicator
    - Writes the output CSV 'C1_raw_<tag>.csv' and prints a summary line

    Parameters:
        tag: Label inserted into the output filename.
        zfile_path: Path of the ZIP archive to read.

    Raises:
        ValueError: If the archive contains no spreadsheet, or the sheet has
            no recognisable header row.
        KeyError: If the workbook has no 'Data' sheet, or a requested year has
            no column in the header row.
    """
    # 1. Open the ZIP and read the Excel workbook bytes
    with zipfile.ZipFile(zfile_path, 'r') as z:
        xls_name = next(
            (n for n in z.namelist() if n.lower().endswith(('.xls', '.xlsx'))),
            None,
        )
        if xls_name is None:
            raise ValueError(f'No Excel workbook found in {zfile_path}')
        data_bytes = z.read(xls_name)

    # Sets give O(1) membership tests in the per-row filter below.
    wanted_countries = set(countries)
    wanted_indicators = set(indicators)
    wanted_years = list(years)
    year_labels = set(wanted_years)

    # 2. Load the workbook in read-only mode. Read-only workbooks must be
    #    closed explicitly, hence the try/finally.
    wb = openpyxl.load_workbook(io.BytesIO(data_bytes), read_only=True, data_only=True)
    try:
        ws = wb['Data']  # Use the 'Data' sheet

        # 3. Stream rows instead of materialising the whole sheet; the same
        #    iterator is reused so data rows continue right after the header.
        row_iter = ws.iter_rows(values_only=True)
        header = next(
            (
                row for row in row_iter
                # Length guard: read-only mode can yield short or empty tuples.
                if len(row) > 3
                and row[1] == 'Country Code'
                and row[3] == 'Indicator Code'
            ),
            None,
        )
        if header is None:
            raise ValueError(
                f"Header row with 'Country Code'/'Indicator Code' not found in {xls_name}"
            )

        # 4. Map each desired year to its column index
        year_to_col = {}
        for idx, cell in enumerate(header):
            label = str(cell).strip()
            if label in year_labels:
                year_to_col[label] = idx

        missing = [yr for yr in wanted_years if yr not in year_to_col]
        if missing:
            # Fail once, up front, with every missing year listed.
            raise KeyError(f'Year columns missing from header: {missing}')
        year_cols = [year_to_col[yr] for yr in wanted_years]

        # 5. Iterate over data rows below the header
        records = []
        for row in row_iter:
            if len(row) <= 3:
                continue  # blank or truncated row: no country/indicator code
            cc = str(row[1]).strip() if row[1] else None
            ic = str(row[3]).strip() if row[3] else None
            if cc in wanted_countries and ic in wanted_indicators:
                # Treat cells beyond the end of a short row as missing.
                vals = [row[c] if c < len(row) else None for c in year_cols]
                records.append([cc, ic] + vals)
    finally:
        wb.close()

    # 6. Build a wide DataFrame and pivot to tidy (long) format
    cols = ['country', 'indicator'] + wanted_years
    wide = pd.DataFrame(records, columns=cols)
    long_df = wide.melt(
        id_vars=['country', 'indicator'], var_name='year', value_name='value'
    )
    # pivot_table aggregates with a mean, so make sure the values are numeric;
    # anything non-numeric becomes NaN rather than breaking the aggregation.
    long_df['value'] = pd.to_numeric(long_df['value'], errors='coerce')
    tidy = (
        long_df
        .pivot_table(
            index=['country', 'year'],
            columns='indicator',
            values='value'
        )
        .reset_index()
    )

    # 7. Save the tidied CSV file
    out_name = f'C1_raw_{tag}.csv'
    tidy.to_csv(out_name, index=False)
    print(f'Saved {out_name} — {len(tidy):,} rows')


# ---------- Execute ----------
# Process every configured WDI release; each call writes one CSV for its tag.
for tag in zip_files:
    path = zip_files[tag]
    tidy_from_zip(tag, path)


# Recorded cell output:
# Saved C1_raw_2020-04-09.csv — 567 rows
# Saved C1_raw_2023-06-29.csv — 567 rows
