# In [None]:
# ================== Environment & Packages ==================
# Python 3.8+ environment required.
# Required packages: pandas, numpy, pycountry, scikit-learn (for IterativeImputer).
# Example: pip install pandas numpy pycountry scikit-learn
#
# Data source: Preprocessed raw country-year panel CSVs.
# WDI archives download: https://datatopics.worldbank.org/world-development-indicators/wdi-archives.html

# ================== Preprocessing Summary ==================
# This script performs the following preprocessing steps for each WDI data vintage:
# 1. Read and merge economic and governance indicator panels.
# 2. Harmonize indicator names and country names (using ISO3 and manual mapping).
# 3. Forward- and backward-fill missing numeric data within each country.
# 4. Set non-positive values (for CO2, FDI, GDP) to NaN and refill.
# 5. Impute missing 'credit' by year-median and then overall median if necessary.
# 6. Add natural-log columns (ln_co2, ln_fdi, ln_gdp) for CO2, FDI, GDP, and round all numerics to 6 decimals.
# 7. Export a cleaned, harmonized panel.
# 8. Compute first differences for all numeric columns and export for dynamic panel modeling.

import pandas as pd
import numpy as np
import pycountry
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# ---------- File and variable parameters ----------
# Raw country-year panel CSV per data vintage. The key is the vintage tag
# that is later embedded in the output file names.
raw_files = {
    '2020': 'C1_raw_2020-04-09.csv',
    '2023': 'C1_raw_2023-06-29.csv',
}

# Governance-indicator panel merged into every vintage.
wgi_file = 'WGI_1996_2016.csv'

# Manual country-name harmonization: long-form name -> short name written
# to the output files. Names not listed here are left as they are.
override_map = {
    'United States of America': 'United States',
    'Russian Federation': 'Russia',
    'Korea, Republic of': 'South Korea',
    'Iran, Islamic Republic of': 'Iran',
    'Türkiye': 'Turkey',
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Egypt, Arab Republic of': 'Egypt',
}

# WDI indicator code -> output column name. Insertion order matters: it
# determines the order of `econ_cols` below.
rename_map = {
    'EN.ATM.CO2E.PC': 'co2',
    'NY.GDP.PCAP.CD': 'gdp',
    'BX.KLT.DINV.WD.GD.ZS': 'fdi',
    'EG.USE.PCAP.KG.OE': 'energy',
    'FS.AST.DOMS.GD.ZS': 'credit',
    'NE.TRD.GNFS.ZS': 'trade',
    'NE.GDI.TOTL.ZS': 'investment',
}

# Numeric column groups: economic indicators, governance indicators, and
# their concatenation (every numeric column that gets filled and rounded).
econ_cols = [*rename_map.values()]
wgi_cols = [
    'gov_effect', 'corruption', 'rule_of_law',
    'voice', 'stability', 'reg_quality',
]
all_num = [*econ_cols, *wgi_cols]

# ---------- Read WGI governance indicators and harmonize columns ----------
# Source column -> harmonized column. 'iso3' becomes 'country' so the frame
# shares the ('country', 'year') merge key used on the economic panels.
_wgi_rename = {
    'iso3': 'country',
    'GE.EST': 'gov_effect',
    'CC.EST': 'corruption',
    'RL.EST': 'rule_of_law',
    'VA.EST': 'voice',
    'PV.EST': 'stability',
    'RQ.EST': 'reg_quality',
}
wgi = pd.read_csv(wgi_file)
wgi = wgi.rename(columns=_wgi_rename)

# ---------- Helpers for the main processing loop ----------
def _fill_within_country(frame, cols):
    """Forward- then backward-fill ``cols`` within each country group.

    ``frame`` should already be sorted by year within country so that the
    forward fill carries earlier years into later ones. The result keeps the
    row index of ``frame`` and can be assigned straight back to ``frame[cols]``.
    """
    return (
        frame.groupby('country', group_keys=False)[cols]
             .apply(lambda g: g.ffill().bfill())
    )


def _log_positive(values):
    """Return the natural log of ``values``; entries that are not > 0 become NaN."""
    # Mask first so np.log never sees a non-positive number.
    return np.log(values.where(values > 0))


def iso3_to_name(code):
    """Convert an ISO3 country code to a harmonized English country name.

    Codes that pycountry cannot resolve are returned unchanged. Resolved
    names that appear in ``override_map`` are replaced by their short form.
    """
    try:
        country = pycountry.countries.get(alpha_3=code)
    except LookupError:
        # NOTE(review): depending on the pycountry version, an unknown or
        # non-string code may raise (KeyError is a LookupError) instead of
        # returning None; treat both the same and keep the code as-is.
        country = None
    name = country.name if country else code
    return override_map.get(name, name)


# ---------- Column layout shared by every vintage ----------
# Indicators that must be strictly positive, and their log counterparts.
positive_cols = ['co2', 'fdi', 'gdp']
log_cols = ['ln_co2', 'ln_fdi', 'ln_gdp']

# Column order of the cleaned panel.
final_cols = [
    'country', 'year',
    'co2', 'fdi', 'gov_effect', 'gdp',
    'ln_co2', 'ln_fdi', 'ln_gdp',
    'credit', 'energy',
    'corruption', 'rule_of_law',
    'trade', 'voice', 'stability',
    'reg_quality', 'investment',
]
# Every non-key column is differenced; deriving the list keeps it in step
# with `final_cols` instead of maintaining a second hand-written copy.
diff_cols = [name for name in final_cols if name not in ('country', 'year')]
diff_cols_prefixed = [f'D_{col}' for col in diff_cols]

# ---------- Main processing loop for each data vintage ----------
for tag, raw_path in raw_files.items():
    # 1-2. Read the raw panel and rename WDI indicator codes to short names
    raw = pd.read_csv(raw_path).rename(columns=rename_map)

    # 3. Merge WGI indicators (by country and year); a left join keeps every
    #    row of the economic panel even when no governance data exists
    df = raw.merge(
        wgi[['country', 'year'] + wgi_cols],
        on=['country', 'year'], how='left'
    )

    # 4. Panel fill: sort, then fill numeric columns forward/backward within country
    df = df.sort_values(['country', 'year'])
    df[all_num] = _fill_within_country(df, all_num)

    # 5. Blank out non-positive (<=0) values of the core indicators, then
    #    refill the holes this creates
    df[positive_cols] = df[positive_cols].where(df[positive_cols] > 0)
    df[all_num] = _fill_within_country(df, all_num)

    # 6. Fill missing credit values: first by year-median, then by the overall
    #    median for rows that are still missing
    df['credit'] = df.groupby('year')['credit'] \
                     .transform(lambda x: x.fillna(x.median()))
    overall_med = df['credit'].median()
    df['credit'] = df['credit'].fillna(overall_med)

    # 7. Add log columns for the core indicators and round all numerics to 6 decimals
    for src, dst in zip(positive_cols, log_cols):
        df[dst] = _log_positive(df[src])
    rounded_cols = all_num + log_cols
    df[rounded_cols] = df[rounded_cols].round(6)

    # 8. Convert ISO3 country codes to harmonized English names
    df['country'] = df['country'].apply(iso3_to_name)

    # 9. Export cleaned, harmonized panel
    df_final = df[final_cols]
    out_name = f'C1_cleaned_data_{tag}.csv'
    df_final.to_csv(out_name, index=False)
    print(f"✅ {out_name} generated — {len(df_final)} rows")

    # 10. Generate first-differenced dataset for dynamic panel models.
    #     Re-sort because the grouping key is now the country name, then
    #     difference each column within country (first row per country is NaN).
    df_diff = df_final.sort_values(['country', 'year'])
    df_diff[diff_cols] = (
        df_diff.groupby('country', group_keys=False)[diff_cols]
               .diff()
    )
    df_diff = df_diff.rename(columns=dict(zip(diff_cols, diff_cols_prefixed)))

    # 11. Export the differenced dataset
    out_name_diff = f'C1_diff_data_{tag}.csv'
    df_diff[['country', 'year'] + diff_cols_prefixed] \
        .to_csv(out_name_diff, index=False)
    print(f"✅ {out_name_diff} generated — {len(df_diff)} rows")
