# Minimal Reproduction: Hodler & Raschky (2014)
# *Regional Favoritism*

**Main specification:** Do leaders channel economic resources to their birth regions?

$$\ln(\text{light}_{i,t}) = \beta \cdot \text{BirthRegionLeader}_{i,t} + \alpha_i + \gamma_{c,t} + \varepsilon_{i,t}$$

- $\alpha_i$: region (ADM1) fixed effects  
- $\gamma_{c,t}$: country $\times$ year fixed effects  
- $\text{BirthRegionLeader}_{i,t} = 1$ if region $i$ is the birth region of the national leader in year $t$

**Data:**
- Political leaders: PLAD (Bomprezzi et al., 2025)
- Nightlights: Harmonized DMSP-OLS / VIIRS (Li et al., 2020)
- Admin boundaries: GADM v4.1

In [2]:
import os
import warnings
import zipfile
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
from linearmodels.panel import PanelOLS

warnings.filterwarnings("ignore")

ROOT = Path("..")  # project root
DATA = ROOT / "data"
NTL_DIR = DATA / "nightlights"
PLAD_PATH = DATA / "political leaders" / "PLAD_April_2024.dta"

## 1. Download GADM ADM1 boundaries

In [5]:
GADM_DIR = DATA / "gadm"
GADM_GPKG = GADM_DIR / "gadm_410.gpkg"
ADM1_CACHE = GADM_DIR / "gadm41_adm1.gpkg"

# --- Download GADM if needed ---
if not GADM_GPKG.exists() and not ADM1_CACHE.exists():
    GADM_DIR.mkdir(parents=True, exist_ok=True)
    url = "https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_410-gpkg.zip"
    zip_path = GADM_DIR / "gadm_410-gpkg.zip"
    print("Downloading GADM (~1.4 GB) — this may take a few minutes...")
    urllib.request.urlretrieve(url, zip_path)
    print("Extracting...")
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(GADM_DIR)
    zip_path.unlink()
    print("Done.")

# --- Dissolve to ADM1 level and cache ---
if ADM1_CACHE.exists():
    print(f"Loading cached ADM1 boundaries from {ADM1_CACHE}")
    adm1 = gpd.read_file(ADM1_CACHE)
else:
    print("Dissolving to ADM1 (this takes a few minutes on the first run)...")
    raw = gpd.read_file(GADM_GPKG)
    adm1 = raw.dissolve(by="GID_1", as_index=False)[["GID_0", "GID_1", "NAME_0", "NAME_1", "geometry"]]
    adm1.to_file(ADM1_CACHE, driver="GPKG")
    print(f"Cached ADM1 boundaries to {ADM1_CACHE}")

print(f"Loaded {len(adm1)} ADM1 regions")
adm1.head(2)

Loading cached ADM1 boundaries from ../data/gadm/gadm41_adm1.gpkg
Loaded 3662 ADM1 regions


Unnamed: 0,GID_0,GID_1,NAME_0,NAME_1,geometry
0,ATA,,Antarctica,,"MULTIPOLYGON (((-169.00626 -83.61875, -169.004..."
1,UKR,?,Ukraine,?,"MULTIPOLYGON (((30.59167 50.41236, 30.60611 50..."


## 2. Compute mean nightlights per ADM1-year (zonal statistics)

We compute the mean nightlight intensity within each ADM1 polygon for each year of the harmonized DMSP rasters (1992–2013, matching the original paper's DMSP era).

In [6]:
PANEL_CACHE = DATA / "nightlights_adm1_panel.parquet"
YEARS = range(1992, 2014)  # 1992-2013 inclusive (DMSP era)

def raster_path(year):
    """Return the path to the harmonized nightlights raster for a given year."""
    if year <= 2013:
        return NTL_DIR / f"Harmonized_DN_NTL_{year}_calDMSP.tif"
    else:
        return NTL_DIR / f"Harmonized_DN_NTL_{year}_simVIIRS.tif"

if PANEL_CACHE.exists():
    print(f"Loading cached panel from {PANEL_CACHE}")
    ntl_panel = pd.read_parquet(PANEL_CACHE)
else:
    frames = []
    for year in YEARS:
        rpath = raster_path(year)
        if not rpath.exists():
            print(f"  Skipping {year} — raster not found")
            continue
        print(f"  Processing {year}...", end=" ", flush=True)
        stats = zonal_stats(
            adm1.geometry,
            str(rpath),
            stats=["mean"],
            nodata=0,
        )
        means = [s["mean"] for s in stats]
        df_year = pd.DataFrame({
            "GID_1": adm1["GID_1"],
            "GID_0": adm1["GID_0"],
            "year": year,
            "ntl_mean": means,
        })
        frames.append(df_year)
        print(f"done (non-null: {df_year['ntl_mean'].notna().sum()})")

    ntl_panel = pd.concat(frames, ignore_index=True)
    ntl_panel.to_parquet(PANEL_CACHE)
    print(f"\nSaved panel to {PANEL_CACHE}")

print(f"Panel shape: {ntl_panel.shape}")
ntl_panel.head()

Loading cached panel from ../data/nightlights_adm1_panel.parquet
Panel shape: (80564, 4)


Unnamed: 0,GID_1,GID_0,year,ntl_mean
0,,ATA,1992,11.698439
1,?,UKR,1992,27.01626
2,AFG.10_1,AFG,1992,
3,AFG.11_1,AFG,1992,5.615385
4,AFG.12_1,AFG,1992,9.12069


## 3. Build treatment variable from PLAD

In [8]:
plad = pd.read_stata(PLAD_PATH)

# Keep domestic leaders with valid ADM1 identifiers
plad = plad[plad["foreign_leader"] == "0"].copy()
plad = plad[plad["gid_1"].str.strip() != "."].copy()
plad["startyear"] = plad["startyear"].astype(int)
plad["endyear"] = plad["endyear"].astype(int)

print(f"Leaders after filtering: {len(plad)}")
print(f"Unique countries: {plad['country'].nunique()}")
print(f"Year range: {plad['startyear'].min()}–{plad['endyear'].max()}")
plad[["leader", "country", "gid_1", "startyear", "endyear"]].head(10)

Leaders after filtering: 1241
Unique countries: 175
Year range: 1948–2023


Unnamed: 0,leader,country,gid_1,startyear,endyear
0,Najibullah,Afghanistan,AFG.26_1,1986,1992
1,Mojadidi,Afghanistan,AFG.14_1,1992,1992
2,Burhanuddin Rabbani,Afghanistan,AFG.1_1,1992,1996
3,Mullah Omar,Afghanistan,AFG.15_1,1996,2001
4,Hamid Karzai,Afghanistan,AFG.15_1,2001,2014
5,Ashraf Ghani Ahmadzai,Afghanistan,AFG.14_1,2014,2021
6,Hibatullah Akhundzada,Afghanistan,AFG.15_1,2021,2023
7,Alia,Albania,ALB.10_1,1985,1992
8,Berisha,Albania,ALB.8_1,1992,1997
9,Fatos Nano,Albania,ALB.11_1,1997,1998


In [9]:
# Expand leader spells into leader-year rows, then map to birth region-year
rows = []
for _, row in plad.iterrows():
    for y in range(max(row["startyear"], 1992), min(row["endyear"], 2013) + 1):
        rows.append({"GID_1": row["gid_1"], "GID_0": row["gid_0"], "year": y})

leader_years = pd.DataFrame(rows).drop_duplicates(subset=["GID_1", "year"])
leader_years["birth_region_leader"] = 1

print(f"Birth-region × year observations: {len(leader_years)}")
leader_years.head()

Birth-region × year observations: 4004


Unnamed: 0,GID_1,GID_0,year,birth_region_leader
0,AFG.26_1,AFG,1992,1
1,AFG.14_1,AFG,1992,1
2,AFG.1_1,AFG,1992,1
3,AFG.1_1,AFG,1993,1
4,AFG.1_1,AFG,1994,1


In [10]:
# Merge treatment onto panel
panel = ntl_panel.merge(leader_years[["GID_1", "year", "birth_region_leader"]],
                        on=["GID_1", "year"], how="left")
panel["birth_region_leader"] = panel["birth_region_leader"].fillna(0).astype(int)

# Construct log nightlights (add small constant to handle zeros)
panel["ln_ntl"] = np.log(panel["ntl_mean"] + 0.01)

# Create country-year identifier for FE
panel["country_year"] = panel["GID_0"] + "_" + panel["year"].astype(str)

# Drop rows with missing nightlights
panel = panel.dropna(subset=["ntl_mean"])

print(f"Final panel: {panel.shape[0]} obs, {panel['GID_1'].nunique()} regions, "
      f"{panel['year'].nunique()} years")
print(f"Treated obs (birth_region_leader=1): {panel['birth_region_leader'].sum()}")
print(f"Share treated: {panel['birth_region_leader'].mean():.4f}")
panel.head()

Final panel: 76111 obs, 3593 regions, 22 years
Treated obs (birth_region_leader=1): 3816
Share treated: 0.0501


Unnamed: 0,GID_1,GID_0,year,ntl_mean,birth_region_leader,ln_ntl,country_year
0,,ATA,1992,11.698439,0,2.46031,ATA_1992
1,?,UKR,1992,27.01626,0,3.296809,UKR_1992
3,AFG.11_1,AFG,1992,5.615385,0,1.727289,AFG_1992
4,AFG.12_1,AFG,1992,9.12069,0,2.211641,AFG_1992
5,AFG.13_1,AFG,1992,12.726562,0,2.544477,AFG_1992


## 4. Main regression

$$\ln(\text{light}_{i,t}) = \beta \cdot \text{BirthRegionLeader}_{i,t} + \alpha_i + \gamma_{c,t} + \varepsilon_{i,t}$$

Standard errors clustered at the country level, following Hodler & Raschky (2014).

In [11]:
# Main specification: region FE + country×year FE, clustered at country level
if not isinstance(panel.index, pd.MultiIndex):
    panel = panel.set_index(["GID_1", "year"])

model = PanelOLS.from_formula(
    "ln_ntl ~ birth_region_leader + EntityEffects",
    data=panel,
    other_effects=panel["country_year"],
    drop_absorbed=True,
)
result = model.fit(cov_type="clustered", clusters=panel["GID_0"])
print(result.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                 ln_ntl   R-squared:                     2.034e-05
Estimator:                   PanelOLS   R-squared (Between):              0.0001
No. Observations:               76111   R-squared (Within):            5.507e-06
Date:                Thu, Feb 12 2026   R-squared (Overall):              0.0001
Time:                        23:44:53   Log-likelihood                 6.339e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      1.3708
Entities:                        3593   P-value                           0.2417
Avg Obs:                       21.183   Distribution:                 F(1,67405)
Min Obs:                       1.0000                                           
Max Obs:                       22.000   F-statistic (robust):             0.4676
                            

## 5. Democracy interaction

Hodler & Raschky's key finding is that regional favoritism is concentrated in **autocracies**. We interact `birth_region_leader` with a democracy index to test this.

**Data:** V-Dem Electoral Democracy Index (`v2x_polyarchy`, 0–1).  
Download the **Country-Year Core** CSV from [v-dem.net/data](https://v-dem.net/data/the-v-dem-dataset/) (free, requires quick registration) and place the CSV file in `data/vdem/`.

In [12]:
VDEM_DIR = DATA / "vdem"
vdem_csv = list(VDEM_DIR.glob("*.csv")) if VDEM_DIR.exists() else []
assert vdem_csv, (
    "V-Dem CSV not found. Download the Country-Year Core CSV from "
    "https://v-dem.net/data/the-v-dem-dataset/ and place it in data/vdem/"
)

vdem = pd.read_csv(vdem_csv[0], low_memory=False)
# Keep only what we need: country code, year, polyarchy index
vdem = vdem[["country_text_id", "year", "v2x_polyarchy"]].rename(
    columns={"country_text_id": "GID_0", "v2x_polyarchy": "democracy"}
)
vdem = vdem.dropna(subset=["democracy"])
vdem = vdem[(vdem["year"] >= 1992) & (vdem["year"] <= 2013)]

print(f"V-Dem: {len(vdem)} country-year obs, {vdem['GID_0'].nunique()} countries")
vdem.head()

V-Dem: 3890 country-year obs, 179 countries


Unnamed: 0,GID_0,year,democracy
203,MEX,1992,0.437
204,MEX,1993,0.447
205,MEX,1994,0.47
206,MEX,1995,0.48
207,MEX,1996,0.508


In [13]:
# Merge democracy scores onto panel and build interaction
panel_dem = panel.reset_index().merge(vdem, on=["GID_0", "year"], how="left")
panel_dem = panel_dem.dropna(subset=["democracy"])
panel_dem["birth_x_democracy"] = panel_dem["birth_region_leader"] * panel_dem["democracy"]

print(f"Panel with democracy: {len(panel_dem)} obs")
panel_dem = panel_dem.set_index(["GID_1", "year"])

# Interaction specification:
# ln(light) = β1 * BirthRegionLeader + β2 * BirthRegionLeader × Democracy + α_i + γ_ct + ε
# β1 > 0: favoritism in full autocracies (democracy = 0)
# β2 < 0: favoritism attenuated in democracies
model_dem = PanelOLS.from_formula(
    "ln_ntl ~ birth_region_leader + birth_x_democracy + EntityEffects",
    data=panel_dem,
    other_effects=panel_dem["country_year"],
    drop_absorbed=True,
)
result_dem = model_dem.fit(cov_type="clustered", clusters=panel_dem["GID_0"])
print(result_dem.summary)

Panel with democracy: 66014 obs
                          PanelOLS Estimation Summary                           
Dep. Variable:                 ln_ntl   R-squared:                     5.922e-05
Estimator:                   PanelOLS   R-squared (Between):              0.0002
No. Observations:               66014   R-squared (Within):              -0.0001
Date:                Thu, Feb 12 2026   R-squared (Overall):              0.0002
Time:                        23:45:02   Log-likelihood                 5.489e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      1.7506
Entities:                        3115   P-value                           0.1737
Avg Obs:                       21.192   Distribution:                 F(2,59121)
Min Obs:                       1.0000                                           
Max Obs:                       22.000   F-statistic (robust):             0.5