# Task 1: Data Exploration & Enrichment
Objective: Understand schema, explore starter data, add high-value observations/events, document changes.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 120)

In [2]:
df = pd.read_csv("../data/raw/ethiopia_fi_unified_data.csv", low_memory=False)
ref_codes = pd.read_csv("../data/raw/reference_codes.csv")

print("Shape:", df.shape)
print(df.record_type.value_counts(dropna=False))
print("\nUnique pillars:", df.pillar.unique())
print("\nUnique indicator_codes:", sorted(df.indicator_code.dropna().unique()))

Shape: (43, 34)
record_type
observation    30
event          10
target          3
Name: count, dtype: int64

Unique pillars: <StringArray>
['ACCESS', 'USAGE', 'AFFORDABILITY', 'GENDER', nan]
Length: 5, dtype: str

Unique indicator_codes: ['ACC_4G_COV', 'ACC_FAYDA', 'ACC_MM_ACCOUNT', 'ACC_MOBILE_PEN', 'ACC_OWNERSHIP', 'AFF_DATA_INCOME', 'EVT_CROSSOVER', 'EVT_ETHIOPAY', 'EVT_FAYDA', 'EVT_FX_REFORM', 'EVT_MPESA', 'EVT_MPESA_INTEROP', 'EVT_NFIS2', 'EVT_SAFARICOM', 'EVT_SAFCOM_PRICE', 'EVT_TELEBIRR', 'GEN_GAP_ACC', 'GEN_GAP_MOBILE', 'GEN_MM_SHARE', 'USG_ACTIVE_RATE', 'USG_ATM_COUNT', 'USG_ATM_VALUE', 'USG_CROSSOVER', 'USG_MPESA_ACTIVE', 'USG_MPESA_USERS', 'USG_P2P_COUNT', 'USG_P2P_VALUE', 'USG_TELEBIRR_USERS', 'USG_TELEBIRR_VALUE']


## 4. Schema & Basic Counts

In [6]:
print("\nPillar distribution:\n", df_raw['pillar'].value_counts(dropna=False))
print("\nConfidence levels:\n", df_raw['confidence'].value_counts(dropna=False))
print("\nSource types (if present):\n", df_raw.get('source_type', pd.Series()).value_counts(dropna=False))

# Events only
events = df_raw[df_raw['record_type'] == 'event']
print("\nEvents count:", len(events))
print("Event categories:\n", events['category'].value_counts(dropna=False))

NameError: name 'df_raw' is not defined

## 6. Temporal Coverage

In [4]:
df_raw['year'] = df_raw['observation_date'].dt.year
temporal = df_raw[df_raw['record_type'] == 'observation'].groupby(['year', 'indicator_code']).size().unstack(fill_value=0)
print("Observations per year and indicator:\n", temporal)

# Simple heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(temporal.T, annot=True, cmap="YlGnBu", fmt="d")
plt.title("Observation Coverage by Year and Indicator")
plt.savefig("../reports/figures/temporal_coverage.png")
plt.show()

NameError: name 'df_raw' is not defined

## 8. Key Indicator Coverage – Focus on Access & Usage

In [None]:
acc_own = df_raw[(df_raw['indicator_code'] == 'ACC_OWNERSHIP') & (df_raw['record_type'] == 'observation')]
acc_own = acc_own.sort_values('observation_date')
print("Account Ownership history:\n", acc_own[['observation_date', 'value_numeric', 'source_name', 'source_url']])

plt.figure(figsize=(10, 5))
plt.plot(acc_own['observation_date'], acc_own['value_numeric'], marker='o', color='blue')
plt.title("Ethiopia Account Ownership Rate (Findex trajectory)")
plt.ylabel("% Adults (15+) with account")
plt.grid(True)
plt.savefig("../reports/figures/acc_ownership_historical.png")
plt.show()

In [None]:
mm_acc = df_raw[(df_raw['indicator_code'].isin(['ACC_MM_ACCOUNT', 'USG_DIGITAL_PAYMENT', 'USG_MOBILE_MONEY'])) & 
                (df_raw['record_type'] == 'observation')]
mm_acc = mm_acc.sort_values(['indicator_code', 'observation_date'])

print("Mobile Money & Digital Usage coverage:\n", mm_acc[['indicator_code', 'observation_date', 'value_numeric', 'source_name']])

## 11. Existing Impact Links Summary

In [None]:
impact = df_raw[df_raw['record_type'] == 'impact_link']
print("Impact links count:", len(impact))
print(impact[['parent_id', 'related_indicator', 'impact_direction', 'impact_magnitude', 'lag_months', 'evidence_basis']].head(10))

In [None]:
# New observations from recent sources (2024-2025 real data)
new_obs = [
    {'record_type': 'observation', 'indicator_code': 'ACC_OWNERSHIP', 'value_numeric': 49.0, 
     'observation_date': pd.to_datetime('2024-12-31'), 'source_name': 'World Bank Global Findex 2025', 
     'source_url': 'https://www.worldbank.org/en/publication/globalfindex', 'confidence': 'high', 
     'notes': 'Latest Findex; +3pp from 2021 despite massive registration growth', 'collected_by': 'YourName', 
     'collection_date': datetime.now().strftime('%Y-%m-%d')},
    
    {'record_type': 'observation', 'indicator_code': 'ACC_MM_ACCOUNT', 'value_numeric': 19.4, 
     'observation_date': pd.to_datetime('2024-12-31'), 'source_name': 'Global Findex 2025 (via Shega/BirrMetrics)', 
     'source_url': 'https://digitalfinance.shega.co/insights/articles/findex-2025-and-ethiopia-s-digital-financial-leap-momentum-without-maturity', 
     'confidence': 'high', 'notes': 'Mobile money account ownership jumped significantly 2021–2024'},
    
    {'record_type': 'observation', 'indicator_code': 'USG_DIGITAL_PAYMENT', 'value_numeric': 21.0, 
     'observation_date': pd.to_datetime('2024-12-31'), 'source_name': 'Global Findex 2025 (via BirrMetrics/LinkedIn ref)', 
     'source_url': 'https://www.linkedin.com/posts/birrmetrics_the-world-banks-global-findex-2025-is-out-activity-7351289714722717696-wyaj', 
     'confidence': 'medium', 'notes': 'Only ~21% made/received digital payment in 2024'},
    
    {'record_type': 'observation', 'indicator_code': 'MOBILE_MONEY_ACCOUNTS_TOTAL', 'value_numeric': 136000000, 
     'observation_date': pd.to_datetime('2025-12-01'), 'source_name': 'National Bank of Ethiopia / BirrMetrics', 
     'source_url': 'https://birrmetrics.com/ethiopias-mobile-money-users-reach-136-million-digital-transactions-top-96-trillion-birr', 
     'confidence': 'high', 'notes': 'Registered mobile money accounts – key supply-side proxy'},
]

# New events (policy / product)
new_events = [
    {'record_type': 'event', 'category': 'policy', 'observation_date': pd.to_datetime('2025-06-01'), 
     'source_name': 'Fayda mandatory linkage for accounts in Addis', 'source_url': 'https://www.worldbank.org/en/news/feature/2025/02/27/the-transformative-power-of-ethiopia-afe-digital-id-unlocking-a-better-future-for-all', 
     'confidence': 'high', 'notes': 'Digital ID linkage expected to ease KYC/account opening'},
    
    {'record_type': 'event', 'category': 'policy', 'observation_date': pd.to_datetime('2025-12-08'), 
     'source_name': 'National Digital Payments Strategy 2026-2030 Draft', 'source_url': 'https://nbe.gov.et/wp-content/uploads/2025/12/Ethiopia_NDPS_Draft_F.pdf', 
     'confidence': 'high', 'notes': 'Framework for instant payments, inclusion, interoperability'},
]

df_enriched = pd.concat([df_raw] + [pd.DataFrame(new_obs), pd.DataFrame(new_events)], ignore_index=True)
df_enriched.to_csv("../data/processed/ethiopia_fi_unified_enriched_20260131.csv", index=False)
print("Enriched shape:", df_enriched.shape)

Added:  
- 2024 Findex benchmarks (ACC_OWNERSHIP 49%, ACC_MM_ACCOUNT 19.4%, USG_DIGITAL_PAYMENT ~21%)  
- 2025 NBE mobile money registrations (136M)  
- Key 2025 events (Fayda push, NDPS draft)  

See `data_enrichment_log.md` for full documentation, original quotes, confidence rationale.