In [4]:
import pandas as pd

# Load the unified dataset with every column kept as text (dtype=str), so no
# column is type-inferred on read. Edit the path if the CSV lives somewhere else.
df = pd.read_csv(
    "data/raw/ethiopia_fi_unified_data.csv",
    dtype=str,
    low_memory=False,
)

# Quick overview: frame dimensions, then how many rows each record_type has
# (dropna=False keeps rows with a missing record_type in the tally).
print("Shape:", df.shape)
print(
    "\nrecord_type distribution:",
    df["record_type"].value_counts(dropna=False),
    sep="\n",
)

print("\nYears / dates coverage:")
# Build a single 'date' column by coalescing the known date-like columns in
# priority order: for each row, the first listed column holding a parseable
# date wins.
date_cols = ["observation_date", "event_date", "period_end", "period_start", "collection_date"]
first_date = None
for col in date_cols:
    if col not in df.columns:
        continue
    # Parse each column on its own *before* coalescing. Coalescing the raw
    # strings and parsing once afterwards would turn an unparseable value in a
    # higher-priority column into NaT even when a lower-priority column has a
    # valid date for that row, and would push columns that may use different
    # date formats through a single parse.
    parsed = pd.to_datetime(df[col], errors="coerce")
    first_date = parsed if first_date is None else first_date.combine_first(parsed)
# If no candidate column exists, fall back to an all-NaT datetime Series so
# the .dt accessor below still works.
if first_date is None:
    first_date = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
df["date"] = first_date
# value_counts drops NaT by default, so rows without any parseable date are
# not shown in the per-year tally.
print(df["date"].dt.year.value_counts().sort_index())

# Frequency table of indicator_code, most common first, cut to the first 15 rows.
print(
    "\nUnique indicator_codes (top 15 most frequent):",
    df["indicator_code"].value_counts().iloc[:15],
    sep="\n",
)

print("\nEvents only:")
# Keep only the event rows and project them onto the display columns.
# reindex (instead of plain column selection) means a column that is absent
# from the frame, e.g. 'description', appears as all-NaN rather than raising
# KeyError. The unified 'date' column is used so a missing 'event_date' column
# cannot break the listing.
events = df.loc[df["record_type"].eq("event")].reindex(
    columns=["date", "category", "description", "source_name"]
)
print(events.sort_values(by="date").to_string(index=False))

print("\nColumns:", list(df.columns))

Shape: (43, 34)

record_type distribution:
record_type
observation    30
event          10
target          3
Name: count, dtype: int64

Years / dates coverage:
date
2014     1
2017     1
2021     7
2022     1
2023     2
2024    14
2025    15
2028     1
2030     1
Name: count, dtype: int64

Unique indicator_codes (top 15 most frequent):
indicator_code
ACC_OWNERSHIP         7
ACC_FAYDA             4
ACC_MM_ACCOUNT        2
ACC_4G_COV            2
USG_P2P_COUNT         2
GEN_GAP_ACC           2
GEN_MM_SHARE          2
ACC_MOBILE_PEN        1
USG_P2P_VALUE         1
USG_ATM_COUNT         1
USG_ATM_VALUE         1
USG_CROSSOVER         1
USG_TELEBIRR_USERS    1
USG_TELEBIRR_VALUE    1
USG_MPESA_USERS       1
Name: count, dtype: int64

Events only:
      date       category  description   source_name
2021-05-17 product_launch          NaN Ethio Telecom
2021-09-01         policy          NaN           NBE
2022-08-01   market_entry          NaN          News
2023-08-01 product_launch          