In [2]:
from pathlib import Path

import pandas as pd

# Load dataset
df = pd.read_csv("../data/raw/infrastructure/total-oda-for-infrastructure-by-recipient.csv")

# Preview data. info() prints its report and returns None, so it gets its own
# line; head() is left as the final expression so the cell shows the rich table
# instead of the plain-text repr of a (DataFrame, None) tuple.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3698 entries, 0 to 3697
Data columns (total 4 columns):
 #   Column                                                                                                                                     Non-Null Count  Dtype  
---  ------                                                                                                                                     --------------  -----  
 0   Entity                                                                                                                                     3698 non-null   object 
 1   Code                                                                                                                                       3482 non-null   object 
 2   Year                                                                                                                                       3698 non-null   int64  
 3   9.a.1 - Total official flows for infrastructure, 

(        Entity Code  Year  \
 0  Afghanistan  AFG  2000   
 1  Afghanistan  AFG  2001   
 2  Afghanistan  AFG  2002   
 3  Afghanistan  AFG  2003   
 4  Afghanistan  AFG  2004   
 
    9.a.1 - Total official flows for infrastructure, by recipient countries (millions of constant 2023 United States dollars) - DC_TOF_INFRAL  
 0                                           440000.0                                                                                          
 1                                           420000.0                                                                                          
 2                                         27640000.0                                                                                          
 3                                        126350000.0                                                                                          
 4                                        533680000.0                                             

In [3]:
# Swap the verbose source headers for short snake_case names. The value
# column's header is very long, so it is picked up by position (fourth column)
# rather than typed out.
# NOTE(review): the source header says "millions of constant 2023 United States
# dollars", but the previewed magnitudes (e.g. 440000.0) look like whole
# dollars -- confirm the unit, since the short name no longer carries it.
df = df.rename(
    columns=dict(
        zip(
            ["Entity", "Code", "Year", df.columns[3]],
            ["country", "iso_code", "year", "oda_infrastructure"],
        )
    )
)

df.head()


Unnamed: 0,country,iso_code,year,oda_infrastructure
0,Afghanistan,AFG,2000,440000.0
1,Afghanistan,AFG,2001,420000.0
2,Afghanistan,AFG,2002,27640000.0
3,Afghanistan,AFG,2003,126350000.0
4,Afghanistan,AFG,2004,533680000.0


In [5]:
# Convert the value column to numeric. pd.to_numeric does not parse thousands
# separators on its own, so commas are stripped from string entries first.
# errors="coerce" turns anything still unparseable into NaN; those entries are
# counted and reported so the coercion is never silent.
oda_raw = df["oda_infrastructure"]
if not pd.api.types.is_numeric_dtype(oda_raw):
    oda_raw = oda_raw.map(
        lambda value: value.replace(",", "").strip() if isinstance(value, str) else value
    )
oda_numeric = pd.to_numeric(oda_raw, errors="coerce")

# Values that were present before the conversion but are NaN afterwards.
newly_missing = int((oda_numeric.isna() & df["oda_infrastructure"].notna()).sum())
if newly_missing:
    print(f"Warning: {newly_missing} oda_infrastructure value(s) could not be parsed and are now NaN")

df["oda_infrastructure"] = oda_numeric

# Ensure year is integer
df["year"] = df["year"].astype(int)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3698 entries, 0 to 3697
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             3698 non-null   object 
 1   iso_code            3482 non-null   object 
 2   year                3698 non-null   int64  
 3   oda_infrastructure  3698 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 115.7+ KB


In [6]:
# Remember the starting size so the effect of the filter can be reported.
rows_before = len(df)

# Keep only rows that carry an ISO code, then renumber the index from zero.
df = df.dropna(subset=["iso_code"]).reset_index(drop=True)

print("Rows before:", rows_before)
print("Rows after :", len(df))
print("Rows removed:", rows_before - len(df))


Rows before: 3698
Rows after : 3482
Rows removed: 216


In [7]:
# Percentage of missing entries in each column.
print("Missing values (%):", df.isna().mean().mul(100), sep="\n")

# Descriptive statistics for the numeric columns.
print("\nSummary statistics:", df.describe(), sep="\n")


Missing values (%):
country               0.0
iso_code              0.0
year                  0.0
oda_infrastructure    0.0
dtype: float64

Summary statistics:
              year  oda_infrastructure
count  3482.000000        3.482000e+03
mean   2011.414991        2.820057e+08
std       6.862488        6.493082e+08
min    2000.000000       -2.300000e+05
25%    2005.250000        1.103750e+07
50%    2011.000000        6.798500e+07
75%    2017.000000        2.509575e+08
max    2023.000000        1.053795e+10


In [8]:
# Flag rows whose flow is negative or lies above the 99.9th percentile of the
# value column.
unusual = df.loc[
    lambda frame: frame["oda_infrastructure"].lt(0)
    | frame["oda_infrastructure"].gt(frame["oda_infrastructure"].quantile(0.999))
]

# Show the flagged rows, or a short message when there are none.
"No unusual values found" if unusual.empty else unusual


Unnamed: 0,country,iso_code,year,oda_infrastructure
1191,Grenada,GRD,2007,-230000.0
1370,India,IND,2018,7251460000.0
1371,India,IND,2019,7548670000.0
1374,India,IND,2022,8097910000.0
1375,India,IND,2023,10537950000.0
2305,North Korea,PRK,2010,-30000.0


In [9]:
# List the rows whose recorded flow is below zero.
negatives = df.loc[df["oda_infrastructure"].lt(0)]
negatives


Unnamed: 0,country,iso_code,year,oda_infrastructure
1191,Grenada,GRD,2007,-230000.0
2305,North Korea,PRK,2010,-30000.0


In [10]:
# Write the cleaned frame to the interim data folder, without the index column.
output_path = "../data/interim/oda_infrastructure_clean.csv"

# to_csv does not create missing directories, so make sure the target folder
# exists first (e.g. on a fresh checkout where data/interim is absent).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Saved cleaned dataset to: {output_path}")


Saved cleaned dataset to: ../data/interim/oda_infrastructure_clean.csv
