In [1]:
from pathlib import Path

import pandas as pd

# Load the raw clean-cooking-access dataset.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv("../data/raw/infrastructure/access-to-clean-fuels-and-technologies-for-cooking.csv")

# Preview the structure and the first rows.
# df.info() prints its report and returns None, so it is called on its own line
# rather than bundled into a tuple with df.head(); keeping df.head() as the last
# expression lets the notebook render it as a table instead of a text repr
# followed by a stray None.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5148 entries, 0 to 5147
Data columns (total 4 columns):
 #   Column                                                                                                                       Non-Null Count  Dtype  
---  ------                                                                                                                       --------------  -----  
 0   Entity                                                                                                                       5148 non-null   object 
 1   Code                                                                                                                         4608 non-null   object 
 2   Year                                                                                                                         5148 non-null   int64  
 3   Proportion of population with primary reliance on clean fuels and technologies for cooking (%) - Residence area type: T

(        Entity Code  Year  \
 0  Afghanistan  AFG  2000   
 1  Afghanistan  AFG  2001   
 2  Afghanistan  AFG  2002   
 3  Afghanistan  AFG  2003   
 4  Afghanistan  AFG  2004   
 
    Proportion of population with primary reliance on clean fuels and technologies for cooking (%) - Residence area type: Total  
 0                                               6.20                                                                            
 1                                               7.10                                                                            
 2                                               8.30                                                                            
 3                                               9.35                                                                            
 4                                              10.80                                                                            ,
 None)

In [2]:
# Map the raw column headers to the short snake_case names that the
# following cells refer to (country, iso_code, year, clean_cooking_access).
column_names = {
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
    "Proportion of population with primary reliance on clean fuels and technologies for cooking (%) - Residence area type: Total": "clean_cooking_access",
}
df = df.rename(columns=column_names)

df.head()


Unnamed: 0,country,iso_code,year,clean_cooking_access
0,Afghanistan,AFG,2000,6.2
1,Afghanistan,AFG,2001,7.1
2,Afghanistan,AFG,2002,8.3
3,Afghanistan,AFG,2003,9.35
4,Afghanistan,AFG,2004,10.8


In [3]:
# Share of missing entries in each column, expressed as a percentage
missing_percentage = df.isna().mean().mul(100)
print("Missing values (%):")
print(missing_percentage)

# Descriptive statistics for the numeric columns
numeric_summary = df.describe()
print("\nSummary statistics:")
print(numeric_summary)

# The indicator is a percentage, so anything outside 0–100 is impossible.
# Missing values compare False on both sides and are therefore not reported
# here; they are covered by the missing-value check above.
access = df["clean_cooking_access"]
unusual = df[access.lt(0) | access.gt(100)]
print("\nUnusual values found:")
if len(unusual) > 0:
    print(unusual)
else:
    print("None")


Missing values (%):
country                  0.00000
iso_code                10.48951
year                     0.00000
clean_cooking_access     0.00000
dtype: float64

Summary statistics:
              year  clean_cooking_access
count  5148.000000           5148.000000
mean   2011.301865             64.179439
std       7.126939             38.197445
min    1990.000000              0.000000
25%    2005.000000             25.800000
50%    2011.000000             83.300000
75%    2017.000000            100.000000
max    2023.000000            100.000000

Unusual values found:
None


In [4]:
# Report the size of the frame before filtering
print("Rows before:", len(df))

# Rows with no ISO code are treated as non-country aggregates; list the
# entities that are about to be dropped so the removal can be eyeballed.
# NOTE(review): this only tests that a code is present, not that it is a
# valid ISO code — confirm that no aggregate rows carry a code.
has_iso_code = df["iso_code"].notna()
removed_regions = df.loc[~has_iso_code, "country"].unique()
print("Removing these non-country aggregates:", removed_regions)

# Keep the rows that do have a code and renumber the index from zero
df = df[has_iso_code].reset_index(drop=True)

print("Rows after:", len(df))
df.head()


Rows before: 5148
Removing these non-country aggregates: ['Africa (WHO)' 'Americas (WHO)' 'Australia and New Zealand (UN)'
 'Central Asia (UN)' 'Central and Southern Asia (UN)' 'Eastern Asia (UN)'
 'Eastern Asia and South-Eastern Asia (UN)' 'Eastern Mediterranean (WHO)'
 'Europe (WHO)' 'Latin America and the Caribbean (UN)'
 'Northern Africa (UN)' 'Northern Africa and Western Asia (UN)'
 'Northern America and Europe (UN)'
 'Oceania (exc. Australia and New Zealand) (UN)' 'South-East Asia (WHO)'
 'South-Eastern Asia (UN)' 'Southern Asia (UN)' 'Sub-Saharan Africa (UN)'
 'Western Asia (UN)' 'Western Pacific (WHO)']
Rows after: 4608


Unnamed: 0,country,iso_code,year,clean_cooking_access
0,Afghanistan,AFG,2000,6.2
1,Afghanistan,AFG,2001,7.1
2,Afghanistan,AFG,2002,8.3
3,Afghanistan,AFG,2003,9.35
4,Afghanistan,AFG,2004,10.8


In [5]:
# Build an alphabetical list of the distinct country names so that odd
# spellings can be spotted by eye (optional sanity check).
unique_countries = df["country"].unique().tolist()
unique_countries.sort()
unique_countries[:30]  # preview only the first 30 names


['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada']

In [7]:
# Save the cleaned clean-fuels-for-cooking dataset as an interim CSV
output_path = "../data/interim/clean_fuels_cooking.csv"

# to_csv does not create missing folders, so make sure the target directory
# exists first (e.g. on a fresh checkout where data/interim/ is absent).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the RangeIndex carries no information, so it is not written out
df.to_csv(output_path, index=False)

print(f"Saved cleaned dataset to: {output_path}")


Saved cleaned dataset to: ../data/interim/clean_fuels_cooking.csv
