In [2]:
import pandas as pd

# Load dataset
file_path = "../data/raw/infrastructure/per-capita-electricity-generation.csv"
df = pd.read_csv(file_path)

# Preview
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6831 entries, 0 to 6830
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Entity                        6831 non-null   object 
 1   Code                          6418 non-null   object 
 2   Year                          6831 non-null   int64  
 3   Per capita electricity - kWh  6831 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 213.6+ KB


(        Entity Code  Year  Per capita electricity - kWh
 0  Afghanistan  AFG  2000                     23.844612
 1  Afghanistan  AFG  2001                     34.016453
 2  Afghanistan  AFG  2002                     33.211520
 3  Afghanistan  AFG  2003                     40.029820
 4  Afghanistan  AFG  2004                     33.530476,
 None)

In [4]:
# Rename columns
df = df.rename(columns={
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
    df.columns[3]: "electricity_gen_per_capita"
})

df.head()


Unnamed: 0,country,iso_code,year,electricity_gen_per_capita
0,Afghanistan,AFG,2000,23.844612
1,Afghanistan,AFG,2001,34.016453
2,Afghanistan,AFG,2002,33.21152
3,Afghanistan,AFG,2003,40.02982
4,Afghanistan,AFG,2004,33.530476


In [5]:
# Missing values
print("Missing values (%):")
print(df.isna().mean() * 100)

# Summary statistics
print("\nSummary statistics:")
print(df.describe())

# Detect unusual/extreme values
unusual = df[(df["electricity_gen_per_capita"] < 0) | 
             (df["electricity_gen_per_capita"] > df["electricity_gen_per_capita"].quantile(0.999))]
unusual if len(unusual) > 0 else "No unusual values found"


Missing values (%):
country                       0.000000
iso_code                      6.045967
year                          0.000000
electricity_gen_per_capita    0.000000
dtype: float64

Summary statistics:
              year  electricity_gen_per_capita
count  6831.000000                 6831.000000
mean   2007.781438                 4048.852225
std      10.201073                 5068.229949
min    1985.000000                    0.000000
25%    2001.000000                  656.738720
50%    2009.000000                 2574.200700
75%    2016.000000                 5749.394700
max    2024.000000                56048.727000


Unnamed: 0,country,iso_code,year,electricity_gen_per_capita
2798,Iceland,ISL,2012,53887.945
2799,Iceland,ISL,2013,55543.375
2800,Iceland,ISL,2014,54348.32
2801,Iceland,ISL,2015,56048.727
2802,Iceland,ISL,2016,54146.516
2803,Iceland,ISL,2017,55510.758
2804,Iceland,ISL,2018,55801.168


In [6]:
# Find rows where iso_code is missing
missing_iso = df[df['iso_code'].isna()]

# Show unique countries without ISO codes
missing_countries = missing_iso['country'].unique()
missing_countries


array(['Africa', 'Asia', 'Europe', 'European Union (27)',
       'High-income countries', 'Low-income countries',
       'Lower-middle-income countries', 'North America', 'Oceania',
       'South America', 'Upper-middle-income countries'], dtype=object)

In [7]:
# Remove region-level entries (those without ISO codes)
rows_before = len(df)

df = df[df['iso_code'].notna()].copy()
df.reset_index(drop=True, inplace=True)

rows_after = len(df)
print("Rows before:", rows_before)
print("Rows after :", rows_after)
print("Rows removed:", rows_before - rows_after)


Rows before: 6831
Rows after : 6418
Rows removed: 413


In [8]:
output_path = "../data/interim/electricity_generation_clean.csv"
df.to_csv(output_path, index=False)
print(f"Saved cleaned dataset to: {output_path}")


Saved cleaned dataset to: ../data/interim/electricity_generation_clean.csv
