In [2]:
from pathlib import Path

import pandas as pd

# Load the raw per-capita energy-use dataset.
df = pd.read_csv("../data/raw/infrastructure/per-capita-energy-use.csv")

# Preview the data structure. DataFrame.info() prints its report and returns
# None, so it is called as its own statement instead of being bundled into a
# tuple with head() (which displays as a plain-text tuple ending in None).
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11086 entries, 0 to 11085
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Entity                                              11086 non-null  object 
 1   Code                                                10442 non-null  object 
 2   Year                                                11086 non-null  int64  
 3   Primary energy consumption per capita (kWh/person)  11086 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 346.6+ KB


(        Entity Code  Year  Primary energy consumption per capita (kWh/person)
 0  Afghanistan  AFG  1980                                          481.20862 
 1  Afghanistan  AFG  1981                                          610.63900 
 2  Afghanistan  AFG  1982                                          717.76640 
 3  Afghanistan  AFG  1983                                          905.12665 
 4  Afghanistan  AFG  1984                                          887.37100 ,
 None)

In [3]:
# Map the source headers to short snake_case names. The fourth column is
# addressed by position rather than by its long header text.
column_renames = {
    "Entity": "country",
    "Code": "iso_code",
    "Year": "year",
}
column_renames[df.columns[3]] = "energy_use_per_capita"
df = df.rename(columns=column_renames)

df.head()


Unnamed: 0,country,iso_code,year,energy_use_per_capita
0,Afghanistan,AFG,1980,481.20862
1,Afghanistan,AFG,1981,610.639
2,Afghanistan,AFG,1982,717.7664
3,Afghanistan,AFG,1983,905.12665
4,Afghanistan,AFG,1984,887.371


In [4]:
# Normalise the dtypes of both value columns in a single step:
#   - energy_use_per_capita is parsed as numeric; entries that cannot be
#     parsed become NaN because of errors="coerce"
#   - year is cast to an integer dtype
df = df.assign(
    energy_use_per_capita=lambda frame: pd.to_numeric(
        frame["energy_use_per_capita"], errors="coerce"
    ),
    year=lambda frame: frame["year"].astype(int),
)


In [5]:
# Remember the size before filtering so the number of dropped rows can be reported.
rows_before = len(df)

# Keep only the rows that carry an ISO code, then renumber the index from zero.
df = df.dropna(subset=["iso_code"]).reset_index(drop=True)

rows_after = len(df)
print("Rows before:", rows_before)
print("Rows after :", rows_after)
print("Rows removed:", rows_before - rows_after)


Rows before: 11086
Rows after : 10442
Rows removed: 644


In [6]:
# Share of missing entries in each column, expressed as a percentage.
missing_pct = df.isna().mean().mul(100)
print("Missing values (%):")
print(missing_pct)

# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print("\nSummary statistics:")
print(summary_stats)


Missing values (%):
country                  0.0
iso_code                 0.0
year                     0.0
energy_use_per_capita    0.0
dtype: float64

Summary statistics:
               year  energy_use_per_capita
count  10442.000000           10442.000000
mean    1999.138767           25877.263552
std       15.073443           39087.795353
min     1965.000000               0.000000
25%     1988.000000            2661.509550
50%     2000.000000           12314.056000
75%     2012.000000           35315.624750
max     2024.000000          651836.560000


In [7]:
# Flag rows whose value is negative or lies above the 99.9th percentile.
energy = df["energy_use_per_capita"]
upper_cutoff = energy.quantile(0.999)
is_unusual = energy.lt(0) | energy.gt(upper_cutoff)
unusual = df.loc[is_unusual]

# Show the flagged rows, or a short message when nothing was flagged.
"No unusual values found" if len(unusual) == 0 else unusual


Unnamed: 0,country,iso_code,year,energy_use_per_capita
6450,Netherlands Antilles,ANT,1980,408208.56
6451,Netherlands Antilles,ANT,1981,370182.5
9921,United States Virgin Islands,VIR,2000,358436.47
9922,United States Virgin Islands,VIR,2001,519102.12
9923,United States Virgin Islands,VIR,2002,514122.53
9924,United States Virgin Islands,VIR,2003,606772.7
9925,United States Virgin Islands,VIR,2004,624022.56
9926,United States Virgin Islands,VIR,2005,611706.44
9927,United States Virgin Islands,VIR,2006,651836.56
9928,United States Virgin Islands,VIR,2007,609783.56


In [8]:
# Destination for the cleaned dataset; written without the index column.
output_path = "../data/interim/energy_use_per_capita_clean.csv"

# to_csv does not create missing folders, so make sure the target directory
# exists first; this keeps the cell working on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Saved cleaned dataset to: {output_path}")


Saved cleaned dataset to: ../data/interim/energy_use_per_capita_clean.csv
