# Load raw datasets and combine them

In [102]:
#imports
import pandas as pd
from pathlib import Path
import sys, os
import warnings
import importlib

import scripts.data_handler
importlib.reload(scripts.data_handler)
from scripts.data_handler import DataHandler


In [109]:
#Set paths
raw_dir = Path("../data/raw")
clean_dir = Path("../data/cleaned")
clean_dir.mkdir(exist_ok=True)

In [90]:
importlib.reload(scripts.data_handler) #reloaded the scripts.data_handler as it was picking up wrong one and throwing error
# Convert IRENA Excel sheets to CSV 
# Create a handler (empty filepaths is fine for Excel conversion)
handler = DataHandler(filepath_list=[])
# excel_path=raw_dir / "IRENA_renewable_energy_data.xlsx"
# xls = pd.ExcelFile(excel_path)
# print("Sheets in Excel file:",xls.sheet_names)

# Convert selected sheets to CSV
handler.excel_to_csv(
    excel_path=raw_dir / "IRENA_renewable_energy_data.xlsx",
    output_dir=raw_dir,
    sheets=["Pivot", "Country", "Region ", "Global"],
    prefix="irena"
)

Called excel_to_csv with excel_path=WindowsPath('../data/raw/IRENA_renewable_energy_data.xlsx'), output_dir=WindowsPath('../data/raw'), sheets=['Pivot', 'Country', 'Region ', 'Global'], prefix='irena'
Saved ..\data\raw\irena_pivot.csv
Saved ..\data\raw\irena_country.csv
Saved ..\data\raw\irena_region_.csv
Saved ..\data\raw\irena_global.csv


In [91]:
importlib.reload(scripts.data_handler)
#Convert WGI Excel file to CSV
handler = DataHandler(filepath_list=[])
# Path to your WGI Excel file
wgi_excel_path = raw_dir / "wgi_dataset.xlsx"

# Convert all sheets to CSV (or specify sheets)
handler.excel_to_csv(
    excel_path=wgi_excel_path,
    output_dir=raw_dir,
    sheets=None,  # None = all sheets will be converted
    prefix="wgi"
)

Called excel_to_csv with excel_path=WindowsPath('../data/raw/wgi_dataset.xlsx'), output_dir=WindowsPath('../data/raw'), sheets=None, prefix='wgi'
Saved ..\data\raw\wgi_sheet1.csv


Analyse the csv data files

In [None]:
# Path to raw data folder
raw_dir = Path("data/raw/")

# List all CSV files using glob() method that searches for files in that folder matching a pattern
csv_files = list(raw_dir.glob("*.csv"))

print("Found CSV files:")
for f in csv_files:
    print("-", f.name)

handler = DataHandler(filepath_list=csv_files)

#Load and Inspect Each CSV
# -------------------------------
for csv_file in csv_files:
    df = handler.load_file(csv_file)  # load the CSV
    if df is not None:
        print(f"\nDataset: {csv_file.name}")
        print("Shape:", df.shape)
        print("Columns:", df.columns.tolist())
        print(df.head(3))  # first 3 rows
    else:
        print(f"Skipped {csv_file.name}")

Found CSV files:


In [None]:
# Path to raw data folder
raw_dir = Path("data/raw/")

# List all CSV files
raw_dir = Path("../data/raw/")
csv_files = list(raw_dir.glob("*.csv"))

print("Found CSV files:")
for f in csv_files:
    print("-", f.name)

# Initialize handler
handler = DataHandler(filepath_list=csv_files)

# Load and Inspect Each CSV
for csv_file in csv_files:
    try:
        df = handler.load_file(csv_file)  # load the CSV
        warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
        if df is not None and not df.empty:
            print(f"\nDataset: {csv_file.name}")
            print("Shape:", df.shape)
            print("Columns:", df.columns.tolist())
            #print("Column Info:", df.info())
            #print("Describe Statistics:", df.describe())
            #print(df.head(3))  # first 3 rows
        else:
            print(f"Skipped {csv_file.name} (empty or failed to load)")
    except Exception as e:
        print(f"Error loading {csv_file.name}: {e}")

# Load, Inspect, Clean & Save Raw Datasets

In this notebook we work with 3 key datasets:

1. IRENA Renewable Energy (Country Level)
2. OWID CO₂ Emissions
3. WGI World Governance Indicators

For each dataset we will:
- Load it using DataHandler
- Inspect its structure
- Clean the column names and convert data types
- Generate ISO3 country codes
- Check missing values
- Save the cleaned dataset to `data/cleaned/`

## 1. IRENA Renewable Energy Dataset (Country Level)

This dataset contains:
- renewable energy capacity  
- electricity generation  
- heat generation  
- financial flows  
- for every country and energy technology

In [110]:
irena_path = raw_dir / "irena_country.csv"

handler = DataHandler(filepath_list=[])
irena_raw = handler.load_file(irena_path)

print("IRENA Shape:", irena_raw.shape)
print("Columns:", irena_raw.columns.tolist())
irena_raw.head(3)

IRENA Shape: (91743, 17)
Columns: ['Region', 'Sub-region', 'Country', 'ISO3 code', 'M49 code', 'RE or Non-RE', 'Group Technology', 'Technology', 'Sub-Technology', 'Producer Type', 'Year', 'Electricity Generation (GWh)', 'Electricity Installed Capacity (MW)', 'Heat Generation (TJ)', 'Public Flows (2022 USD M)', 'SDG 7a1 Intl. Public Flows (2022 USD M)', 'SDG 7b1 RE capacity per capita (W/inhabitant)']


Unnamed: 0,Region,Sub-region,Country,ISO3 code,M49 code,RE or Non-RE,Group Technology,Technology,Sub-Technology,Producer Type,Year,Electricity Generation (GWh),Electricity Installed Capacity (MW),Heat Generation (TJ),Public Flows (2022 USD M),SDG 7a1 Intl. Public Flows (2022 USD M),SDG 7b1 RE capacity per capita (W/inhabitant)
0,Africa,Northern Africa,Algeria,DZA,12,Total Renewable,Bioenergy,Solid biofuels,Other primary solid biofuels n.e.s.,All types,2000,,,,,,0.0
1,Africa,Northern Africa,Algeria,DZA,12,Total Renewable,Bioenergy,Solid biofuels,Other primary solid biofuels n.e.s.,All types,2001,,,,,,0.0
2,Africa,Northern Africa,Algeria,DZA,12,Total Renewable,Bioenergy,Solid biofuels,Other primary solid biofuels n.e.s.,All types,2002,,,,,,0.0


### 2. Clean IRENA Dataset
- standardize column names to lowercase_underscore
- convert the `year` column to numeric
- create ISO3 country codes
- remove duplicates

In [114]:
irena_handler = DataHandler(
    filepath_list=[],
    country_col="Country",   # original column name (case-sensitive)
    year_col="Year"
)

irena_handler.df = irena_raw.copy()

irena_clean = irena_handler.clean_data()
irena_clean.head(3)

Data cleaned: shape (91743, 17)


Unnamed: 0,region,sub-region,country,iso3_code,m49_code,re_or_non-re,group_technology,technology,sub-technology,producer_type,year,electricity_generation_(gwh),electricity_installed_capacity_(mw),heat_generation_(tj),public_flows_(2022_usd_m),sdg_7a1_intl._public_flows_(2022_usd_m),sdg_7b1_re_capacity_per_capita_(w/inhabitant)
0,Africa,Northern Africa,Algeria,DZA,12,Total Renewable,Bioenergy,Solid biofuels,Other primary solid biofuels n.e.s.,All types,2000,,,,,,0.0
1,Africa,Northern Africa,Algeria,DZA,12,Total Renewable,Bioenergy,Solid biofuels,Other primary solid biofuels n.e.s.,All types,2001,,,,,,0.0
2,Africa,Northern Africa,Algeria,DZA,12,Total Renewable,Bioenergy,Solid biofuels,Other primary solid biofuels n.e.s.,All types,2002,,,,,,0.0


In [116]:
### 3. Missing Values (IRENA)
print("Shape" , irena_clean.shape)
irena_clean.isna().sum().sort_values(ascending=False).head(10)

Shape (91743, 17)


sdg_7a1_intl._public_flows_(2022_usd_m)          86656
public_flows_(2022_usd_m)                        83314
heat_generation_(tj)                             80575
sdg_7b1_re_capacity_per_capita_(w/inhabitant)    59631
electricity_generation_(gwh)                     54659
electricity_installed_capacity_(mw)              53356
region                                               0
sub-region                                           0
country                                              0
sub-technology                                       0
dtype: int64

1.Funding Columns(SDG & Public) → Replace missing/NaN with 0 as No data = No funding.
2. Heat & Electricity Columns → Keep missing as NaN as Unknown energy data ≠ zero
it just means not reported

In [None]:
# Cleaning Missing Values for IRENA Country Dataset

# Funding columns → fill missing with 0
funding_cols = [
    col for col in irena_clean.columns
    if "flows" in col.lower()  # matches both public_flows and sdg_7a1 flows
]

irena_clean[funding_cols] = irena_clean[funding_cols].fillna(0)

# Heat generation, electricity, capacity → keep NaN
# leave them untouched for correct analysis

# Print summary after cleaning
print("\nMissing values AFTER cleaning:")
print(irena_clean.isna().sum().sort_values(ascending=False).head(10))

print("\nMissing values handled safely and correctly.")



Missing values AFTER cleaning:
heat_generation_(tj)                             80575
sdg_7b1_re_capacity_per_capita_(w/inhabitant)    59631
electricity_generation_(gwh)                     54659
electricity_installed_capacity_(mw)              53356
region                                               0
sub-region                                           0
country                                              0
iso3_code                                            0
m49_code                                             0
sub-technology                                       0
dtype: int64

Missing values handled safely and correctly.


In [118]:
#Save Clean IRENA
irena_clean_path = clean_dir / "cleaned_irena.csv"
irena_clean.to_csv(irena_clean_path, index=False)

print("Saved:", irena_clean_path)

Saved: ..\data\cleaned\cleaned_irena.csv


## 2. OWID CO₂ Emissions Dataset

This dataset contains:
- CO₂ emissions by sector  
- energy consumption  
- greenhouse gases  
- population & GDP  
- temperature impact  
- for all countries and years

In [None]:
#Load OWID data
owid_path = raw_dir / "owid_co2_data.csv"

handler = DataHandler(filepath_list=[])
owid_raw = handler.load_file(owid_path)

print("OWID Shape:", owid_raw.shape)
print("Columns:", owid_raw.columns.tolist())
owid_raw.head(3)

OWID Shape: (50407, 79)
Columns: ['country', 'year', 'iso_code', 'population', 'gdp', 'cement_co2', 'cement_co2_per_capita', 'co2', 'co2_growth_abs', 'co2_growth_prct', 'co2_including_luc', 'co2_including_luc_growth_abs', 'co2_including_luc_growth_prct', 'co2_including_luc_per_capita', 'co2_including_luc_per_gdp', 'co2_including_luc_per_unit_energy', 'co2_per_capita', 'co2_per_gdp', 'co2_per_unit_energy', 'coal_co2', 'coal_co2_per_capita', 'consumption_co2', 'consumption_co2_per_capita', 'consumption_co2_per_gdp', 'cumulative_cement_co2', 'cumulative_co2', 'cumulative_co2_including_luc', 'cumulative_coal_co2', 'cumulative_flaring_co2', 'cumulative_gas_co2', 'cumulative_luc_co2', 'cumulative_oil_co2', 'cumulative_other_co2', 'energy_per_capita', 'energy_per_gdp', 'flaring_co2', 'flaring_co2_per_capita', 'gas_co2', 'gas_co2_per_capita', 'ghg_excluding_lucf_per_capita', 'ghg_per_capita', 'land_use_change_co2', 'land_use_change_co2_per_capita', 'methane', 'methane_per_capita', 'nitrous_oxi

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1750,AFG,2802560.0,,0.0,0.0,,,,...,,,,,,,,,,
1,Afghanistan,1751,AFG,,,0.0,,,,,...,,,,,,,,,,
2,Afghanistan,1752,AFG,,,0.0,,,,,...,,,,,,,,,,


### Clean OWID Dataset

OWID already has:`country`,`year`,`iso_code`

But we still:
- clean column names  
- convert year to numeric  
- add `country_iso` (for cross-dataset consistency)

In [121]:
owid_handler = DataHandler(
    filepath_list=[],
    country_col="country",
    year_col="year"
)

owid_handler.df = owid_raw.copy()
owid_clean = owid_handler.clean_data()

owid_clean.head(3)

Data cleaned: shape (50407, 80)


Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share,country_iso
0,Afghanistan,1750,AFG,2802560.0,,0.0,0.0,,,,...,,,,,,,,,,AFG
1,Afghanistan,1751,AFG,,,0.0,,,,,...,,,,,,,,,,AFG
2,Afghanistan,1752,AFG,,,0.0,,,,,...,,,,,,,,,,AFG


In [123]:
#### Missing Values (OWID)
print("Shape",owid_clean.shape)
owid_clean.isna().sum().sort_values(ascending=False).head(10)

Shape (50407, 80)


share_global_other_co2               48237
share_global_cumulative_other_co2    48237
other_co2_per_capita                 47748
cumulative_other_co2                 47153
other_industry_co2                   47153
consumption_co2_per_gdp              45959
consumption_co2_per_capita           45764
trade_co2_share                      45695
trade_co2                            45695
consumption_co2                      45354
dtype: int64

1.OWID dataset contains countries + regions/aggregates.
Regions/aggregates include: World, Asia, Europe, High-income, OECD, etc.

2.Keep missing values as NaN for all emissions/consumption metrics.

3.Separate datasets:
owid_countries → countries with ISO3 codes (for merging with IRENA/WGI)
owid_regions → aggregates for regional/global analyses

In [None]:
# Cleaning OWID CO2 Dataset & Separating Countries/Regions


# Separate regions/aggregates (no ISO code)
owid_regions = owid_clean[owid_clean['iso_code'].isna()].copy()

# Keep only countries (with ISO code) for merges
owid_countries = owid_clean[owid_clean['iso_code'].notna()].copy()

# Convert year to numeric (helps SQL and merges)
owid_countries['year'] = pd.to_numeric(owid_countries['year'], errors='coerce')
owid_regions['year'] = pd.to_numeric(owid_regions['year'], errors='coerce')

# Missing values: Keep NaN for all emissions, trade, consumption columns
#    Do NOT fill with 0; missing = not reported

# Quick check
print("\nOWID countries shape:", owid_countries.shape)
print("OWID regions shape:", owid_regions.shape)
print("Example regions/aggregates in owid_regions:")
print(owid_regions['country'].unique())

print("\nOWID dataset cleaned and separated successfully.")


OWID countries shape: (42480, 80)
OWID regions shape: (7927, 80)
Example regions/aggregates in owid_regions:
['Africa' 'Africa (GCP)' 'Asia' 'Asia (GCP)'
 'Asia (excl. China and India)' 'Central America (GCP)' 'Europe'
 'Europe (GCP)' 'Europe (excl. EU-27)' 'Europe (excl. EU-28)'
 'European Union (27)' 'European Union (28)' 'High-income countries'
 'International aviation' 'International shipping' 'Kosovo'
 'Kuwaiti Oil Fires' 'Kuwaiti Oil Fires (GCP)'
 'Least developed countries (Jones et al.)' 'Low-income countries'
 'Lower-middle-income countries' 'Middle East (GCP)' 'Non-OECD (GCP)'
 'North America' 'North America (GCP)' 'North America (excl. USA)'
 'OECD (GCP)' 'OECD (Jones et al.)' 'Oceania' 'Oceania (GCP)'
 'Ryukyu Islands' 'Ryukyu Islands (GCP)' 'South America'
 'South America (GCP)' 'Upper-middle-income countries' 'World']

OWID dataset cleaned and separated successfully.


In [128]:
###  Save Cleaned OWID Dataset
owid_countries_path = clean_dir / "owid_countries.csv"
owid_regions_path   = clean_dir / "owid_regions.csv"

# Save datasets
owid_countries.to_csv(owid_countries_path, index=False)
owid_regions.to_csv(owid_regions_path, index=False)

print("Saved OWID countries (with ISO3) to:", owid_countries_path)
print("Saved OWID regions/aggregates to:", owid_regions_path)

Saved OWID countries (with ISO3) to: ..\data\cleaned\owid_countries.csv
Saved OWID regions/aggregates to: ..\data\cleaned\owid_regions.csv


## 3. WGI Governance Dataset

The World Governance Indicators dataset contains:
- government effectiveness  
- rule of law  
- voice & accountability  
- regulatory quality  
- corruption indicators  
- for every country and year

- It uses `countryname` instead of `country`, which we must rename.

In [125]:
wgi_path = raw_dir / "wgi_sheet1.csv"

handler = DataHandler(filepath_list=[])
wgi_raw = handler.load_file(wgi_path)

print("WGI Shape:", wgi_raw.shape)
print("Columns:", wgi_raw.columns.tolist()[:15])
wgi_raw.head(3)

WGI Shape: (32100, 48)
Columns: ['codeindyr', 'code', 'countryname', 'year', 'indicator', 'estimate', 'stddev', 'nsource', 'pctrank', 'pctranklower', 'pctrankupper', 'adb', 'afr', 'asd', 'bps']


Unnamed: 0,codeindyr,code,countryname,year,indicator,estimate,stddev,nsource,pctrank,pctranklower,...,rsf,tpr,vab,vdm,wbs,wcy,wjp,wmo,scalemean,scalesd
0,AFGcc1996,AFG,Afghanistan,1996,cc,-1.291704773902893,0.3405069708824157,2,4.301075458526611,0.0,...,..,..,..,0.2950838125720781,..,..,..,0.0,0.013374,0.93648
1,ALBcc1996,ALB,Albania,1996,cc,-0.8939034938812256,0.3159140348434448,3,19.35483932495117,2.6881721019744877,...,..,..,..,0.315589909591906,..,..,..,0.25,0.013374,0.93648
2,DZAcc1996,DZA,Algeria,1996,cc,-0.5667409300804138,0.262076586484909,4,33.33333206176758,16.66666603088379,...,..,..,..,0.36883168576648,..,..,..,0.25,0.013374,0.93648


In [126]:
###  Prepare WGI for Cleaning
#Rename `countryname` → `country`
wgi_fixed = wgi_raw.rename(columns={"countryname": "country"})
wgi_handler = DataHandler(
    filepath_list=[],
    country_col="country",
    year_col="year"
)

wgi_handler.df = wgi_fixed.copy()
wgi_clean = wgi_handler.clean_data()

Data cleaned: shape (32100, 49)


In [129]:
###  Missing Values (WGI)
print("Shape",wgi_clean.shape)
wgi_clean.isna().sum().sort_values(ascending=False).head(10)

Shape (32100, 49)


country_iso     3750
code               0
country            0
year               0
indicator          0
estimate           0
stddev             0
nsource            0
pctrank            0
pctranklower       0
dtype: int64

The World Governance Indicators (WGI) dataset has multiple governance indicators for each country per year.
The only missing values are usually in country_iso (ISO3 code), while all other columns like year, indicator, estimate are mostly complete.
wgi_countries.csv → Only rows with valid ISO3 codes; safe for merging with IRENA/OWID.
wgi_missing_iso.csv → Rows without ISO3 codes; may be small territories or naming mismatches.

In [130]:
# Cleaning Missing Values for WGI

# create a separate dataframe for rows missing ISO codes
wgi_missing_iso = wgi_clean[wgi_clean['country_iso'].isna()].copy()

# Keep rows with valid ISO3 codes for merging
wgi_countries = wgi_clean[wgi_clean['country_iso'].notna()].copy()

# Quick check
print("\nWGI countries with ISO3 codes:", wgi_countries.shape)
print("WGI rows missing ISO3:", wgi_missing_iso.shape)
print("Example missing ISO3 rows:")
print(wgi_missing_iso[['country', 'year', 'indicator']].head())

# Save datasets
wgi_countries_path = clean_dir / "wgi_countries.csv"
wgi_missing_iso_path = clean_dir / "wgi_missing_iso.csv"

wgi_countries.to_csv(wgi_countries_path, index=False)
wgi_missing_iso.to_csv(wgi_missing_iso_path, index=False)

print("\nSaved WGI datasets:")
print("- Countries:", wgi_countries_path)
print("- Missing ISO3:", wgi_missing_iso_path)


WGI countries with ISO3 codes: (28350, 49)
WGI rows missing ISO3: (3750, 49)
Example missing ISO3 rows:
             country  year indicator
14      Bahamas, The  1996        cc
35        Cape Verde  1996        cc
43  Congo, Dem. Rep.  1996        cc
44       Congo, Rep.  1996        cc
57  Egypt, Arab Rep.  1996        cc

Saved WGI datasets:
- Countries: ..\data\cleaned\wgi_countries.csv
- Missing ISO3: ..\data\cleaned\wgi_missing_iso.csv


In [131]:
# Paths to save cleaned WGI datasets
wgi_countries_path     = clean_dir / "wgi_countries.csv"
wgi_missing_iso_path   = clean_dir / "wgi_missing_iso.csv"

# Save datasets
wgi_countries.to_csv(wgi_countries_path, index=False)
wgi_missing_iso.to_csv(wgi_missing_iso_path, index=False)

print("Saved WGI countries (with ISO3) to:", wgi_countries_path)
print("Saved WGI rows missing ISO3 to:", wgi_missing_iso_path)

Saved WGI countries (with ISO3) to: ..\data\cleaned\wgi_countries.csv
Saved WGI rows missing ISO3 to: ..\data\cleaned\wgi_missing_iso.csv
