## Merge Cleaned Datasets and Deep Cleaning

1. Load cleaned datasets (`IRENA`, `OWID CO₂`, `WGI`)
2. Inspect each dataset for types, missing values, and column consistency
3. Merge datasets step by step using `ISO3` and `year`
4. Handle missing values (fill, drop, or keep as NA)
5. Generate a final combined dataset ready for EDA and visualizations
6. Save the final dataset in the `data/final` folder

In [None]:
import pandas as pd
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.stats import skew # type: ignore
from IPython.display import display, HTML
import warnings

# Silence the warning categories that only add noise to the notebook output
for _noisy_category in (FutureWarning, pd.errors.PerformanceWarning):
    warnings.simplefilter(action='ignore', category=_noisy_category)

In [None]:
## Project setup: configure paths and imports for accessing modules and data files
import sys
from pathlib import Path

# The project root is the parent of the current working directory
project_root = Path.cwd().parent
project_scripts = project_root / "project_scripts"

# Prepend both locations to sys.path (skipping any that are already listed)
# so that modules living there can be imported
for p in (project_root, project_scripts):
    entry = str(p)
    if entry in sys.path:
        continue
    sys.path.insert(0, entry)

In [None]:
# Project Setup and Imports 
from project_scripts import project_path_setup
from project_scripts.data_handler import DataHandler

# Reuse the locations exported by project_path_setup.py
project_root = project_path_setup.project_root
project_scripts = project_path_setup.project_scripts

# Data directories, all located under <project_root>/data
raw_dir, clean_dir, final_dir, sqlite_dir = (
    project_root / "data" / folder
    for folder in ("raw", "cleaned", "final", "sqlite")
)

# Create the output directories if they are missing (raw_dir is not created here)
for d in (clean_dir, final_dir, sqlite_dir):
    d.mkdir(parents=True, exist_ok=True)


In [None]:
# 1. Load datasets
# Each preview goes through display(): a bare `df.head()` is only rendered when
# it is the last expression of a cell, so the IRENA and OWID previews would
# otherwise be dropped without any output.

# Load cleaned IRENA
irena = pd.read_csv(clean_dir / "irena_countries.csv")
print("IRENA shape:", irena.shape)
display(irena.head(2))

# Load cleaned OWID
owid = pd.read_csv(clean_dir / "owid_countries.csv")
print("OWID shape:", owid.shape)
display(owid.head(2))

# Load cleaned WGI
wgi = pd.read_csv(clean_dir / "wgi_countries.csv")
print("WGI shape:", wgi.shape)
display(wgi.head(2))

## Check key columns for merging
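We will merge the datasets on:
- the ISO3 country code → `iso3_code` in IRENA, `iso_code` in OWID, `country_iso` in WGI (renamed to a common `iso` column before merging)
- `year` → numeric year

Check that these columns exist and have no missing values.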

In [None]:
# 2. Inspect datasets
# Print the column names of each dataset as a plain list
for header, frame in (
    ("\nIRENA columns:\n", irena),
    ("\nOWID columns:\n", owid),
    ("\nWGI columns:\n", wgi),
):
    print(header, frame.columns.tolist())


IRENA columns:
 ['region', 'sub-region', 'country', 'iso3_code', 'm49_code', 're_or_non-re', 'group_technology', 'technology', 'sub-technology', 'producer_type', 'year', 'electricity_generation_(gwh)', 'electricity_installed_capacity_(mw)', 'heat_generation_(tj)', 'public_flows_(2022_usd_m)', 'sdg_7a1_intl._public_flows_(2022_usd_m)', 'sdg_7b1_re_capacity_per_capita_(w/inhabitant)']

OWID columns:
 ['country', 'year', 'iso_code', 'population', 'gdp', 'cement_co2', 'cement_co2_per_capita', 'co2', 'co2_growth_abs', 'co2_growth_prct', 'co2_including_luc', 'co2_including_luc_growth_abs', 'co2_including_luc_growth_prct', 'co2_including_luc_per_capita', 'co2_including_luc_per_gdp', 'co2_including_luc_per_unit_energy', 'co2_per_capita', 'co2_per_gdp', 'co2_per_unit_energy', 'coal_co2', 'coal_co2_per_capita', 'consumption_co2', 'consumption_co2_per_capita', 'consumption_co2_per_gdp', 'cumulative_cement_co2', 'cumulative_co2', 'cumulative_co2_including_luc', 'cumulative_coal_co2', 'cumulative_

In [None]:
# Count missing values in the ISO-code and year columns of each dataset
# (printed in the order IRENA, OWID, WGI)
for frame, key_cols in (
    (irena, ['iso3_code', 'year']),
    (owid, ['iso_code', 'year']),
    (wgi, ['country_iso', 'year']),
):
    print(frame[key_cols].isna().sum())

The datasets are clean, aligned, and ready to be merged on ISO code + year.

In [None]:
## 3. Merge
#
# 1. Standardize the ISO column name in the 3 datasets
# 2. Merge `IRENA` + `OWID` on `iso` and `year` (left join)
# 3. Merge the result with `WGI` (left join)
# 4. Keep all IRENA rows; missing OWID or WGI data will be NaN

# Make the ISO column consistent across datasets
irena = irena.rename(columns={'iso3_code': 'iso'})
owid = owid.rename(columns={'iso_code': 'iso'})
wgi = wgi.rename(columns={'country_iso': 'iso'})

# Merge IRENA + OWID
irena_owid = pd.merge(
    irena,
    owid,
    on=['iso', 'year'],
    how='left',
    suffixes=('_irena', '_owid')
)
print("IRENA + OWID shape:", irena_owid.shape)
# A left join adds rows whenever the right table holds several rows for one
# (iso, year) key. Report that instead of letting the duplication go unnoticed.
if len(irena_owid) != len(irena):
    print(
        f"WARNING: IRENA + OWID merge changed the row count "
        f"({len(irena)} -> {len(irena_owid)}); OWID has repeated (iso, year) keys"
    )

# Merge with WGI
final_df = pd.merge(
    irena_owid,
    wgi,
    on=['iso', 'year'],
    how='left',
    suffixes=('', '_wgi')
)
print("Final merged shape:", final_df.shape)
if len(final_df) != len(irena_owid):
    print(
        f"WARNING: WGI merge changed the row count "
        f"({len(irena_owid)} -> {len(final_df)}); WGI has repeated (iso, year) keys"
    )

# Save the merged dataset (before type conversion and imputation).
# Written through final_dir so the destination does not depend on the
# kernel's current working directory.
final_df.to_csv(final_dir / "final_countries.csv", index=False)
final_df.head(3)

In [None]:
# 4. Check Data Types
# Print the dtype table through to_string() so the full table is shown
dtype_overview = final_df.dtypes.to_frame('dtype')
print(dtype_overview.to_string())

Type Conversion Summary
1. Many numeric columns (e.g., population, gdp, co2) are stored as object → they need conversion to numeric for analysis.
2. `year` is currently int64 → it can be kept as is or converted to datetime for time-series operations.
3. Columns like region, country names, and ISO codes are object → convert them to category to save memory.

Actions Taken
1. Convert numeric columns stored as object → float64/int64.
2. Convert `year` → datetime.
3. Convert categorical columns (region, iso, country names, etc.) → category.

Benefits
1. Enables proper numeric calculations and aggregations.
2. Optimizes memory usage.
3. Prepares the dataset for time-series and categorical analyses.

In [None]:
# 1. Convert numeric columns stored as object to numeric
# Columns that should be numeric but may have been read as object
numeric_cols = [
    'population', 'gdp', 'cement_co2', 'co2', 'co2_per_capita', 'coal_co2', 'oil_co2',
    'gas_co2', 'methane', 'nitrous_oxide', 'primary_energy_consumption',
    'electricity_generation_(gwh)', 'electricity_installed_capacity_(mw)',
    'heat_generation_(tj)', 'public_flows_(2022_usd_m)',
    'sdg_7a1_intl._public_flows_(2022_usd_m)',
    'sdg_7b1_re_capacity_per_capita_(w/inhabitant)',
    'temperature_change_from_co2', 'temperature_change_from_ch4',
    'temperature_change_from_ghg', 'total_ghg', 'total_ghg_excluding_lucf',
    'scalemean', 'scalesd'
]

# errors='coerce' turns every unparseable value into NaN. Report how many
# values each column lost so that bad input does not disappear unnoticed.
for col in numeric_cols:
    if col in final_df.columns:
        converted = pd.to_numeric(final_df[col], errors='coerce')
        n_coerced = int(converted.isna().sum() - final_df[col].isna().sum())
        if n_coerced:
            print(f"{col}: {n_coerced} non-numeric value(s) coerced to NaN")
        final_df[col] = converted

# 2. Convert the year column to datetime for time-series operations
# (a four-digit year parsed with '%Y' becomes 1 January of that year)
final_df['year'] = pd.to_datetime(final_df['year'], format='%Y')

# 3. Convert ISO codes, country names, and categorical labels to category.
# Names that are not present in final_df are skipped.
cat_cols = [
    'region', 'sub-region', 'country_irena', 'iso', 're_or_non-re',
    'group_technology', 'technology', 'sub-technology', 'producer_type',
    'country_owid', 'country_iso'
]

for col in cat_cols:
    if col in final_df.columns:
        final_df[col] = final_df[col].astype('category')

print(final_df.dtypes.to_frame('dtype').to_string())

## 5. Check Missing Values

- Some countries or years may be missing OWID or WGI data
- Decide on imputation or leave as NA
- For numeric columns we can fill with 0 or median (depends on context)

In [None]:
# Number of missing values per column, largest first
missing_counts = final_df.isnull().sum().sort_values(ascending=False)
# Show the 20 columns with the most missing values
missing_counts.iloc[:20]

In [None]:
# Distinct ISO codes per dataset, used to measure the country overlap
irena_isos, owid_isos, wgi_isos = (
    set(frame['iso'].unique()) for frame in (irena, owid, wgi)
)

# IRENA codes that OWID does not contain
missing_in_owid = irena_isos.difference(owid_isos)
print("ISOs in IRENA but missing in OWID:", missing_in_owid)

# IRENA codes that WGI does not contain
missing_in_wgi = irena_isos.difference(wgi_isos)
print("ISOs in IRENA but missing in WGI:", missing_in_wgi)

# Codes shared by all three datasets
common_isos = irena_isos.intersection(owid_isos, wgi_isos)
print("Countries present in all three datasets:", len(common_isos))


In [None]:
# Year coverage of each dataset
for header, frame in (
    ("IRENA years:", irena),
    ("OWID years:", owid),
    ("WGI years:", wgi),
):
    print(header, sorted(frame['year'].unique()))

# Summary table: row count and number of distinct ISO codes per dataset
summary = pd.DataFrame({
    'dataset': ['IRENA', 'OWID', 'WGI'],
    'num_rows': [frame.shape[0] for frame in (irena, owid, wgi)],
    'num_iso': [len(codes) for codes in (irena_isos, owid_isos, wgi_isos)],
})
display(summary)

#### 1 ISO standardization check
* `Countries present in all three datasets: 178` 
* Many ISO codes in IRENA are missing from OWID and WGI (e.g. `MTQ`, `XLA`, `PRI` …).
* This is expected: some codes are non-sovereign territories (e.g., `GLP`, `MYT`), some are deprecated codes or placeholders (`XLA`, `XOC`), and some countries are simply not present in all datasets.
#### 2 Year coverage
* IRENA: 2000–2024
* OWID: 1750–2024 (huge historical coverage)
* WGI: 1996–2023
* This mismatch in years explains why many columns in the final merged dataset have **NaNs**.
#### 3 Summary table
* IRENA: 235 ISOs
* OWID: 218 ISOs
* WGI: 189 ISOs
* Only a subset of countries overlap fully → final merged dataset will have missing values for some countries and years.


In [None]:
# Analysing the heat_generation_(tj) column

# Overall distribution. display() is needed because a bare expression in the
# middle of a cell is not rendered.
display(final_df['heat_generation_(tj)'].describe())

# Number of rows with a missing value, per country
missing_by_country = (
    final_df[final_df['heat_generation_(tj)'].isna()]
    .groupby('iso')
    .size()
    .sort_values(ascending=False)
)
print(missing_by_country)

# Calculate median heat generation per country
median_by_country = final_df.groupby('iso')['heat_generation_(tj)'].median()

# View top 10
print(median_by_country.sort_values(ascending=False).head(10))

# Select the column
heat = final_df['heat_generation_(tj)']

# Drop missing values temporarily for analysis
heat_nonan = heat.dropna()

# Calculate basic stats
mean_val = heat_nonan.mean()
median_val = heat_nonan.median()
std_val = heat_nonan.std()
skewness = skew(heat_nonan)

print(f"Mean: {mean_val:.2f}")
print(f"Median: {median_val:.2f}")
print(f"Std Dev: {std_val:.2f}")
print(f"Skewness: {skewness:.2f}")


def _count_missing(values):
    """Return the number of NaN entries in *values*."""
    return values.isna().sum()


def _skew_ignoring_nan(values):
    """Return the skewness of *values* after dropping NaN entries."""
    return skew(values.dropna())


# Per-country statistics. The missing-value count is computed inside the same
# aggregation, so it cannot get out of step with the other columns (it used to
# be filled with the group size and then overwritten by position).
country_stats = final_df.groupby('iso')['heat_generation_(tj)'].agg(
    count='count',
    missing=_count_missing,
    mean='mean',
    median='median',
    std='std',
    skew=_skew_ignoring_nan,
).reset_index()

print(country_stats.head(10))


In [None]:
# Step 0: Strip stray whitespace from the column names
final_df.columns = final_df.columns.str.strip()

# Step 1: Check missing before
print("Missing before:", final_df['heat_generation_(tj)'].isna().sum())

# Step 2: Impute per country with that country's median; if every value of a
# country is NaN there is nothing to impute from, so the values stay NaN
final_df['heat_generation_(tj)'] = final_df.groupby('iso')['heat_generation_(tj)'].transform(
    lambda x: x.fillna(x.median()) if not x.isna().all() else x
)

# Step 3: Check missing after
print("Missing after:", final_df['heat_generation_(tj)'].isna().sum())

# View top 10 country medians. The medians are recomputed here so this cell
# does not depend on a variable created by another cell, and the output is
# limited to the ten largest values as the heading says.
median_by_country = final_df.groupby('iso')['heat_generation_(tj)'].median()
print(median_by_country.sort_values(ascending=False).head(10))

Handling Missing Values in Energy & Socioeconomic Data
1. Energy Generation Columns
`heat_generation_(tj)`, `electricity_generation_(gwh)`, `electricity_installed_capacity_(mw)`
Fill missing values per country using the country median. If all values for a country are missing, leave them as NaN.

2. Population & GDP Columns
`population`, `gdp`
Fill missing values by interpolating along time within the same country. Avoid filling with 0, which would distort per-capita metrics.

3. Derived / Aggregate Columns
`co2_per_capita`, `sdg_7b1_re_capacity_per_capita`, `trade_co2_share`, etc.
Recalculate them after filling the raw data, or fill missing values using the country-wise median if necessary.

In [None]:
# Code Imputation
# Part 1: Energy Generation Columns
# Ensure column names are clean
final_df.columns = final_df.columns.str.strip()

energy_cols = [
    'heat_generation_(tj)',
    'electricity_generation_(gwh)',
    'electricity_installed_capacity_(mw)'
]

# Impute missing values per country using the median of all of that country's
# rows; a country whose values are all missing stays NaN
for col in energy_cols:
    if col in final_df.columns:
        final_df[col] = final_df.groupby('iso')[col].transform(
            lambda x: x.fillna(x.median()) if not x.isna().all() else x
        )

# Check missing after (only for columns that exist, matching the guard above)
print("Missing values after imputing energy columns:")
print(final_df[[c for c in energy_cols if c in final_df.columns]].isna().sum())

# Part 2: Population & GDP Columns
pop_gdp_cols = ['population', 'gdp']
key_cols = ['iso', 'year']

# Interpolate along time per country.
# final_df can hold several rows per (iso, year) and is not ordered by year, so
# interpolating over the raw rows would follow row order, not time. Instead:
#   1. reduce to one row per (iso, year), sorted by year
#      (assumes the value is the same on every row of a key - verify if the
#      OWID input can contain repeated keys),
#   2. interpolate that yearly series within each country,
#   3. copy the result back to every row of the key, filling only the gaps.
for col in pop_gdp_cols:
    if col in final_df.columns:
        yearly = (
            final_df[key_cols + [col]]
            .drop_duplicates(subset=key_cols)
            .sort_values(key_cols)
        )
        yearly[col] = yearly.groupby('iso')[col].transform(
            lambda x: x.interpolate(method='linear', limit_direction='both')
        )
        # The left merge keeps the row order of final_df, so the values can be
        # re-attached to final_df's index by position
        interpolated = final_df[key_cols].merge(yearly, on=key_cols, how='left')[col]
        final_df[col] = final_df[col].fillna(
            pd.Series(interpolated.to_numpy(), index=final_df.index)
        )

# Check missing after
print("Missing values after imputing population & GDP:")
print(final_df[[c for c in pop_gdp_cols if c in final_df.columns]].isna().sum())

In [None]:
# Impute heat_generation_(tj) from the electricity columns
# Energy columns
installed_col = 'electricity_installed_capacity_(mw)'
generation_col = 'electricity_generation_(gwh)'
heat_col = 'heat_generation_(tj)'

# Unit conversion: 1 GWh = 3.6 TJ. Needed because the source column is in GWh
# and the target column is in TJ.
GWH_TO_TJ = 3.6
# Placeholder assumption: heat is taken as 1% of electricity generation
HEAT_PROXY_FRACTION = 0.01

# Rows where heat generation is missing but some electricity data exists
mask = final_df[heat_col].isna() & (
    final_df[installed_col].notna() | final_df[generation_col].notna()
)

# Fill with a small fraction of electricity generation, expressed in TJ.
# Rows that only have installed capacity (generation missing) receive 0.
final_df.loc[mask, heat_col] = (
    final_df.loc[mask, generation_col].fillna(0) * GWH_TO_TJ * HEAT_PROXY_FRACTION
)

print("Missing heat_generation_(tj) after related-column imputation:", 
      final_df[heat_col].isna().sum())

In [None]:
# Part 3: Derived / Aggregate Columns
# ==============================
derived_cols = [
    'co2_per_capita', 
    'sdg_7b1_re_capacity_per_capita_(w/inhabitant)',
    'trade_co2_share',
    'cumulative_other_co2'
]

# Work only with the columns that exist. Indexing final_df with a missing name
# would raise KeyError in the summary prints below.
present_derived_cols = [col for col in derived_cols if col in final_df.columns]
absent_derived_cols = [col for col in derived_cols if col not in final_df.columns]
if absent_derived_cols:
    print("Derived columns not found in final_df (skipped):", absent_derived_cols)

print("Missing values before imputing derived columns:")
print(final_df[present_derived_cols].isna().sum())

for col in present_derived_cols:
    # Impute with the per-country median; a country whose values are all
    # missing stays NaN
    final_df[col] = final_df.groupby('iso')[col].transform(
        lambda x: x.fillna(x.median()) if not x.isna().all() else x
    )

print("Missing values after imputing derived columns:")
print(final_df[present_derived_cols].isna().sum())

In [None]:
## 6. Quick Descriptive Stats
# Compute the transposed describe() table once and reuse it for both the HTML
# rendering and the CSV export
describe_summary = final_df.describe(include='all').T

# Render the full table without row/column truncation
display(HTML(describe_summary.to_html(max_rows=None, max_cols=None)))

# Save to a CSV file (path is relative to the current working directory)
describe_summary.to_csv('describe_summary_final_merged_df.csv')

In [None]:
## 6. Save final merged dataset
# Destination file inside the final data directory
final_path = final_dir.joinpath("final_combined.csv")
# Write without the index column
final_df.to_csv(path_or_buf=final_path, index=False)
print("Saved final dataset:", final_path)