In [3]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split


In [5]:
# Location of the downloaded strategies CSV (a path relative to the working directory)
csv_path = r"data/Top_Emission_Reduction_Strategies.csv"

# Read the whole table into memory, report its row count, then preview the first rows
df = pd.read_csv(csv_path)
print("Rows:", df.shape[0])
df.head()


Rows: 2463130


Unnamed: 0,source_id,source_name,iso3_country,original_inventory_sector,strategy_id,strategy_name,total_emissions_reduced_per_year,gas,strategy_description,difficulty_score
0,3673109.0,Sunndalsora aluminium plant,NOR,aluminum,155353,Recycled feedstock for smelting,779690.045836,co2e_100yr,Use recycled aluminum as an input to reduce th...,1.0
1,3673096.0,Fjardaal aluminium plant,ISL,aluminum,155353,Recycled feedstock for smelting,757801.521906,co2e_100yr,Use recycled aluminum as an input to reduce th...,1.000002
2,3673097.0,Grundartangi aluminium plant,ISL,aluminum,155353,Recycled feedstock for smelting,690873.182364,co2e_100yr,Use recycled aluminum as an input to reduce th...,1.000007
3,3673113.0,Karmoy aluminium plant,NOR,aluminum,155353,Recycled feedstock for smelting,526290.780939,co2e_100yr,Use recycled aluminum as an input to reduce th...,1.000024
4,3673098.0,Straumsvik aluminium plant,ISL,aluminum,155353,Recycled feedstock for smelting,431795.738978,co2e_100yr,Use recycled aluminum as an input to reduce th...,1.000039


In [7]:
# Show every column label of the loaded frame as a plain Python list
print(list(df.columns))


['source_id', 'source_name', 'iso3_country', 'original_inventory_sector', 'strategy_id', 'strategy_name', 'total_emissions_reduced_per_year', 'gas', 'strategy_description', 'difficulty_score']


In [9]:
# Columns carried forward into the prompt/target construction
keep_columns = [
    "source_name",
    "iso3_country",
    "original_inventory_sector",
    "total_emissions_reduced_per_year",
    "difficulty_score",
    "strategy_description",
]

# One pass: project the columns, discard rows without a strategy text, then keep
# only the first row for each (source_name, strategy_description) pair.
# NOTE(review): source_id is not retained, so distinct sources that share a name
# and a description collapse into a single row - confirm this is intended.
df = (
    df[keep_columns]
    .dropna(subset=["strategy_description"])
    .drop_duplicates(subset=["source_name", "strategy_description"])
)

print("After cleaning:", df.shape[0])
df.head()


After cleaning: 1419567


Unnamed: 0,source_name,iso3_country,original_inventory_sector,total_emissions_reduced_per_year,difficulty_score,strategy_description
0,Sunndalsora aluminium plant,NOR,aluminum,779690.045836,1.0,Use recycled aluminum as an input to reduce th...
1,Fjardaal aluminium plant,ISL,aluminum,757801.521906,1.000002,Use recycled aluminum as an input to reduce th...
2,Grundartangi aluminium plant,ISL,aluminum,690873.182364,1.000007,Use recycled aluminum as an input to reduce th...
3,Karmoy aluminium plant,NOR,aluminum,526290.780939,1.000024,Use recycled aluminum as an input to reduce th...
4,Straumsvik aluminium plant,ISL,aluminum,431795.738978,1.000039,Use recycled aluminum as an input to reduce th...


In [11]:
# Scale the yearly reduction by 1e6 so it can be reported in millions of the source unit.
# NOTE(review): the "Mt" label assumes total_emissions_reduced_per_year is in tonnes - confirm.
df["reduction_Mt"] = df["total_emissions_reduced_per_year"] / 1e6

# Bucket the difficulty score into qualitative horizons:
#   score <= 1.5 -> "Short-term", score <= 3 -> "Mid-term", anything larger -> "Long-term".
# NaN compares False against every threshold, so without the explicit isna() branch a
# missing score would fall through to the default and be tagged "Long-term".
# np.select takes the first matching condition, so the order of the list matters.
difficulty_score = df["difficulty_score"]
df["difficulty_level"] = np.select(
    [difficulty_score.isna(), difficulty_score <= 1.5, difficulty_score <= 3],
    ["Unknown", "Short-term", "Mid-term"],
    default="Long-term",
)

df.head()


Unnamed: 0,source_name,iso3_country,original_inventory_sector,total_emissions_reduced_per_year,difficulty_score,strategy_description,reduction_Mt,difficulty_level
0,Sunndalsora aluminium plant,NOR,aluminum,779690.045836,1.0,Use recycled aluminum as an input to reduce th...,0.77969,Short-term
1,Fjardaal aluminium plant,ISL,aluminum,757801.521906,1.000002,Use recycled aluminum as an input to reduce th...,0.757802,Short-term
2,Grundartangi aluminium plant,ISL,aluminum,690873.182364,1.000007,Use recycled aluminum as an input to reduce th...,0.690873,Short-term
3,Karmoy aluminium plant,NOR,aluminum,526290.780939,1.000024,Use recycled aluminum as an input to reduce th...,0.526291,Short-term
4,Straumsvik aluminium plant,ISL,aluminum,431795.738978,1.000039,Use recycled aluminum as an input to reduce th...,0.431796,Short-term


In [13]:
def build_input(row):
    """Render one facility record as a single-line, pipe-delimited prompt.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the keys
            ``source_name``, ``iso3_country``, ``original_inventory_sector``,
            ``reduction_Mt`` (numeric) and ``difficulty_level``.

    Returns:
        str: ``"Facility: ... | Country: ... | Sector: ... | "
        "EmissionReductionPotential(Mt): ... | Difficulty: ..."``.

    Raises:
        KeyError: If ``row`` lacks one of the keys listed above.
    """
    # Six decimals are used because three decimals print every reduction below
    # 0.0005 Mt as "0.000", which erases the feature for small sources.
    return (
        f"Facility: {row['source_name']} | "
        f"Country: {row['iso3_country']} | "
        f"Sector: {row['original_inventory_sector']} | "
        f"EmissionReductionPotential(Mt): {row['reduction_Mt']:.6f} | "
        f"Difficulty: {row['difficulty_level']}"
    )

def build_output(row):
    """Return the record's strategy description with surrounding whitespace removed.

    Args:
        row: Mapping-like record exposing a string under ``strategy_description``.

    Returns:
        str: The description after ``strip()``.
    """
    description = row["strategy_description"]
    return description.strip()

# Attach the prompt ("input") and target ("output") text columns, each computed
# row by row with the builder functions defined in this cell.
df = df.assign(
    input=lambda frame: frame.apply(build_input, axis=1),
    output=lambda frame: frame.apply(build_output, axis=1),
)

# Preview only the two text columns
df.loc[:, ["input", "output"]].head()


Unnamed: 0,input,output
0,Facility: Sunndalsora aluminium plant | Countr...,Use recycled aluminum as an input to reduce th...
1,Facility: Fjardaal aluminium plant | Country: ...,Use recycled aluminum as an input to reduce th...
2,Facility: Grundartangi aluminium plant | Count...,Use recycled aluminum as an input to reduce th...
3,Facility: Karmoy aluminium plant | Country: NO...,Use recycled aluminum as an input to reduce th...
4,Facility: Straumsvik aluminium plant | Country...,Use recycled aluminum as an input to reduce th...


In [21]:
# Number of rows to draw; smaller values such as 1500, 3000 or 5000 also work
N = 50_000

# Fixed seed so the same subset is drawn on every run
df_small = df.sample(N, random_state=42)

print("Sampled rows:", df_small.shape[0])
df_small.head()


Sampled rows: 50000


Unnamed: 0,source_name,iso3_country,original_inventory_sector,total_emissions_reduced_per_year,difficulty_score,strategy_description,reduction_Mt,difficulty_level,input,output
134012,DEU_MatureDairyCattle_24658,DEU,enteric-fermentation-cattle-operation,30.802547,3.377383,Feed additives have been found to deliver 20% ...,3.1e-05,Long-term,Facility: DEU_MatureDairyCattle_24658 | Countr...,Feed additives have been found to deliver 20% ...
1606317,Abramut Commune,ROU,other-energy-use,158.824044,10.0,Reduce emissions by factor of 0.1 via unspecif...,0.000159,Long-term,Facility: Abramut Commune | Country: ROU | Sec...,Reduce emissions by factor of 0.1 via unspecif...
567798,USA_Wisconsin_OtherBeefCattle_22623,USA,manure-management-cattle-operation,11.671609,4.841614,"For non-dairy cattle high productivity, change...",1.2e-05,Long-term,Facility: USA_Wisconsin_OtherBeefCattle_22623 ...,"For non-dairy cattle high productivity, change..."
482021,Pamp치n Municipality,VEN,manure-applied-to-soils,3531.927593,4.431047,Injecting or incorporating manure slurry below...,0.003532,Long-term,Facility: Pamp치n Municipality | Country: VEN |...,Injecting or incorporating manure slurry below...
734982,Kasrouane District,LBN,cropland-fires,322.668169,5.410377,This practice involves the removal of crop res...,0.000323,Long-term,Facility: Kasrouane District | Country: LBN | ...,This practice involves the removal of crop res...


In [23]:
# First cut: 80 % for training, 20 % held out.
# Second cut: the held-out part is halved into validation and test sets.
train_df, holdout_df = train_test_split(df_small, test_size=0.20, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.50, random_state=42)

# Sizes of the three splits, in train / val / test order
print(*(len(part) for part in (train_df, val_df, test_df)))


40000 5000 5000


In [25]:
# Create the output folder if needed; an existing folder is not an error
os.makedirs("processed_data_50k", exist_ok=True)

# Write each split as a two-column CSV (prompt, target) without the index column
for csv_file, split_df in (
    ("processed_data_50k/train.csv", train_df),
    ("processed_data_50k/val.csv", val_df),
    ("processed_data_50k/test.csv", test_df),
):
    split_df[["input", "output"]].to_csv(csv_file, index=False)

print("Saved processed dataset!")


Saved processed dataset!
