In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import eurostat
import matplotlib.pyplot as plt
import plotly.express as px
import pycountry
import pickle
import networkx as nx
import os

# Apply the "ggplot" style sheet to the matplotlib figures of this notebook.
plt.style.use("ggplot")

# Load the jupyter_black IPython extension
# (presumably reformats code cells with Black - TODO confirm).
%load_ext jupyter_black

# Explore dataset

In [2]:
# Fetch the Eurostat table of contents as a DataFrame and preview its first rows
# (columns include the title, the dataset code and the covered period).
df_dataset = eurostat.get_toc_df()
df_dataset.head()

Unnamed: 0,title,code,type,last update of data,last table structure change,data start,data end
0,Gross weight of goods transported to/from main...,MAR_GO_AM_TR,dataset,2011-04-11T23:00:00+0200,2023-07-27T23:00:00+0200,2008,2022
1,Gross weight of goods transported to/from main...,MAR_GO_AM_UK,dataset,2010-04-22T11:00:00+0200,2023-11-22T11:00:00+0100,2000,2019
2,Gross weight of goods transported to/from main...,MAR_GO_QM,dataset,2024-05-03T23:00:00+0200,2024-03-26T23:00:00+0100,1997-Q1,2023-Q4
3,Gross weight of goods handled in main ports by...,MAR_GO_QMC,dataset,2024-05-03T23:00:00+0200,2024-03-26T23:00:00+0100,1997-Q1,2023-Q4
4,Gross weight of goods transported to/from main...,MAR_GO_QM_BE,dataset,2024-03-26T23:00:00+0100,2024-03-05T23:00:00+0100,1997-Q1,2023-Q3


In [3]:
# ANSI escape codes used to switch bold text on and off in printed headings.
BOLD = "\033[1m"
END = "\033[0m"


def get_dataset_info(dataset_code):
    """Print an overview of a Eurostat dataset.

    Three sections are written to the cell output: the list of parameter
    names of the dataset, the description of each parameter (shown through
    ``display``), and the values available for every parameter.

    Parameters
    ----------
    dataset_code : str
        Eurostat code of the dataset, e.g. ``"NRG_BAL_C"``.

    Returns
    -------
    None
        Everything is printed / displayed; nothing is returned.
    """
    parameters = eurostat.get_pars(dataset_code)

    # Section 1: parameter names.
    print(f"{BOLD}Parameters are:{END}")
    print(parameters)
    print()

    # Section 2: description attached to each parameter.
    print(f"{BOLD}Parameters corresponds to:{END}")
    display(eurostat.get_dic(dataset_code))
    print()

    # Section 3: allowed values, one parameter after the other.
    print(f"{BOLD}Values of parameters are:{END}")
    for par_name in parameters:
        print(par_name)
        print(eurostat.get_par_values(dataset_code, par_name))

In [4]:
# Dataset inspected first.
dataset_code = "NRG_BAL_C"

# Print its parameters, their descriptions and their available values.
get_dataset_info(dataset_code)

[1mParameters are:[0m
['freq', 'nrg_bal', 'siec', 'unit', 'geo']

[1mParameters corresponds to:[0m


[('freq',
  'Time frequency',
  'This code list contains the periodicity that refers to the frequency.'),
 ('nrg_bal',
  'Energy balance',
  'This code list refers to the flows used for the energy balances.'),
 ('siec',
  'Standard international energy product classification (SIEC)',
  'This code list contains the energy products according to the Standard International Energy Product Classification (SIEC) which has been developped as part of the International Recommendations for Energy Statistics (IRES) adopted by the UNSD.'),
 ('unit', 'Unit of measure', None),
 ('geo',
  'Geopolitical entity (reporting)',
  'This code list defines the reporting geopolitical entities.')]


[1mValues of parameters are:[0m
freq
['A']
nrg_bal
['PPRD', 'RCV_RCY', 'IMP', 'EXP', 'STK_CHG', 'GAE', 'INTMARB', 'GIC', 'INTAVI', 'NRGSUP', 'GIC2020-2030', 'PEC2020-2030', 'FEC2020-2030', 'TI_E', 'TI_EHG_E', 'TI_EHG_MAPE_E', 'TI_EHG_MAPCHP_E', 'TI_EHG_MAPH_E', 'TI_EHG_APE_E', 'TI_EHG_APCHP_E', 'TI_EHG_APH_E', 'TI_EHG_EDHP', 'TI_EHG_EB', 'TI_EHG_EPS', 'TI_EHG_DHEP', 'TI_EHG_CB', 'TI_CO_E', 'TI_BF_E', 'TI_GW_E', 'TI_RPI_E', 'TI_RPI_RI_E', 'TI_RPI_BPI_E', 'TI_RPI_PT_E', 'TI_RPI_IT_E', 'TI_RPI_DU_E', 'TI_RPI_PII_E', 'TI_PF_E', 'TI_BKBPB_E', 'TI_CL_E', 'TI_BNG_E', 'TI_LBB_E', 'TI_CPP_E', 'TI_GTL_E', 'TI_NSP_E', 'TO', 'TO_EHG', 'TO_EHG_MAPE', 'TO_EHG_MAPCHP', 'TO_EHG_MAPH', 'TO_EHG_APE', 'TO_EHG_APCHP', 'TO_EHG_APH', 'TO_EHG_EDHP', 'TO_EHG_EB', 'TO_EHG_PH', 'TO_EHG_OTH', 'TO_CO', 'TO_BF', 'TO_GW', 'TO_RPI', 'TO_RPI_RO', 'TO_RPI_BKFLOW', 'TO_RPI_PT', 'TO_RPI_IT', 'TO_RPI_PPR', 'TO_RPI_PIR', 'TO_PF', 'TO_BKBPB', 'TO_CL', 'TO_BNG', 'TO_LBB', 'TO_CPP', 'TO_GTL', 'TO_NSP', 'NRG_E', 'NRG_EHG_E

In [5]:
# Eurostat dataset codes inspected and exported in the cells below.
list_dataset_code = [
    "NRG_CB_RW",
    "NRG_IND_URTD",
    "NRG_INF_LBPC",
    "NRG_BAL_C",
]

In [6]:
# Describe every selected dataset: separator line, dataset code, blank line,
# then the full parameter overview.
for dataset_code in list_dataset_code:
    print("____________________", dataset_code, "", sep="\n")
    get_dataset_info(dataset_code)

____________________
NRG_CB_RW

[1mParameters are:[0m
['freq', 'nrg_bal', 'siec', 'unit', 'geo']

[1mParameters corresponds to:[0m


[('freq',
  'Time frequency',
  'This code list contains the periodicity that refers to the frequency.'),
 ('nrg_bal',
  'Energy balance',
  'This code list refers to the flows used for the energy balances.'),
 ('siec',
  'Standard international energy product classification (SIEC)',
  'This code list contains the energy products according to the Standard International Energy Product Classification (SIEC) which has been developped as part of the International Recommendations for Energy Statistics (IRES) adopted by the UNSD.'),
 ('unit', 'Unit of measure', None),
 ('geo',
  'Geopolitical entity (reporting)',
  'This code list defines the reporting geopolitical entities.')]


[1mValues of parameters are:[0m
freq
['A']
nrg_bal
['IPRD', 'IMP', 'EXP', 'STK_CHG', 'INTMARB', 'IC_CAL', 'INTAVI_E', 'INTAVI_NE', 'TI_E', 'TI_EHG_MAPE_E', 'TI_EHG_MAPCHP_E', 'TI_EHG_MAPH_E', 'TI_EHG_APE_E', 'TI_EHG_APCHP_E', 'TI_EHG_APH_E', 'TI_BF_E', 'TI_GW_E', 'TI_PF_E', 'TI_BKBPB_E', 'TI_BNG_E', 'TI_LBB_E', 'TI_CPP_E', 'TI_GTL', 'TI_STG', 'TI_STL', 'TI_LTG', 'TI_LTL', 'TI_NSP_E', 'NRG_E', 'NRG_EHG_E', 'NRG_CM_E', 'NRG_CM_NE', 'NRG_OIL_NG_NE', 'NRG_PF_E', 'NRG_CO_E', 'NRG_CO_NE', 'NRG_BKBPB_E', 'NRG_GW_E', 'NRG_GW_NE', 'NRG_BF_E', 'NRG_BF_NE', 'NRG_PR_E', 'NRG_BIOG_E', 'NRG_CPP_E', 'NRG_NSP_E', 'NRG_NSP_NE', 'DL', 'FC', 'FC_NE', 'NRG_NE', 'FC_IND_NE', 'FC_TRA_NE', 'FC_OTH_NE', 'FC_E', 'FC_IND_E', 'FC_IND_IS_E', 'FC_IND_IS_NE', 'FC_IND_CPC_E', 'FC_IND_CPC_NE', 'FC_IND_NFM_E', 'FC_IND_NFM_NE', 'FC_IND_NMM_E', 'FC_IND_NMM_NE', 'FC_IND_TE_E', 'FC_IND_TE_NE', 'FC_IND_MAC_E', 'FC_IND_MAC_NE', 'FC_IND_MQ_E', 'FC_IND_MQ_NE', 'FC_IND_FBT_E', 'FC_IND_FBT_NE', 'FC_IND_PPP_E', 'FC_IND_PPP_NE

[('freq',
  'Time frequency',
  'This code list contains the periodicity that refers to the frequency.'),
 ('siec',
  'Standard international energy product classification (SIEC)',
  'This code list contains the energy products according to the Standard International Energy Product Classification (SIEC) which has been developped as part of the International Recommendations for Energy Statistics (IRES) adopted by the UNSD.'),
 ('nrg_bal',
  'Energy balance',
  'This code list refers to the flows used for the energy balances.'),
 ('unit', 'Unit of measure', None),
 ('geo',
  'Geopolitical entity (reporting)',
  'This code list defines the reporting geopolitical entities.')]


[1mValues of parameters are:[0m
freq
['A']
siec
['TOTAL', 'RA000', 'R5200', 'R5250S', 'R5251S', 'R5252S', 'R5253S', 'R5254S', 'R5255S', 'R5256S', 'R5257S', 'R5258S', 'R5259S', 'R5260S', 'R5261S', 'R5262S', 'R5263S', 'R5264S', 'R5265S', 'R5266S', 'R5267S', 'R5268S', 'R5269S', 'R5270S', 'R5271S', 'R5272S', 'R5273S', 'R5274S', 'R5275S', 'R5276S', 'R5277S', 'R5278S', 'R5279S', 'R5300', 'R5300B', 'E7000', 'E7100', 'E7200', 'RHYD', 'RSYNF']
nrg_bal
['FC_TRA_E_RED', 'FC_TRA_ROAD_E_RED', 'FC_TRA_RAIL_E_RED', 'FC_TRA_OTH_E_RED', 'NRG_MNBRFT_E', 'GFC_TRA_E_NMULTI', 'GFC_TRA_E_MULTI', 'LIM_TT', 'BIOFCON_LIM_TRA', 'BIOF_FS_SHR', 'ST_MS_TRA', 'BIOG_G_SHR', 'BIOG_G_TRA', 'BIOG_G_ADJSHR']
unit
['KTOE', 'TJ_GCV', 'PC']
geo
['EU27_2020', 'EA20', 'BE', 'BG', 'CZ', 'DK', 'DE', 'EE', 'IE', 'EL', 'ES', 'FR', 'HR', 'IT', 'CY', 'LV', 'LT', 'LU', 'HU', 'MT', 'NL', 'AT', 'PL', 'PT', 'RO', 'SI', 'SK', 'FI', 'SE', 'IS', 'NO', 'UK', 'BA', 'ME', 'MD', 'MK', 'GE', 'AL', 'RS', 'TR', 'XK']
____________________
NRG

[('freq',
  'Time frequency',
  'This code list contains the periodicity that refers to the frequency.'),
 ('siec',
  'Standard international energy product classification (SIEC)',
  'This code list contains the energy products according to the Standard International Energy Product Classification (SIEC) which has been developped as part of the International Recommendations for Energy Statistics (IRES) adopted by the UNSD.'),
 ('plant_tec',
  'Technical characteristics of plants',
  'This code list refers to the technical characteristics of a plant, such as capacity or production specificities.'),
 ('unit', 'Unit of measure', None),
 ('geo',
  'Geopolitical entity (reporting)',
  'This code list defines the reporting geopolitical entities.')]


[1mValues of parameters are:[0m
freq
['A']
siec
['R5210P', 'R5220P', 'R5230P', 'R5290']
plant_tec
['CAP_PRD']
unit
['THS_TY']
geo
['EU27_2020', 'EA20', 'BE', 'BG', 'CZ', 'DK', 'DE', 'EE', 'IE', 'EL', 'ES', 'FR', 'HR', 'IT', 'CY', 'LV', 'LT', 'LU', 'HU', 'MT', 'NL', 'AT', 'PL', 'PT', 'RO', 'SI', 'SK', 'FI', 'SE', 'IS', 'LI', 'NO', 'UK', 'BA', 'ME', 'MD', 'MK', 'GE', 'AL', 'RS', 'TR', 'UA', 'XK']
____________________
NRG_BAL_C

[1mParameters are:[0m
['freq', 'nrg_bal', 'siec', 'unit', 'geo']

[1mParameters corresponds to:[0m


[('freq',
  'Time frequency',
  'This code list contains the periodicity that refers to the frequency.'),
 ('nrg_bal',
  'Energy balance',
  'This code list refers to the flows used for the energy balances.'),
 ('siec',
  'Standard international energy product classification (SIEC)',
  'This code list contains the energy products according to the Standard International Energy Product Classification (SIEC) which has been developped as part of the International Recommendations for Energy Statistics (IRES) adopted by the UNSD.'),
 ('unit', 'Unit of measure', None),
 ('geo',
  'Geopolitical entity (reporting)',
  'This code list defines the reporting geopolitical entities.')]


[1mValues of parameters are:[0m
freq
['A']
nrg_bal
['PPRD', 'RCV_RCY', 'IMP', 'EXP', 'STK_CHG', 'GAE', 'INTMARB', 'GIC', 'INTAVI', 'NRGSUP', 'GIC2020-2030', 'PEC2020-2030', 'FEC2020-2030', 'TI_E', 'TI_EHG_E', 'TI_EHG_MAPE_E', 'TI_EHG_MAPCHP_E', 'TI_EHG_MAPH_E', 'TI_EHG_APE_E', 'TI_EHG_APCHP_E', 'TI_EHG_APH_E', 'TI_EHG_EDHP', 'TI_EHG_EB', 'TI_EHG_EPS', 'TI_EHG_DHEP', 'TI_EHG_CB', 'TI_CO_E', 'TI_BF_E', 'TI_GW_E', 'TI_RPI_E', 'TI_RPI_RI_E', 'TI_RPI_BPI_E', 'TI_RPI_PT_E', 'TI_RPI_IT_E', 'TI_RPI_DU_E', 'TI_RPI_PII_E', 'TI_PF_E', 'TI_BKBPB_E', 'TI_CL_E', 'TI_BNG_E', 'TI_LBB_E', 'TI_CPP_E', 'TI_GTL_E', 'TI_NSP_E', 'TO', 'TO_EHG', 'TO_EHG_MAPE', 'TO_EHG_MAPCHP', 'TO_EHG_MAPH', 'TO_EHG_APE', 'TO_EHG_APCHP', 'TO_EHG_APH', 'TO_EHG_EDHP', 'TO_EHG_EB', 'TO_EHG_PH', 'TO_EHG_OTH', 'TO_CO', 'TO_BF', 'TO_GW', 'TO_RPI', 'TO_RPI_RO', 'TO_RPI_BKFLOW', 'TO_RPI_PT', 'TO_RPI_IT', 'TO_RPI_PPR', 'TO_RPI_PIR', 'TO_PF', 'TO_BKBPB', 'TO_CL', 'TO_BNG', 'TO_LBB', 'TO_CPP', 'TO_GTL', 'TO_NSP', 'NRG_E', 'NRG_EHG_E

In [44]:
def get_filter_param(dataset_code, freq_prefered="A", unit_prefered="TJ", **kwargs):
    """Build the ``filter_pars`` dict used to avoid downloading useless data.

    Parameters
    ----------
    dataset_code : str
        Eurostat code of the dataset.
    freq_prefered : str, default "A"
        Frequency to request. It is only added to the filter when the dataset
        offers it; otherwise no ``freq`` filter is set.
    unit_prefered : str, default "TJ"
        Unit to request. When the dataset does not offer it, the first unit
        listed for the dataset is used instead. No ``unit`` filter is set if
        the dataset lists no unit at all.
    **kwargs
        Extra ``parameter=value(s)`` filters copied as-is into the result.
        They are applied last, so they override the ``freq`` / ``unit``
        entries computed here when the names collide.

    Returns
    -------
    dict
        Mapping of parameter name to the value(s) to keep.
    """
    my_filter_pars = {}

    # Honour the caller's preferences: they used to be overwritten here by
    # hard-coded "A" / "TJ", which made both arguments ineffective.
    freqs = eurostat.get_par_values(dataset_code, "freq")
    if freq_prefered in freqs:
        my_filter_pars["freq"] = freq_prefered

    units = eurostat.get_par_values(dataset_code, "unit")
    if unit_prefered in units:
        my_filter_pars["unit"] = unit_prefered
    elif units:
        # Fall back on the first available unit; skipped when the list is
        # empty so that units[0] cannot raise IndexError.
        my_filter_pars["unit"] = units[0]

    my_filter_pars.update(kwargs)
    return my_filter_pars

In [45]:
# Pick the first dataset of the list and show its code.
dataset_code = list_dataset_code[0]
print(dataset_code)

NRG_CB_RW


In [46]:
def create_csv(dataset_code, **kwargs):
    """Download a Eurostat dataset and save it as ``data/<dataset_code>.csv``.

    The download is restricted with the filter returned by
    ``get_filter_param`` (frequency "A", unit "TJ" when available, plus any
    extra filter given in ``kwargs``). In the ``siec`` and ``nrg_bal``
    columns, when present, the codes are replaced using the dictionary
    returned by ``eurostat.get_dic``; the ``geo\\TIME_PERIOD`` column is
    renamed ``Country``.

    Parameters
    ----------
    dataset_code : str
        Eurostat code of the dataset.
    **kwargs
        Extra ``parameter=value(s)`` filters forwarded to ``get_filter_param``.

    Returns
    -------
    None
        The result is written to disk (the DataFrame index is written too).
    """
    filter_pars = get_filter_param(
        dataset_code, freq_prefered="A", unit_prefered="TJ", **kwargs
    )

    csv_file = f"data/{dataset_code}.csv"
    # Create the output folder if needed, otherwise to_csv fails when
    # "data/" does not exist yet.
    os.makedirs(os.path.dirname(csv_file), exist_ok=True)

    # Download the data
    df = eurostat.get_data_df(dataset_code, filter_pars=filter_pars)

    # Replace the codes of the siec / nrg_bal columns via the Eurostat dictionary
    for col in ["siec", "nrg_bal"]:
        if col in df.columns:
            dict_col = eurostat.get_dic(dataset_code, col, frmt="dict")
            df[col] = df[col].replace(dict_col)

    # Raw string: "\T" is not a valid escape sequence, so the plain literal
    # triggers a warning on recent Python versions.
    df = df.rename(columns={r"geo\TIME_PERIOD": "Country"})
    df.to_csv(csv_file)

In [53]:
def create_nc(dataset_code, **kwargs):
    """Export a Eurostat dataset to ``data/<dataset_code>.nc``.

    The CSV export is (re)created first with ``create_csv``, then read back.
    Object-typed columns become the index levels, float-typed columns (whose
    names are parsed as dates) become the ``Year`` axis. The stacked result
    is converted to xarray, its ``siec`` dimension is renamed ``Energy``,
    length-one dimensions are squeezed out and the result is written to disk.

    Parameters
    ----------
    dataset_code : str
        Eurostat code of the dataset.
    **kwargs
        Extra filters forwarded to ``create_csv``.

    Returns
    -------
    None
    """
    csv_file = f"data/{dataset_code}.csv"
    nc_file = f"data/{dataset_code}.nc"

    # Refresh the CSV export, then load it back (first column is the saved index).
    create_csv(dataset_code, **kwargs)
    table = pd.read_csv(csv_file, index_col=0)

    # Text columns identify a series, float columns hold the values.
    label_cols = list(table.select_dtypes(object).columns)
    value_cols = list(table.select_dtypes(float).columns)
    wide = table.set_index(label_cols)[value_cols]

    # Turn the column labels into a named datetime axis.
    years = pd.to_datetime(wide.columns)
    years.name = "Year"
    wide.columns = years

    # Long format -> xarray, rename the energy axis, drop length-one dimensions.
    data = wide.stack().to_xarray().rename({"siec": "Energy"}).squeeze()
    data.to_netcdf(nc_file)

In [54]:
# Export every dataset to NetCDF, except NRG_BAL_C which is only printed here.
for dataset_code in list_dataset_code:
    print(dataset_code)
    if dataset_code == "NRG_BAL_C":
        continue
    create_nc(dataset_code)

NRG_CB_RW
NRG_IND_URTD
NRG_INF_LBPC
NRG_BAL_C


# NRG_BAL_C

In [55]:
# Dataset handled in this section.
dataset_code = "NRG_BAL_C"

In [49]:
# Print the parameters of the current dataset, their descriptions and their values.
get_dataset_info(dataset_code)

[1mParameters are:[0m
['freq', 'nrg_bal', 'siec', 'unit', 'geo']

[1mParameters corresponds to:[0m


[('freq',
  'Time frequency',
  'This code list contains the periodicity that refers to the frequency.'),
 ('nrg_bal',
  'Energy balance',
  'This code list refers to the flows used for the energy balances.'),
 ('siec',
  'Standard international energy product classification (SIEC)',
  'This code list contains the energy products according to the Standard International Energy Product Classification (SIEC) which has been developped as part of the International Recommendations for Energy Statistics (IRES) adopted by the UNSD.'),
 ('unit', 'Unit of measure', None),
 ('geo',
  'Geopolitical entity (reporting)',
  'This code list defines the reporting geopolitical entities.')]


[1mValues of parameters are:[0m
freq
['A']
nrg_bal
['PPRD', 'RCV_RCY', 'IMP', 'EXP', 'STK_CHG', 'GAE', 'INTMARB', 'GIC', 'INTAVI', 'NRGSUP', 'GIC2020-2030', 'PEC2020-2030', 'FEC2020-2030', 'TI_E', 'TI_EHG_E', 'TI_EHG_MAPE_E', 'TI_EHG_MAPCHP_E', 'TI_EHG_MAPH_E', 'TI_EHG_APE_E', 'TI_EHG_APCHP_E', 'TI_EHG_APH_E', 'TI_EHG_EDHP', 'TI_EHG_EB', 'TI_EHG_EPS', 'TI_EHG_DHEP', 'TI_EHG_CB', 'TI_CO_E', 'TI_BF_E', 'TI_GW_E', 'TI_RPI_E', 'TI_RPI_RI_E', 'TI_RPI_BPI_E', 'TI_RPI_PT_E', 'TI_RPI_IT_E', 'TI_RPI_DU_E', 'TI_RPI_PII_E', 'TI_PF_E', 'TI_BKBPB_E', 'TI_CL_E', 'TI_BNG_E', 'TI_LBB_E', 'TI_CPP_E', 'TI_GTL_E', 'TI_NSP_E', 'TO', 'TO_EHG', 'TO_EHG_MAPE', 'TO_EHG_MAPCHP', 'TO_EHG_MAPH', 'TO_EHG_APE', 'TO_EHG_APCHP', 'TO_EHG_APH', 'TO_EHG_EDHP', 'TO_EHG_EB', 'TO_EHG_PH', 'TO_EHG_OTH', 'TO_CO', 'TO_BF', 'TO_GW', 'TO_RPI', 'TO_RPI_RO', 'TO_RPI_BKFLOW', 'TO_RPI_PT', 'TO_RPI_IT', 'TO_RPI_PPR', 'TO_RPI_PIR', 'TO_PF', 'TO_BKBPB', 'TO_CL', 'TO_BNG', 'TO_LBB', 'TO_CPP', 'TO_GTL', 'TO_NSP', 'NRG_E', 'NRG_EHG_E

In [60]:
# Export the current dataset to NetCDF, passing an extra nrg_bal filter
# with three codes.
create_nc(
    dataset_code,
    nrg_bal=[
        "GEP",
        "GHP",
        "FC_TRA_E",
    ],
)

In [51]:
# Map every nrg_bal code to its label, built from the (code, label) pairs
# returned by eurostat.get_dic.
dict_nrg_bal = dict(eurostat.get_dic(dataset_code, "nrg_bal"))

In [52]:
# Display the whole code -> label mapping.
dict_nrg_bal

{'TOTAL': 'Total',
 'PRD': 'Production',
 'PPRD': 'Primary production',
 'PPRD_RED': 'Primary production - Renewable Energy Directive',
 'IPRD': 'Indigenous production',
 'IPRD_AG': 'Indigenous production - associated gas',
 'IPRD_NAG': 'Indigenous production - non-associated gas',
 'IPRD_CG': 'Indigenous production - colliery gas',
 'IPRD_SB': 'Indigenous production - solid biofuels',
 'IPRD_LB': 'Indigenous production - liquid biofuels',
 'IPRD_GB': 'Indigenous production - gaseous biofuels',
 'UPRD': 'Underground production',
 'SPRD': 'Surface production',
 'NPRD': 'Net production',
 'PRD_OTH': 'Other production',
 'TOS': 'Transfer from other sources',
 'TOS_OIL': 'Transfer from other sources - oil',
 'TOS_COAL': 'Transfer from other sources - coal',
 'TOS_REN': 'Transfer from other sources - renewables',
 'TOS_NGAS': 'Transfer from other sources - natural gas',
 'IDCO': 'Into direct carry-over',
 'FDCO': 'From direct carry-over',
 'RCV_RCY': 'Recovered and recycled products',
 'IMP