In [1]:
import seaborn as sns
import pandas as pd
pd.set_option('display.max_colwidth',None)

In [2]:
from pathlib import Path

In [3]:
import eurostat

def load_df_raw_catalog():
    return eurostat.get_toc_df()

In [4]:
import re

from pandas.core.interchange.dataframe_protocol import DataFrame


def split_domain(code):
    match = re.match(r"^([A-Z]+)+(_)+(.*)",code)
    if match:
        return match.group(1), match.group(3)
    return code

def convert_to_datetime(series):
    cleaned = series.astype('str').str.replace(r'([+-]\d{2}):(\d{2})', r'\1:\2', regex=True)
    return pd.to_datetime(cleaned, errors='coerce', utc=True)

def convert_object_to_category(df:DataFrame, threshold :int=5):
    description = df[df.select_dtypes('object')].describe()
    unique_counts = description.loc['unique']
    for column in unique_counts[unique_counts < threshold].index.tolist():
        df[column] = df[column].astype('category')
    return df

# Load and explore data

In [5]:
df_ds = load_df_raw_catalog()

In [6]:
df_ds.head()

Unnamed: 0,title,code,type,last update of data,last table structure change,data start,data end
0,"Employment rate of adults by sex, age groups, educational attainment level, number of children and age of youngest child (%)",LFST_HHEREDCH$DV_1904,dataset,2025-04-15T11:00:00+0200,2025-04-15T11:00:00+0200,,
1,"Employment rate of adults by sex, age groups, educational attainment level, number of children and age of youngest child (%)",LFST_HHEREDCH$DV_2683,dataset,2025-04-15T11:00:00+0200,2025-04-15T11:00:00+0200,,
2,"Employment rate by sex, age groups, educational attainment level and household composition (%)",LFST_HHEREDTY,dataset,2025-04-15T11:00:00+0200,2025-04-15T11:00:00+0200,2006.0,2024.0
3,"Number of persons by sex, age groups, household composition and educational attainment level (1 000)",LFST_HHINDED,dataset,2025-04-15T11:00:00+0200,2025-04-15T11:00:00+0200,2006.0,2024.0
4,"Number of persons by sex, age groups, household composition and working status (1 000)",LFST_HHINDWS,dataset,2025-04-15T11:00:00+0200,2025-04-15T11:00:00+0200,2006.0,2024.0


In [7]:
df_ds.dtypes

title                          object
code                           object
type                           object
last update of data            object
last table structure change    object
data start                     object
data end                       object
dtype: object

In [8]:
df_ds.describe()

Unnamed: 0,title,code,type,last update of data,last table structure change,data start,data end
count,8058,8058,8058,8058,8058,7517,7517
unique,7337,8058,1,787,419,193,102
top,Individuals - internet activities,POST_CUBE1_X$POST_DTR_1,dataset,2025-04-30T11:00:00+0200,2024-01-03T23:00:00+0100,2005,2024
freq,6,1,8058,576,1524,599,1548


## Set column data types

In [9]:
df_ds['title'] = df_ds['title'].astype('str')
df_ds['code'] = df_ds['code'].astype('str')
df_ds['last update of data'] = convert_to_datetime(df_ds['last update of data'])
df_ds['last table structure change'] = convert_to_datetime(df_ds['last table structure change']) 

In [10]:
df_ds['data start'].unique()

array([None, '2006', '2021', '1995', '2008', '2003', '1999', '1992',
       '1997', '2001', '2002', '2013', '2000', '2014', '1998', '2005',
       '2019', '2004', '2018', '1997-Q1', '2001-Q1', '2000-Q1', '2003-Q1',
       '2005-Q1', '2007-Q1', '2009-Q1', '2011-Q1', '2013-Q1', '2015-Q1',
       '2017-Q1', '2019-Q1', '2021-Q1', '2023-Q1', '2004-Q1', '2002-Q1',
       '1998-Q1', '2018-Q1', '2008-Q1', '2011', '2010', '2012', '2009',
       '2020', '2023', '2024-04', '2007', '1960', '1980', '1995-Q1',
       '1993', '1985', '1990', '1995-Q4', '1996', '2015', '1994',
       '1996-Q1', '2014-Q1', '1977', '1990-01', '2021-01', '2022',
       '2008-01', '1950', '2019-S2', '2022-Q1', '1986', '1988', '1999-01',
       '2002-01', '1978', '2016', '2022-03', '2018-01', '1975', '1994-Q1',
       '2017', '1975-Q1', '1949', '1952-Q1', '1947-Q1', '2019-01',
       '2023-11', '1991', '1992-Q2', '1983-01', '1992-Q1', '1989',
       '1980-Q1', '1978-Q1', '1979', '1998-Q4', '1999-Q1', '2005-01',
       '198

In [11]:
df_ds[['start_year','start_quater']] = df_ds['data start'].str.split('-',n=1,expand=True)
df_ds[['end_year','end_quater']] = df_ds['data end'].str.split('-',n=1,expand=True)
df_ds['start_year'] = pd.to_numeric(df_ds['start_year'], errors='coerce')
df_ds['end_year'] = pd.to_numeric(df_ds['end_year'], errors='coerce')
df_ds = df_ds.drop(columns=['data start', 'data end'])

In [12]:
unique_start = df_ds['start_year'].unique()
unique_start.sort()
unique_start

array([1947., 1949., 1950., 1952., 1953., 1954., 1959., 1960., 1962.,
       1967., 1968., 1969., 1970., 1971., 1973., 1974., 1975., 1976.,
       1977., 1978., 1979., 1980., 1981., 1982., 1983., 1984., 1985.,
       1986., 1987., 1988., 1989., 1990., 1991., 1992., 1993., 1994.,
       1995., 1996., 1997., 1998., 1999., 2000., 2001., 2002., 2003.,
       2004., 2005., 2006., 2007., 2008., 2009., 2010., 2011., 2012.,
       2013., 2014., 2015., 2016., 2017., 2018., 2019., 2020., 2021.,
       2022., 2023., 2024.,   nan])

In [13]:
unique_start = df_ds['end_year'].unique()
unique_start.sort()
unique_start

array([1983., 1984., 1991., 1995., 1996., 1997., 1998., 1999., 2000.,
       2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008., 2009.,
       2010., 2011., 2012., 2013., 2014., 2015., 2016., 2017., 2018.,
       2019., 2020., 2021., 2022., 2023., 2024., 2025., 2026., 2032.,
       2050., 2100.,   nan])

In [14]:
df_ds.describe(include = 'all')

Unnamed: 0,title,code,type,last update of data,last table structure change,start_year,start_quater,end_year,end_quater
count,8058,8058,8058,8058,8058,7517.0,723,7517.0,827
unique,7337,8058,1,,,,20,,21
top,Individuals - internet activities,POST_CUBE1_X$POST_DTR_1,dataset,,,,Q1,,Q4
freq,6,1,8058,,,,359,,304
mean,,,,2022-06-22 00:38:33.203772672+00:00,2024-08-16 08:17:21.722015232+00:00,2005.709991,,2019.14261,
min,,,,2009-03-26 10:00:00+00:00,2018-12-13 12:00:00+00:00,1947.0,,1983.0,
25%,,,,2021-10-12 09:00:00+00:00,2024-01-03 22:00:00+00:00,2000.0,,2016.0,
50%,,,,2024-12-13 10:00:00+00:00,2024-10-15 09:00:00+00:00,2007.0,,2022.0,
75%,,,,2025-04-14 21:00:00+00:00,2025-03-27 22:00:00+00:00,2014.0,,2024.0,
max,,,,2025-05-05 09:00:00+00:00,2025-05-05 09:00:00+00:00,2024.0,,2100.0,


## Dataset cleanup  and relevance filtring

In [15]:
df_ds = df_ds.drop(columns='type')

In [16]:
df_title_duplicates = df_ds[df_ds.duplicated(subset='title',keep=False)]
df_title_duplicates

Unnamed: 0,title,code,last update of data,last table structure change,start_year,start_quater,end_year,end_quater
0,"Employment rate of adults by sex, age groups, educational attainment level, number of children and age of youngest child (%)",LFST_HHEREDCH$DV_1904,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
1,"Employment rate of adults by sex, age groups, educational attainment level, number of children and age of youngest child (%)",LFST_HHEREDCH$DV_2683,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
6,"Number of households by household composition, number of children and age of youngest child (1 000)",LFST_HHNHTYCH,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,2006.0,,2024.0,
7,"Number of households by household composition, number of children and age of youngest child (1 000)",LFST_HHNHTYCH$DV_2142,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
8,"Number of households by household composition, number of children and working status within households (1 000)",LFST_HHNHWHTC,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,2006.0,,2024.0,
...,...,...,...,...,...,...,...,...
7922,Employment rate by sex,SDG_08_30,2025-04-14 21:00:00+00:00,2025-04-14 21:00:00+00:00,2009.0,,2024.0,
7924,Long-term unemployment rate by sex,SDG_08_40,2025-03-13 22:00:00+00:00,2025-03-13 22:00:00+00:00,2009.0,,2024.0,
7941,Soil sealing index,SDG_11_32,2024-06-11 21:00:00+00:00,2025-01-23 22:00:00+00:00,2006.0,,2018.0,
7944,Recycling rate of municipal waste,SDG_11_60,2025-02-13 10:00:00+00:00,2025-02-13 10:00:00+00:00,2000.0,,2023.0,


It looks like datasets with duplicates titles include at least one version that has a $DV suffix followed by number.
This indicates a derived veiw (or data variation), which is not relevant for the purpose of this project. 

Therefore, I will exclude these derived view from the list of available datasets.


In [17]:
df_ds[df_ds['code'].str.contains('\\$DV', na = False)]

Unnamed: 0,title,code,last update of data,last table structure change,start_year,start_quater,end_year,end_quater
0,"Employment rate of adults by sex, age groups, educational attainment level, number of children and age of youngest child (%)",LFST_HHEREDCH$DV_1904,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
1,"Employment rate of adults by sex, age groups, educational attainment level, number of children and age of youngest child (%)",LFST_HHEREDCH$DV_2683,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
7,"Number of households by household composition, number of children and age of youngest child (1 000)",LFST_HHNHTYCH$DV_2142,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
9,"Number of households by household composition, number of children and working status within households (1 000)",LFST_HHNHWHTC$DV_1623,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
11,"Percentage of part-time employment of adults by sex, age groups, number of children and age of youngest child",LFST_HHPTECHI$DV_1903,2025-04-15 09:00:00+00:00,2025-04-15 09:00:00+00:00,,,,
...,...,...,...,...,...,...,...,...
7862,"Industry by employment size class (NACE Rev. 2, B-E) (2005-2020)",SBS_SC_IND_R2$DV_665,2024-03-01 22:00:00+00:00,2023-12-21 22:00:00+00:00,,,,
7864,Enterprise statistics by size class and NACE Rev. 2 activity (from 2021 onwards),SBS_SC_OVW$DV_1482,2025-04-07 21:00:00+00:00,2025-04-07 21:00:00+00:00,,,,
7865,Enterprise statistics by size class and NACE Rev. 2 activity (from 2021 onwards),SBS_SC_OVW$DV_1601,2025-04-07 21:00:00+00:00,2025-04-07 21:00:00+00:00,,,,
7866,Enterprise statistics by size class and NACE Rev. 2 activity (from 2021 onwards),SBS_SC_OVW$DV_2302,2025-04-07 21:00:00+00:00,2025-04-07 21:00:00+00:00,,,,


In [18]:
df_ds = df_ds[~df_ds['code'].str.contains('\\$DV', na = False)]

There are two types of naming:
1. {domain}_{specific area} as example LFSA = labour force survey + Annual
2. {domain}{number}, as example TIPSII40 = International investment position (tipsii) + number

In [19]:
df_ds[['domain','ds_name']] = df_ds['code'].apply(split_domain).apply(pd.Series)

In [20]:
df_ds['domain'] = df_ds['domain'].astype('category')

Duplicate dataset titles are still present, indicating that further analysis is needed to fully resolve and filter out all redundant entries.

In [21]:
df_title_duplicates = df_ds[df_ds.duplicated(subset='title',keep=False)]
df_title_duplicates.sort_values('title')

Unnamed: 0,title,code,last update of data,last table structure change,start_year,start_quater,end_year,end_quater,domain,ds_name
5834,"Active population by sex, age and citizenship (1 000)",LFSA_AGAN,2025-04-14 21:00:00+00:00,2025-04-14 21:00:00+00:00,1995.0,,2024.0,,LFSA,AGAN
4954,"Active population by sex, age and citizenship (1 000)",LFSQ_AGAN,2025-03-25 22:00:00+00:00,2025-03-14 10:00:00+00:00,1998.0,Q1,2024.0,Q4,LFSQ,AGAN
5832,"Active population by sex, age and educational attainment level (1 000)",LFSA_AGAED,2025-04-14 21:00:00+00:00,2025-04-14 21:00:00+00:00,1983.0,,2024.0,,LFSA,AGAED
4952,"Active population by sex, age and educational attainment level (1 000)",LFSQ_AGAED,2025-03-25 22:00:00+00:00,2025-03-14 10:00:00+00:00,1998.0,Q1,2024.0,Q4,LFSQ,AGAED
5845,"Activity rates by sex, age and citizenship (%)",LFSA_ARGAN,2025-04-14 21:00:00+00:00,2025-04-14 21:00:00+00:00,1995.0,,2024.0,,LFSA,ARGAN
...,...,...,...,...,...,...,...,...,...,...
3843,"Unemployment rates by sex, age and educational attainment level (%)",LFSA_URGAED,2025-04-14 21:00:00+00:00,2025-04-14 21:00:00+00:00,1983.0,,2024.0,,LFSA,URGAED
2125,Volume of passenger transport relative to GDP,TTR00001,2024-07-25 21:00:00+00:00,2025-01-23 22:00:00+00:00,2011.0,,2022.0,,TTR00001,
1482,Volume of passenger transport relative to GDP,TRAN_HV_PSTRA,2024-07-25 21:00:00+00:00,2024-07-25 21:00:00+00:00,1990.0,,2022.0,,TRAN,HV_PSTRA
7162,Water use balance,ENV_WAT_BAL,2024-07-08 09:00:00+00:00,2024-07-08 09:00:00+00:00,1970.0,,2022.0,,ENV,WAT_BAL


In [22]:
df_same_name = df_title_duplicates.groupby(['ds_name'])['title'].count().reset_index(name='count')
df_same_name = df_same_name[df_same_name['count']>1]
exclude_from_duplicates = set(df_title_duplicates[df_title_duplicates['ds_name'].isin(df_same_name['ds_name'])].sort_values('ds_name')['code'])
df_title_duplicates['not_duplicate'] = df_title_duplicates['code'].apply(lambda x: x in exclude_from_duplicates )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_duplicates['not_duplicate'] = df_title_duplicates['code'].apply(lambda x: x in exclude_from_duplicates )


By looking at docs there are 2 types of statistics annual and quater.
Because this project about storing data the both of them are going to be stored in same area but in different files.

Additionally there are derived tables that have the same title, but was not marked properly.
* Volume of passenger transport relative to GDP 	TTR00001 	- derived
* Volume of passenger transport relative to GDP 	TRAN_HV_PSTRA

But at the same time there are data sets that start with TTR and not derived

In [23]:
df_title_duplicates[df_title_duplicates['not_duplicate']==False].sort_values('title')

Unnamed: 0,title,code,last update of data,last table structure change,start_year,start_quater,end_year,end_quater,domain,ds_name,not_duplicate
4185,Aggregate replacement ratio for pensions (excluding other social benefits) by sex,TESPN070,2025-04-30 09:00:00+00:00,2025-04-30 09:00:00+00:00,2010.0,,2024.0,,TESPN070,,False
2715,Aggregate replacement ratio for pensions (excluding other social benefits) by sex,ILC_PNP3,2025-04-30 09:00:00+00:00,2025-01-27 22:00:00+00:00,2010.0,,2024.0,,ILC,PNP3,False
1500,Air transport of freight by NUTS 2 region,TRAN_R_AVGO_NM,2024-05-05 21:00:00+00:00,2024-08-29 09:00:00+00:00,1993.0,,2022.0,,TRAN,R_AVGO_NM,False
378,Air transport of freight by NUTS 2 region,TGS00078,2024-08-29 09:00:00+00:00,2025-01-23 22:00:00+00:00,2011.0,,2022.0,,TGS00078,,False
1502,Air transport of passengers by NUTS 2 region,TRAN_R_AVPA_NM,2024-05-05 21:00:00+00:00,2024-08-29 09:00:00+00:00,1993.0,,2022.0,,TRAN,R_AVPA_NM,False
...,...,...,...,...,...,...,...,...,...,...,...
3719,Unemployment rate by sex,TEILM020,2025-05-02 09:00:00+00:00,2025-04-04 09:00:00+00:00,2024.0,04,2025.0,03,TEILM020,,False
1482,Volume of passenger transport relative to GDP,TRAN_HV_PSTRA,2024-07-25 21:00:00+00:00,2024-07-25 21:00:00+00:00,1990.0,,2022.0,,TRAN,HV_PSTRA,False
2125,Volume of passenger transport relative to GDP,TTR00001,2024-07-25 21:00:00+00:00,2025-01-23 22:00:00+00:00,2011.0,,2022.0,,TTR00001,,False
7162,Water use balance,ENV_WAT_BAL,2024-07-08 09:00:00+00:00,2024-07-08 09:00:00+00:00,1970.0,,2022.0,,ENV,WAT_BAL,False


Looks like in general all dataset with underscore in their name contains raw or structured data.

Flat codes often contains pre-aggregated or simplified data.

So if the dataset have the same title, but one of entries no underscore then it is likely to be derived.
If both have underscode both saved
If no one have underscore they should be cheched.

In [24]:
def determine_duplicate(row, df : pd.DataFrame):
    if row['not_duplicate']:
        return True
    if '_' in row['code']:
        return True
    df_same_title : pd.DataFrame = df[df['title']==row['title']]
    # 
    has_underscore = df_same_title['code'].str.contains('_').any()
    if has_underscore:
        return False
    return True
    
df_title_duplicates['not_duplicate'] = df_title_duplicates.apply(lambda row: determine_duplicate(row,df_title_duplicates),axis=1)
df_title_duplicates[df_title_duplicates['not_duplicate']].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_duplicates['not_duplicate'] = df_title_duplicates.apply(lambda row: determine_duplicate(row,df_title_duplicates),axis=1)


title                          455
code                           455
last update of data            455
last table structure change    455
start_year                     455
start_quater                    83
end_year                       455
end_quater                      83
domain                         455
ds_name                        447
not_duplicate                  455
dtype: int64

In [25]:
df_title_duplicates[df_title_duplicates['not_duplicate']==False].sort_values('title')

Unnamed: 0,title,code,last update of data,last table structure change,start_year,start_quater,end_year,end_quater,domain,ds_name,not_duplicate
4185,Aggregate replacement ratio for pensions (excluding other social benefits) by sex,TESPN070,2025-04-30 09:00:00+00:00,2025-04-30 09:00:00+00:00,2010.0,,2024.0,,TESPN070,,False
378,Air transport of freight by NUTS 2 region,TGS00078,2024-08-29 09:00:00+00:00,2025-01-23 22:00:00+00:00,2011.0,,2022.0,,TGS00078,,False
377,Air transport of passengers by NUTS 2 region,TGS00077,2024-08-29 09:00:00+00:00,2025-01-23 22:00:00+00:00,2011.0,,2022.0,,TGS00077,,False
364,Animal populations by NUTS 2 region,TGS00045,2025-03-27 22:00:00+00:00,2025-03-27 22:00:00+00:00,2013.0,,2024.0,,TGS00045,,False
383,At-risk-of-poverty rate by NUTS 2 region,TGS00103,2025-04-30 09:00:00+00:00,2025-04-30 09:00:00+00:00,2013.0,,2024.0,,TGS00103,,False
374,Available beds in hospitals by NUTS 2 region,TGS00064,2024-09-03 21:00:00+00:00,2025-01-23 22:00:00+00:00,2011.0,,2022.0,,TGS00064,,False
757,Building permits - annual data,TIPSHO50,2025-05-05 09:00:00+00:00,2025-04-14 21:00:00+00:00,2005.0,,2024.0,,TIPSHO50,,False
3692,Building permits - monthly data,TEIIS550,2025-05-05 09:00:00+00:00,2025-04-15 09:00:00+00:00,2024.0,04,2025.0,03,TEIIS550,,False
3252,EMU convergence criterion series - annual data,TEC00097,2025-01-15 10:00:00+00:00,2025-01-15 10:00:00+00:00,2013.0,,2024.0,,TEC00097,,False
3382,Employment expectations indicator,TEIBS030,2025-04-29 09:00:00+00:00,2025-04-29 09:00:00+00:00,2024.0,05,2025.0,04,TEIBS030,,False


In [26]:
df_ds = df_ds[~df_ds['code'].isin(df_title_duplicates[df_title_duplicates['not_duplicate']==False])]

In [27]:
df_ds.describe(include='all')

Unnamed: 0,title,code,last update of data,last table structure change,start_year,start_quater,end_year,end_quater,domain,ds_name
count,7609,7609,7609,7609,7517.0,723,7517.0,827,7609,6823
unique,7337,7609,,,,20,,21,888,6624
top,Hourly earnings by economic activity and contractual working time (enterprises with 10 employed persons or more),POST_CUBE1_X$POST_DTR_1,,,,Q1,,Q4,ILC,10_F_BS
freq,4,1,,,,359,,304,501,4
mean,,,2022-05-09 14:47:38.009725184+00:00,2024-08-13 23:42:19.597318912+00:00,2005.709991,,2019.14261,,,
min,,,2009-03-26 10:00:00+00:00,2018-12-13 12:00:00+00:00,1947.0,,1983.0,,,
25%,,,2021-04-26 21:00:00+00:00,2024-01-03 22:00:00+00:00,2000.0,,2016.0,,,
50%,,,2024-12-12 22:00:00+00:00,2024-10-15 09:00:00+00:00,2007.0,,2022.0,,,
75%,,,2025-04-14 21:00:00+00:00,2025-03-27 22:00:00+00:00,2014.0,,2024.0,,,
max,,,2025-05-05 09:00:00+00:00,2025-05-05 09:00:00+00:00,2024.0,,2100.0,,,


In [28]:
df_ds['end_year'].unique()

array([2024., 2023., 2008., 2019., 2022., 1999., 2002., 2004., 2006.,
       2010., 2012., 2014., 2016., 2018., 2020., 2021., 2025., 2003.,
       2015., 2013., 2011., 2017., 2007., 2005., 2001., 2009., 2100.,
       2000., 1996., 2032., 2050.,   nan, 1998., 1997., 1991., 1995.,
       2026., 1983., 1984.])

# Check na values

In [29]:
(df_ds.isna().sum()/len(df_ds))*100

title                           0.000000
code                            0.000000
last update of data             0.000000
last table structure change     0.000000
start_year                      1.209094
start_quater                   90.498094
end_year                        1.209094
end_quater                     89.131292
domain                          0.000000
ds_name                        10.329873
dtype: float64

In [30]:
df_ds.isna().value_counts()

title  code   last update of data  last table structure change  start_year  start_quater  end_year  end_quater  domain  ds_name
False  False  False                False                        False       True          False     True        False   False      6121
                                                                                                                        True        569
                                                                            False         False     False       False   False       516
                                                                                                                        True        207
                                                                            True          False     False       False   False       104
                                                                True        True          True      True        False   False        82
                                                        

In [79]:
def setDomain(row):
    match = re.match(r"^([a-zA-Z]+)(_|-)?(.*)",row['code'])
    if match:
        gr2 = match.group(3)
        if gr2:
            return match.group(1),gr2
        else:
            return match.group(1), None
    return row['domain'],row['ds_name']

In [80]:
df_ds[['domain','ds_name']] = df_ds.apply(lambda row: pd.Series(setDomain(row)),axis=1)

In [81]:
df_ds.isna().value_counts()

title  code   last update of data  last table structure change  start_year  start_quater  end_year  end_quater  domain  ds_name
False  False  False                False                        False       True          False     True        False   False      6690
                                                                            False         False     False       False   False       723
                                                                            True          False     False       False   False       104
                                                                True        True          True      True        False   False        92
Name: count, dtype: int64

In [82]:
df_ds[df_ds[['start_year', 'start_quater', 'end_year', 'end_quater']].isna().all(axis=1)]

Unnamed: 0,title,code,last update of data,last table structure change,start_year,start_quater,end_year,end_quater,domain,ds_name
3932,Water resources: long-term annual average,TEN00001,2024-07-05 21:00:00+00:00,2024-07-05 21:00:00+00:00,,,,,TEN,00001
7168,Renewable freshwater resources - long term annual averages,ENV_WAT_LTAA,2024-07-05 21:00:00+00:00,2023-07-31 21:00:00+00:00,,,,,ENV,WAT_LTAA
7957,EU trade since 2017 by BEC/rev.5,DS-059329,2025-04-23 09:00:00+00:00,2025-04-23 09:00:00+00:00,,,,,DS,059329
7958,EU trade since 2002 by BEC/rev.4,DS-059328,2025-04-23 09:00:00+00:00,2025-04-23 09:00:00+00:00,,,,,DS,059328
7959,Total production,DS-056121,2025-02-06 10:00:00+00:00,2025-02-06 10:00:00+00:00,,,,,DS,056121
...,...,...,...,...,...,...,...,...,...,...
8052,Prices of letter mail and parcel services (USP under direct or indirect designation),POST_CUBE1_X$POST_PRI_1,2025-04-15 12:44:56+00:00,2025-04-15 12:44:56+00:00,,,,,POST,CUBE1_X$POST_PRI_1
8053,"Postal services falling under the universal service obligation (USP under direct or indirect designation, traffic)",POST_CUBE1_X$USO701,2025-04-15 12:44:56+00:00,2025-04-15 12:44:56+00:00,,,,,POST,CUBE1_X$USO701
8055,Number of enterprises providing postal services,POST_CUBE1_X$NUM701,2025-04-15 12:44:56+00:00,2025-04-15 12:44:56+00:00,,,,,POST,CUBE1_X$NUM701
8056,Access points (USP under direct or indirect designation ),POST_CUBE1_X$POST_ACC_1,2025-04-15 12:44:56+00:00,2025-04-15 12:44:56+00:00,,,,,POST,CUBE1_X$POST_ACC_1


Most of the datasets without any data about start and end are providing some summary analitical data, so they do not manupulate time period data. 

Start and end quater are available only if data set has any data period data.