In [1]:
import os
import pandas as pd
import datetime as dt

# Folder holding the exported query results (one CSV file per query)
folder_path = 'celex_queries'

# Read every CSV in the folder. File names are sorted because os.listdir()
# returns entries in arbitrary order, and the later de-duplication keeps the
# first occurrence of each CELEX number: a fixed read order makes that choice
# reproducible across machines and runs.
csv_frames = [
    pd.read_csv(os.path.join(folder_path, filename))
    for filename in sorted(os.listdir(folder_path))
    if filename.endswith('.csv')
]

# Concatenate once instead of growing the DataFrame inside a loop (which
# re-copies all accumulated rows on every iteration). Fall back to an empty
# DataFrame when the folder contains no CSV files, because pd.concat raises
# on an empty list.
combined_df = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

In [2]:
# Parse the document date; values that cannot be parsed become NaT
combined_df['Date of document'] = pd.to_datetime(combined_df['Date of document'], errors='coerce')

# Drop duplicate CELEX numbers, keeping the first occurrence
unique_df = combined_df.drop_duplicates(subset='CELEX number')

# Split on the year of the document date. Documents dated 2020 belong to the
# second subset (it is exported as 'searches_2020-2024.csv'); the previous
# strict `> 2020` comparison left every 2020 document out of both subsets.
# Rows whose date is NaT compare as False on both sides and stay in neither.
document_year = unique_df['Date of document'].dt.year
Pre2020 = unique_df[document_year < 2020]
After2020 = unique_df[document_year >= 2020]


In [36]:
# (rows, columns) of the de-duplicated table
unique_df.shape

(579, 18)

In [37]:
# Write the CELEX numbers of each period to its own CSV file (no index column)
for period_df, csv_name in (
    (Pre2020, 'searches_2000-2019.csv'),
    (After2020, 'searches_2020-2024.csv'),
):
    period_df['CELEX number'].to_csv(csv_name, index=False)

## Random Selection
This randomly selects up to 3 documents per year for each year from 2000 to 2024 (fewer when a year has fewer than 3 documents).

In [24]:
def stratified_sample_by_year(df, date_col='Date of document', n_samples=3, random_state=None):
    """
    Draw a random sample of up to ``n_samples`` rows for every calendar year.

    The input frame is not modified. Rows whose date cannot be parsed are
    discarded before sampling. A year with fewer than ``n_samples`` rows
    contributes all of its rows.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        date_col (str): The name of the column containing the date.
        n_samples (int): Maximum number of rows to draw per year.
        random_state (int, optional): Seed passed to ``DataFrame.sample`` for
            every year, for reproducibility.

    Returns:
        pd.DataFrame: The sampled rows, ordered by year, with the original
        index labels and the same columns as ``df`` (``date_col`` converted
        to datetime). Empty if no row has a valid date.
    """
    df = df.copy()

    # Parse the dates (invalid values become NaT) and drop rows without one
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    df = df.dropna(subset=[date_col])

    # Group on an external array of years instead of a helper column: nothing
    # has to be added to (or stripped from) the frame, an existing 'year'
    # column is left untouched, and the version-specific `include_groups`
    # argument of groupby.apply is not needed.
    years = df[date_col].dt.year.to_numpy()

    # Groups are iterated in ascending year order (groupby sorts keys by default)
    yearly_samples = [
        group.sample(n=min(n_samples, len(group)), random_state=random_state)
        for _, group in df.groupby(years)
    ]

    # pd.concat raises on an empty list, so return an empty frame with the
    # same columns when there was nothing to sample from
    if not yearly_samples:
        return df.iloc[0:0]

    return pd.concat(yearly_samples)

# Usage: draw up to 3 documents per year, with a fixed seed so the sample is reproducible
sampled_df = stratified_sample_by_year(unique_df, date_col='Date of document', n_samples=3, random_state=42)

sampled_df.shape


(75, 18)

In [25]:

# Build the document identifier: the CELEX number as text, prefixed with 'EU_'
celex_as_text = sampled_df['CELEX number'].astype(str)
sampled_df['Document'] = 'EU_' + celex_as_text

In [27]:
# Save the document identifier and date of every sampled row (no index column)
sample_export = sampled_df[['Document', 'Date of document']]
sample_export.to_csv('random_sample.csv', index=False)


# The following queries have fewer than 100 results
**This means you can run them without splitting the queries; otherwise you need to wait a couple of days to get the information.**

3. Hydro 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2024 AND 
(TE ~("hydro-electric" OR hydroelectric OR (hydro NEAR10 dam) OR "water-power" OR waterpower OR 
hydropower OR (pumped NEAR10 hydro) OR (pumped NEAR10 storage) OR "hydro-energy" OR 
(hydro NEAR10 energy) OR (hydro NEAR10 generation) OR (hydro NEAR10 capacity) OR (hydro 
NEAR10 penetration) OR (hydro NEAR10 share) OR (hydro NEAR10 plant) OR (hydro NEAR10 
project) OR ("small-scale" NEAR10 hydro))) 
 
4. Wind 


DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2024 AND 
(TE ~(wind NEAR10 energy) OR (wind NEAR10 power) OR (wind NEAR10 electricity) OR (wind 
NEAR10 generation) OR (wind NEAR10 capacity) OR (wind NEAR10 penetration) OR (wind NEAR10 
share) OR (wind NEAR10  plant) OR (wind NEAR10 system) OR (offshore NEAR10 wind) OR (onshore 
NEAR10 wind) OR (wind NEAR10 turbine) OR (wind NEAR10 mill)) 
 
5. Solar 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2024 AND 
(TE ~((solar NEAR10 energy) OR (solar NEAR10 cell) OR (solar NEAR10 panel) OR photovoltaic* OR 
(solar NEAR10 PV) OR (PV NEAR10 system) OR (photovoltaic NEAR10 project) OR (solar NEAR10 
power) OR (solar NEAR10 electricity) OR (solar NEAR10 generation) OR (solar NEAR10 capacity) OR 
(solar NEAR10 share) OR (solar NEAR10 plant) OR (open-field NEAR10 PV) OR (rooftop NEAR10 
photovoltaic) OR (utility NEAR10 photovoltaic) OR (building-integrated NEAR10 photovoltaic) OR (solar 
NEAR10 collector) OR (solar NEAR10 heat) OR (solar NEAR10 cool) OR (solar NEAR10 light) OR 
(solar NEAR10 market) OR (solar NEAR10 project) OR (photovoltaic NEAR10 system) OR (PV 
NEAR10 array) OR (solar NEAR10 array) OR (photovoltaic NEAR10 cell) OR (PV NEAR10 cell))) 
 
6. Fuel cell (hydrogen) 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2024 AND 
(TE ~(hydrogen NEAR10 fuel) OR (hydrogen NEAR10 energy) OR (fuel NEAR10 cell) OR (biomass 
NEAR10 gasification) OR electrolysis OR electrofuel) 

11. Heating and cooling 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2024 AND 
(TE ~("Solar air condition" OR (waste NEAR10 heat) OR (heat NEAR10 recovery) OR (Ocean NEAR10 
Thermal NEAR10 Energy NEAR10 Conversion) OR OTEC OR (combined NEAR10 heat NEAR10 
power))) 

# The following queries have more than 100 results
**Each query below is therefore split into several date ranges (`DD`) that are run separately.**

1. Renewables 

TE~ (renewable NEAR10 energy) OR (alternative NEAR10 energy) OR (low-carbon NEAR10 energy) 
OR (non-fossil NEAR10 energy) OR (sustainable NEAR10 energy) OR (clean NEAR10 energy) OR 
(green NEAR10 energy) OR (low-carbon NEAR10 energy) OR (renewable NEAR10 electricity) OR 
(alternative NEAR10 electricity) OR (low-carbon NEAR10 electricity) OR (energy NEAR10 efficiency) 
OR (energy NEAR10 innovation) OR (energy NEAR10 technology) OR (energy NEAR10 efficiency) OR 
(renewable NEAR10 resources) OR (decarbonisation) OR (energy NEAR10 act)  AND DTS_SUBDOM 
= LEGISLATION AND FM ~("Regulation" OR "Directive") AND AU_CODED = EP NOT 
FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2010

TE~ (renewable NEAR10 energy) OR (alternative NEAR10 energy) OR (low-carbon NEAR10 energy) 
OR (non-fossil NEAR10 energy) OR (sustainable NEAR10 energy) OR (clean NEAR10 energy) OR 
(green NEAR10 energy) OR (low-carbon NEAR10 energy) OR (renewable NEAR10 electricity) OR 
(alternative NEAR10 electricity) OR (low-carbon NEAR10 electricity) OR (energy NEAR10 efficiency) 
OR (energy NEAR10 innovation) OR (energy NEAR10 technology) OR (energy NEAR10 efficiency) OR 
(renewable NEAR10 resources) OR (decarbonisation) OR (energy NEAR10 act)  AND DTS_SUBDOM 
= LEGISLATION AND FM ~("Regulation" OR "Directive") AND AU_CODED = EP NOT 
FM_CODED = CORRIGENDUM AND DD >= 01/01/2011 <= 31/12/2018

TE~ (renewable NEAR10 energy) OR (alternative NEAR10 energy) OR (low-carbon NEAR10 energy) 
OR (non-fossil NEAR10 energy) OR (sustainable NEAR10 energy) OR (clean NEAR10 energy) OR 
(green NEAR10 energy) OR (low-carbon NEAR10 energy) OR (renewable NEAR10 electricity) OR 
(alternative NEAR10 electricity) OR (low-carbon NEAR10 electricity) OR (energy NEAR10 efficiency) 
OR (energy NEAR10 innovation) OR (energy NEAR10 technology) OR (energy NEAR10 efficiency) OR 
(renewable NEAR10 resources) OR (decarbonisation) OR (energy NEAR10 act)  AND DTS_SUBDOM 
= LEGISLATION AND FM ~("Regulation" OR "Directive") AND AU_CODED = EP NOT 
FM_CODED = CORRIGENDUM AND DD >= 01/01/2019 <= 31/12/2024
 
2. Climate 

(TE~ (climate NEAR10 change) OR (climate NEAR10 commitment) OR (climate NEAR10 justice) OR 
(climate NEAR10 legislation) OR (greenhouse NEAR10 warming) OR (greenhouse NEAR10 gas) OR 
(greenhouse NEAR10 effect) OR (global NEAR10 warming) OR (carbon NEAR10 tax) OR (carbon 
NEAR10 footprint) OR eco-efficiency OR “kyoto protocol” OR methane OR “nitrous oxide” OR “sea level 
rise”) AND DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") 
AND AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2010

(TE~ (climate NEAR10 change) OR (climate NEAR10 commitment) OR (climate NEAR10 justice) OR 
(climate NEAR10 legislation) OR (greenhouse NEAR10 warming) OR (greenhouse NEAR10 gas) OR 
(greenhouse NEAR10 effect) OR (global NEAR10 warming) OR (carbon NEAR10 tax) OR (carbon 
NEAR10 footprint) OR eco-efficiency OR “kyoto protocol” OR methane OR “nitrous oxide” OR “sea level 
rise”) AND DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") 
AND AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2011 <= 31/12/2015

(TE~ (climate NEAR10 change) OR (climate NEAR10 commitment) OR (climate NEAR10 justice) OR 
(climate NEAR10 legislation) OR (greenhouse NEAR10 warming) OR (greenhouse NEAR10 gas) OR 
(greenhouse NEAR10 effect) OR (global NEAR10 warming) OR (carbon NEAR10 tax) OR (carbon 
NEAR10 footprint) OR eco-efficiency OR “kyoto protocol” OR methane OR “nitrous oxide” OR “sea level 
rise”) AND DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") 
AND AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2016 <= 31/12/2021

(TE~ (climate NEAR10 change) OR (climate NEAR10 commitment) OR (climate NEAR10 justice) OR 
(climate NEAR10 legislation) OR (greenhouse NEAR10 warming) OR (greenhouse NEAR10 gas) OR 
(greenhouse NEAR10 effect) OR (global NEAR10 warming) OR (carbon NEAR10 tax) OR (carbon 
NEAR10 footprint) OR eco-efficiency OR “kyoto protocol” OR methane OR “nitrous oxide” OR “sea level 
rise”) AND DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") 
AND AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2022 <= 31/12/2024


7. Distributed generation 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2019 AND 
(TE ~((distributed NEAR10 generation) OR (distributed NEAR10 solar) OR "net-metering" OR "feed-in-
tariff")) 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2020 <= 31/12/2024 AND 
(TE ~((distributed NEAR10 generation) OR (distributed NEAR10 solar) OR "net-metering" OR "feed-in-
tariff")) 

 
8. Batteries and storage 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2009 AND 
(TE ~lithium-ion  OR flywheel OR (electric NEAR10 charge) OR (charging NEAR10 station) OR 
(geothermal NEAR10 energy) OR (geothermal NEAR10 heating) OR (heat NEAR10 pumps) OR (direct 
NEAR10 air NEAR10 capture) OR (charging NEAR10 infrastructure) OR (electricity NEAR10 storage) 
OR (energy NEAR10 storage) OR (storage NEAR10 renewable) OR (storage NEAR10 renewable) OR 
(storage NEAR10 grid) OR (pumped NEAR10 hydro) OR (lead NEAR10 acid) OR (flow NEAR10 
battery) OR (thermal NEAR10 storage) OR (battery NEAR10 storage) OR (heat NEAR10 storage) OR 
(cold NEAR10 storage) OR (pumped NEAR10 storage  NEAR10 projects) OR battery) 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2010 <= 31/12/2020 AND 
(TE ~lithium-ion  OR flywheel OR (electric NEAR10 charge) OR (charging NEAR10 station) OR 
(geothermal NEAR10 energy) OR (geothermal NEAR10 heating) OR (heat NEAR10 pumps) OR (direct 
NEAR10 air NEAR10 capture) OR (charging NEAR10 infrastructure) OR (electricity NEAR10 storage) 
OR (energy NEAR10 storage) OR (storage NEAR10 renewable) OR (storage NEAR10 renewable) OR 
(storage NEAR10 grid) OR (pumped NEAR10 hydro) OR (lead NEAR10 acid) OR (flow NEAR10 
battery) OR (thermal NEAR10 storage) OR (battery NEAR10 storage) OR (heat NEAR10 storage) OR 
(cold NEAR10 storage) OR (pumped NEAR10 storage  NEAR10 projects) OR battery) 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2021 <= 31/12/2024 AND 
(TE ~lithium-ion  OR flywheel OR (electric NEAR10 charge) OR (charging NEAR10 station) OR 
(geothermal NEAR10 energy) OR (geothermal NEAR10 heating) OR (heat NEAR10 pumps) OR (direct 
NEAR10 air NEAR10 capture) OR (charging NEAR10 infrastructure) OR (electricity NEAR10 storage) 
OR (energy NEAR10 storage) OR (storage NEAR10 renewable) OR (storage NEAR10 renewable) OR 
(storage NEAR10 grid) OR (pumped NEAR10 hydro) OR (lead NEAR10 acid) OR (flow NEAR10 
battery) OR (thermal NEAR10 storage) OR (battery NEAR10 storage) OR (heat NEAR10 storage) OR 
(cold NEAR10 storage) OR (pumped NEAR10 storage  NEAR10 projects) OR battery) 

9. Electric vehicles 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2015 AND 
(TE ~ (electric NEAR10 vehicle) OR (electric NEAR10  bike) OR (electric NEAR10 bicycle) OR (electric 
NEAR10 scooter) OR (electric NEAR10 motorbike) OR (electric NEAR10 motorcycle) OR (electric 
NEAR10 car) OR (electric NEAR10 sedan) OR (electric NEAR10 SUV) OR (electric NEAR10 pick-up) 
OR (electric NEAR10 truck) OR (electric NEAR10 semi) OR (electric NEAR10 van) OR (electric 
NEAR10 bus) OR (electric NEAR10 autobus) OR (electric NEAR10 drive-train) OR (electric NEAR10 
engine) OR (electrified NEAR10 vehicle) OR (electrified NEAR10 bike) OR (electrified NEAR10 bicycle) 
OR (electrified NEAR10 scooter) OR (electrified NEAR10 motorbike) OR (electrified NEAR10 
motorcycle) OR (electrified NEAR10 car) OR (electrified NEAR10 sedan) OR (electrified NEAR10 SUV) 
OR (electrified NEAR10 pick-up) OR (electrified NEAR10 truck) OR (vehicle NEAR10 electrification) OR 
(vehicle-to-grid NEAR10 algorithms) OR (vehicle-to-grid NEAR10 storage) OR (grid to vehicle) OR 
(hybrid NEAR10 vehicle) OR (hybrid NEAR10 car) OR (hybrid NEAR10 SUV) OR (hybrid NEAR10 bus) 
OR (hybrid NEAR10 pick-up) OR (hybrid NEAR10 truck) OR (hybrid NEAR10 electric NEAR10 drive)) 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2016 <= 31/12/2024 AND 
(TE ~ (electric NEAR10 vehicle) OR (electric NEAR10  bike) OR (electric NEAR10 bicycle) OR (electric 
NEAR10 scooter) OR (electric NEAR10 motorbike) OR (electric NEAR10 motorcycle) OR (electric 
NEAR10 car) OR (electric NEAR10 sedan) OR (electric NEAR10 SUV) OR (electric NEAR10 pick-up) 
OR (electric NEAR10 truck) OR (electric NEAR10 semi) OR (electric NEAR10 van) OR (electric 
NEAR10 bus) OR (electric NEAR10 autobus) OR (electric NEAR10 drive-train) OR (electric NEAR10 
engine) OR (electrified NEAR10 vehicle) OR (electrified NEAR10 bike) OR (electrified NEAR10 bicycle) 
OR (electrified NEAR10 scooter) OR (electrified NEAR10 motorbike) OR (electrified NEAR10 
motorcycle) OR (electrified NEAR10 car) OR (electrified NEAR10 sedan) OR (electrified NEAR10 SUV) 
OR (electrified NEAR10 pick-up) OR (electrified NEAR10 truck) OR (vehicle NEAR10 electrification) OR 
(vehicle-to-grid NEAR10 algorithms) OR (vehicle-to-grid NEAR10 storage) OR (grid to vehicle) OR 
(hybrid NEAR10 vehicle) OR (hybrid NEAR10 car) OR (hybrid NEAR10 SUV) OR (hybrid NEAR10 bus) 
OR (hybrid NEAR10 pick-up) OR (hybrid NEAR10 truck) OR (hybrid NEAR10 electric NEAR10 drive)) 


10. Biothermal bioenergy 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2000 <= 31/12/2019 AND 
(TE ~(biofuel* OR bioenergy OR (cellulosic NEAR10 ethanol) OR (cellulosic NEAR10 technology) OR 
biogas OR biodiesel OR (energy NEAR10 crop) OR (anaerobic NEAR10 digester) OR (landfill NEAR10 
gas) OR (wood NEAR10 waste) OR (agriculture NEAR10 waste) OR (agricultural NEAR10 waste) OR 
(ethanol NEAR10 fuel) OR (ethanol NEAR10 gasolines) OR (corn NEAR10 ethanol) OR (sugar NEAR10 
ethanol) OR (forest NEAR10 biomass) OR (cellulosic NEAR10 biomass) OR (waste NEAR10 biomass))) 

DTS_SUBDOM = LEGISLATION AND FM ~("Regulation" OR "Directive") AND 
AU_CODED = EP NOT FM_CODED = CORRIGENDUM AND DD >= 01/01/2020 <= 31/12/2024 AND 
(TE ~(biofuel* OR bioenergy OR (cellulosic NEAR10 ethanol) OR (cellulosic NEAR10 technology) OR 
biogas OR biodiesel OR (energy NEAR10 crop) OR (anaerobic NEAR10 digester) OR (landfill NEAR10 
gas) OR (wood NEAR10 waste) OR (agriculture NEAR10 waste) OR (agricultural NEAR10 waste) OR 
(ethanol NEAR10 fuel) OR (ethanol NEAR10 gasolines) OR (corn NEAR10 ethanol) OR (sugar NEAR10 
ethanol) OR (forest NEAR10 biomass) OR (cellulosic NEAR10 biomass) OR (waste NEAR10 biomass))) 


 
