# Extract data

# Setup

In [1]:
# Report the version of the interpreter running this notebook
import sys

print(sys.version)

3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]


We use torch, a flexible and efficient framework specialized in building and training deep learning models.

In [2]:
# Torch config
import torch
from torch import bfloat16, cuda, float16

# Switch off the flash and the memory-efficient scaled-dot-product-attention kernels
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

We set up our environment by loading the environment variables.

In [3]:
# Environment Variables
# Requires the third-party python-dotenv package. The recorded run of this cell
# stopped with "ModuleNotFoundError: No module named 'dotenv'", so the package
# must be installed in the kernel's environment before running.
import os
from dotenv import load_dotenv

# Load env (load_dotenv reads key=value pairs from a .env file into os.environ)
load_dotenv()

ModuleNotFoundError: No module named 'dotenv'

# Parameters

We load the needed parameters from a yaml file.

In [None]:
# Load parameters from YAML file.
# PyYAML is imported here because no earlier cell brings `yaml` into scope, so a
# fresh "Restart & Run All" would otherwise stop on this cell with a NameError.
import yaml

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding (the recorded kernel runs on Windows).
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Full extraction param: its truthiness later decides between keeping every
# identified entry and filtering entries by keyword.
full_extraction = config["full_extraction"]

# Show
full_extraction

We set the number of CPUs we will use.

In [None]:
from multiprocessing import cpu_count

# Cores deliberately left free for the operating system and other processes
reserved_cpus = 4

# Never go below one worker: on a machine with four or fewer cores the plain
# subtraction would produce zero or a negative worker count.
num_cpus = max(1, cpu_count() - reserved_cpus)

# Print
num_cpus

# Directory

In [None]:
# Pin the working directory to the folder the kernel is currently running in
from pathlib import Path

notebook_location = Path(os.getcwd())
os.chdir(notebook_location)

# Report the working directory now in effect
current_directory = os.getcwd()
current_directory

# Libraries

Let's now import all the libraries that will be needed during this notebook.

In [None]:
# General libraries

# Time data management libraries
import datetime
import time

# Garbage Collecting library: automatically freeing up memory occupied by objects that are no longer in use by the program
import gc

# Local assets: Utils contains functions we will later use
from utils import *

In [None]:
# Warnings: We will ignore them
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Start timing the notebook run: wall-clock timestamp in seconds, subtracted
# from a second reading in the Runtime section to get the elapsed time.
start_time = time.time()

# BOE: Boletín Oficial del Estado

We create a set of dates which we will use for the name combinations of files to download.

In [None]:
# Parameters
# First date to extract
start_date = datetime.date(1978, 1, 1)

# Define the end date to extract (today, at the moment this cell runs)
end_date = datetime.datetime.now().date()

# BOE Class
boe_class = ["A", "C"]

# Years to extract: every calendar year from start_date through end_date, inclusive.
# Deriving the list from the two dates keeps it aligned with the date range above,
# whereas a fixed upper bound silently leaves out every year after it.
years = [str(year) for year in range(start_date.year, end_date.year + 1)]

# Path to store full extraction
folder_path = "raw_data/boe_year/"

# 1. Find local files

In [None]:
# Params
# NOTE(review): file_list is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
file_list = ["boe_ids.csv"]
local_folder_path = "raw_data/"

# List CSV files locally. list_csv_files is not defined in this notebook
# (presumably provided by `from utils import *`); the recorded output shows it
# returning a list of file names.
local_csv_files = list_csv_files(local_folder_path)
local_csv_files

['boe_ids.csv']

# 2. Identify existing URLs from daily summaries

Using the `extract_boe_ids` function, we try every name combination for the set of dates we have and download all the matching files from the BOE.

In [None]:
# BOE IDs filename: CSV path handed to extract_boe_ids in the next cell
boe_ids_filename = 'raw_data/boe_ids.csv'

In [None]:
# Update BOE IDs
# extract_boe_ids is not defined in this notebook (presumably provided by
# `from utils import *`). It receives the date range, the BOE classes and the
# CSV path set in earlier cells.
# NOTE(review): whether it reads and/or rewrites that CSV is not visible here — confirm in utils.
boe_ids = extract_boe_ids(start_date, end_date, boe_class, boe_ids_filename)

Processing days:   0%|          | 0/14437 [00:00<?, ?it/s]

In [None]:
# Show the first rows of the identified BOE entries
boe_ids.head()

In [None]:
# Number of identified BOE entries
len(boe_ids)

# 3. Identify relevant keywords from extracted URLs and titles

Let's first show the extraction parameter.

In [None]:
# Display the flag loaded from config.yaml that selects full vs. keyword-filtered extraction
full_extraction

In [None]:
# Decide which identified entries move on to the download step
if not full_extraction:
    # Partial extraction: keywords come from the loaded configuration
    key_words = config["key_words"]
    # BOE filtered IDs filename
    boe_filtered_ids_name = "raw_data/boe_filtered_ids.csv"
    # Keep what find_titles_with_keywords returns for those keywords
    boe_filtered_ids = find_titles_with_keywords(boe_ids, key_words)
else:
    # Full extraction: carry every identified entry forward, on an independent copy
    boe_filtered_ids = boe_ids.copy()

In [None]:
# Show the first rows of the entries selected for extraction
boe_filtered_ids.head()

In [None]:
# Number of entries selected for extraction (before removing missing titles)
len(boe_filtered_ids)

In [None]:
# Discard entries whose 'title' value is missing
boe_filtered_ids = boe_filtered_ids.dropna(subset=['title'])

In [None]:
# Number of entries left after removing those without a title
len(boe_filtered_ids)

# 4. Extract publications from identified URLs

Now, by using the `extract_boe_year` function, we extract all the available publications from the BOE in the selected range of years.

In [None]:
# Display the folder that is passed to extract_boe_year and later scanned for CSV files
folder_path

In [None]:
# Log the target folder and the number of worker CPUs before starting the extraction
print(f"Processing files in folder '{folder_path}' using {num_cpus} CPU cores.")

In [None]:
# Call the function
# Arguments: output folder, selected BOE entries, list of year strings, number of CPUs.
# The return value is not stored.
# NOTE(review): extract_boe_year is not defined in this notebook; presumably it
# writes CSV files into folder_path (the next section reads CSVs from there) — confirm in utils.
extract_boe_year(folder_path, boe_filtered_ids, years, num_cpus)

# 5. Append results

In [None]:
# List CSV files; sorted so the concatenation order does not depend on the
# arbitrary order in which os.listdir returns directory entries
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# Read every file first and concatenate once at the end: growing a DataFrame
# with pd.concat inside a loop re-copies all accumulated rows on every pass
csv_frames = [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files]

# pd.concat raises on an empty list, so fall back to an empty DataFrame when the
# folder holds no CSV files
boe_concat = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

The following dataframe shows how we will save the extracted data.

In [None]:
# Show the first rows of the concatenated extraction results
boe_concat.head()

In [None]:
# Number of rows across all concatenated CSV files
len(boe_concat)

# 6. Format results

As the data does not look exactly as we want, we now proceed to clean it.

In [None]:
# BOE format filename: CSV path the formatted data is written to at the end of this section
boe_format_name = 'raw_data/boe_data.csv'

In [None]:
# Input for formatting (work on a copy so boe_concat stays untouched)
boe_format = boe_concat.copy()

# Create date column by parsing the incoming "id" values as YYYYMMDD
boe_format["date"] = pd.to_datetime(boe_format["id"], format="%Y%m%d")

# Drop the date-like "id" column; a new "id" is rebuilt from the URL below
boe_format = boe_format.drop(["id"], axis=1)

# Regular expression to capture the year and the ID
regex_pattern = r"BOE-[ABC]-(\d{4})-(\d+)"

# Extract the year and ID and concatenate them into a new column.
# The two captured groups are joined with a vectorised string operation: a URL
# that does not match the pattern gets a missing id instead of aborting the
# whole cell with a TypeError raised from a row-wise "-".join over NaN values.
id_parts = boe_format["url"].str.extract(regex_pattern)
boe_format["id"] = id_parts[0].str.cat(id_parts[1], sep="-")

# Filter columns (this also fixes the column order of the saved file)
cols = [
    "id",
    "url",
    "title",
    "date",
    "legislative_origin",
    "department",
    "rang",
    "text",
]
boe_format = boe_format[cols]

# Sort by date
boe_format = boe_format.sort_values(by="date", ascending=True)

Now, the cleansed dataframe looks like this:

In [None]:
# Show the first rows of the formatted, date-sorted data
boe_format.head()

In [None]:
# Number of rows in the formatted data
len(boe_format)

In [None]:
# Save output to the path held in boe_format_name, without writing the index as a column
boe_format.to_csv(boe_format_name, index=False)

# Clean Memory

In [None]:
# Clean memory: force a garbage-collection pass. As the last expression of the
# cell, the value returned by gc.collect() is displayed as the cell output.
gc.collect()

# Runtime

In [None]:
# Stop the clock and measure the whole run in seconds
end_time = time.time()
elapsed_time = end_time - start_time

# Break the elapsed seconds into whole hours and whole leftover minutes
whole_hours, leftover_seconds = divmod(elapsed_time, 3600)
hours = int(whole_hours)
minutes = int(leftover_seconds // 60)

# Print the result
print(f"Time elapsed: {hours} hours and {minutes} minutes.")