# Extract data

# Setup

In [None]:
# Report the version of the interpreter running this notebook
import sys

print(sys.version)

By using torch, we get a flexible and efficient framework specialized in building and training deep learning models.

In [1]:
# Torch config
import torch
from torch import bfloat16, cuda, float16

# Switch off the flash and memory-efficient scaled-dot-product-attention kernels.
# NOTE(review): presumably done so attention falls back to the default math
# implementation on this hardware — confirm the reason with the author.
sdp_backends = torch.backends.cuda
sdp_backends.enable_flash_sdp(False)
sdp_backends.enable_mem_efficient_sdp(False)

We set up our environment

In [2]:
# Environment Variables
import os
from dotenv import load_dotenv

# Load env: read variables from a local .env file into the process environment
# (python-dotenv). The cell output shows the call's return value.
load_dotenv()

True

# Digital Ocean

With Digital Ocean, which will be our computational framework, we will be able to perform RAG (retrieval-augmented generation) with the LLMs, since a single computer is not enough for the work we need to do.

In [3]:
# Digital Ocean managed-database connection settings.
import warnings

# Non-secret connection parameters. DO_HOST holds the cluster hostname and
# DO_PORT the port the cluster listens on.
os.environ["DO_USERNAME"] = "doadmin"
os.environ["DO_HOST"] = "law-gpt-postgresql-cluster-do-user-15523133-0.c.db.ondigitalocean.com"
os.environ["DO_PORT"] = "25060"
os.environ["DO_DATABASE"] = "document_keys"
os.environ["DO_SSL_MODE"] = "require"

# The password is a secret and is deliberately never hard-coded in the
# notebook: it has to be supplied by the environment (for example through the
# .env file loaded earlier in this notebook). Warn instead of failing so that
# cells which do not need the database keep working.
if not os.environ.get("DO_PASSWORD"):
    warnings.warn(
        "DO_PASSWORD is not set; define it in the environment or in the .env "
        "file before connecting to the database.",
        stacklevel=2,
    )

# Parameters

We load the needed parameters from a yaml file.

In [None]:
# Load parameters from YAML file
import yaml  # PyYAML; imported here because no earlier cell in this notebook brings it into scope

# safe_load only builds plain Python objects; the explicit encoding makes the
# file parse the same way on every platform.
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Full extraction param: when truthy every BOE ID is processed, otherwise the
# IDs are filtered with config["key_words"] later in the notebook.
full_extraction = config["full_extraction"]

# Show
full_extraction

# Directory

In [None]:
# Anchor the working directory to the folder the kernel is running in
import sys
from pathlib import Path

# Absolute path of the current directory, kept as a Path object
notebook_location = Path.cwd()
os.chdir(notebook_location)

# Keep the working directory as a plain string and display it
current_directory = os.getcwd()
current_directory

# Libraries

In [None]:
# General libraries
import csv
import datetime
import gc
import json
import logging
import os
import re
import string
import time
import unicodedata
import xml
from concurrent.futures import ThreadPoolExecutor
from functools import partial

# Multiprocessing
from multiprocessing import Lock, Manager, Pool, Process, Queue, Value

import bs4
import lxml
import numpy as np
import pandas as pd
import requests
import unidecode

# Optimization
import xformers

# Scrapping
from bs4 import BeautifulSoup

# Local
from functions import *

# Other
from tqdm.notebook import tqdm

In [None]:
# Warnings
import warnings

# Install a blanket "ignore" filter: no message, category or module is given,
# so every warning raised after this cell is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — confirm that silencing everything is intended.
warnings.filterwarnings("ignore")

In [None]:
# Start timing the notebook run: wall-clock timestamp in seconds, subtracted
# from the end timestamp in the Runtime section at the bottom of the notebook
start_time = time.time()

We can try to select documents from the 'Codigo Penal' only.

In [None]:
# Default keyword list; it is replaced by config["key_words"] in the keyword
# filtering step whenever full_extraction is falsy
key_words = ["codigo penal"]

# BOE: Boletín Oficial del Estado

We create a set of dates which we will use for the name combinations of files to download.

In [10]:
# Parameters
# First day of the extraction window
start_date = datetime.date(1978, 1, 1)

# Define the end date to extract (today, so each run picks up new publications)
end_date = datetime.date.today()

# BOE Class
boe_class = ["A", "C"]

# Years to extract: every year from start_date to end_date inclusive, so the
# list always matches the date window instead of stopping at a fixed year
years = [str(year) for year in range(start_date.year, end_date.year + 1)]

# Path to store full extraction
folder_path = "raw_data/boe_year/"

# Multiprocessing: a single worker is used; os.cpu_count() - 2 is the
# alternative when several cores should be dedicated to the extraction.
# NOTE(review): pool is not referenced again in the visible notebook and is
# never closed — confirm whether it is still needed.
n_cores = 1
pool = Pool(processes=n_cores)

# 1. Find local files

In [11]:
# Params
# NOTE(review): file_list is not referenced again in the visible notebook — confirm it is still needed
file_list = ["boe_ids.csv"]
# Folder that is scanned for CSV files
local_folder_path = "raw_data/"

# List CSV files locally. list_csv_files is not defined in this notebook;
# presumably it comes from `from functions import *` — verify there.
local_csv_files = list_csv_files(local_folder_path)
local_csv_files

['boe_ids.csv']

# 2. Identify existing URLs from daily summaries

By using the 'extract_boe_ids' function, we try all the name combinations for the set of dates we have and download all the matching files from the BOE.

In [12]:
# BOE IDs filename: CSV path handed to extract_boe_ids in the next cell
boe_ids_filename = 'raw_data/boe_ids.csv'

In [None]:
# Update BOE IDs for the window start_date..end_date and the classes in boe_class.
# NOTE(review): presumably the function also reads/updates the CSV at
# boe_ids_filename — confirm in functions.extract_boe_ids.
boe_ids = extract_boe_ids(start_date, end_date, boe_class, boe_ids_filename)

Processing days:   0%|          | 0/14437 [00:00<?, ?it/s]

In [None]:
# Show the first rows of boe_ids
boe_ids.head()

In [None]:
# Number of entries in boe_ids
len(boe_ids)

# 3. Identify relevant keywords from extracted URLs and titles

In [None]:
# Decide which BOE IDs continue to the download step
if not full_extraction:
    # Partial extraction: the keywords come from the YAML config
    key_words = config["key_words"]
    # BOE filtered IDs filename
    boe_filtered_ids_name = "raw_data/boe_filtered_ids.csv"
    # Keep only the entries selected by find_titles_with_keywords
    boe_filtered_ids = find_titles_with_keywords(boe_ids, key_words)
else:
    # Full extraction: work on an independent copy of every ID
    boe_filtered_ids = boe_ids.copy()

In [None]:
# Show the first rows of the IDs selected for extraction
boe_filtered_ids.head()

In [None]:
# Number of entries selected for extraction
len(boe_filtered_ids)

# 4. Extract publications from identified URLs

In [None]:
# Call the function: run the publication extraction for the selected IDs,
# passing the output folder and the list of years.
# NOTE(review): presumably one CSV per year is written under folder_path —
# confirm in functions.extract_boe_year.
extract_boe_year(folder_path, boe_filtered_ids, years)

# 5. Append results

In [None]:
# List CSV files (sorted, because os.listdir returns names in arbitrary order
# and the concatenation should be reproducible across runs)
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# Read every CSV first and concatenate once: calling pd.concat inside the loop
# re-copies all previously accumulated rows for each additional file
frames = [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files]

# pd.concat raises ValueError on an empty list, so keep the empty-DataFrame
# result when the folder contains no CSV files
boe_concat = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Show the first rows of the concatenated extraction
boe_concat.head()

In [None]:
# Number of rows in the concatenated extraction
len(boe_concat)

# 6. Format results

In [None]:
# BOE format filename: destination CSV for the formatted table saved below
boe_format_name = 'raw_data/boe_data.csv'

In [None]:
# Input for formatting (copy so boe_concat stays untouched)
boe_format = boe_concat.copy()

# Create date column: the incoming "id" column is parsed as a YYYYMMDD date
boe_format["date"] = pd.to_datetime(boe_format["id"], format="%Y%m%d")

# Drop the date-like id; a new id is derived from the URL below
boe_format = boe_format.drop(columns=["id"])

# Regular expression to capture the year and the ID
regex_pattern = r"BOE-[ABC]-(\d{4})-(\d+)"

# Extract the year and ID and join them as "<year>-<number>". str.cat is
# vectorised and leaves a missing value for URLs that do not match the
# pattern, whereas a row-wise "-".join raises TypeError on such rows.
id_parts = boe_format["url"].str.extract(regex_pattern)
boe_format["id"] = id_parts[0].str.cat(id_parts[1], sep="-")

# Filter columns (this also fixes the column order of the output file)
cols = [
    "id",
    "url",
    "title",
    "date",
    "legislative_origin",
    "department",
    "rang",
    "text",
]
boe_format = boe_format[cols]

# Sort by date, oldest first
boe_format = boe_format.sort_values(by="date", ascending=True)

In [None]:
# Show the first rows of the formatted table
boe_format.head()

In [None]:
# Number of rows in the formatted table
len(boe_format)

In [None]:
# Save output: write the formatted table to boe_format_name without the index column
boe_format.to_csv(boe_format_name, index=False)

# Clean Memory

In [None]:
# Clean memory: force a full garbage-collection pass; the cell displays the
# value returned by gc.collect()
gc.collect()

# Runtime

In [None]:
# Stop the clock that was started near the top of the notebook
end_time = time.time()
elapsed_time = end_time - start_time

# Split the elapsed seconds into whole hours and the leftover whole minutes
whole_hours, leftover_seconds = divmod(elapsed_time, 3600)
hours = int(whole_hours)
minutes = int(leftover_seconds // 60)

# Print the result
print(f"Time elapsed: {hours} hours and {minutes} minutes.")