# Transform data

# Setup

In [1]:
# %pip install -r requirements.txt

In [2]:
# Environment Variables
# Later cells read settings from os.environ (e.g. HF_KEY), so the .env loader
# is run before anything else.
import os
from dotenv import load_dotenv

# Load env
# Left as the cell's last expression, so the cell displays its return value.
load_dotenv()

True

# Google

In [3]:
# Set env path
# setdefault keeps a credentials path already supplied by the environment or the
# .env file and only falls back to the repository-local key file when nothing is
# configured (the previous assignment always overwrote the configured value).
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "lawgpt-410122-140fba6fba7b.json")

# Bucket params
# Prefer GOOGLE_BUCKET_NAME when it is set and non-empty; otherwise use the default bucket.
bucket_name = os.environ.get('GOOGLE_BUCKET_NAME') or 'lawgpt_madrid_bucket_1'

# Reference

https://medium.com/@ayhamboucher/llm-based-context-splitter-for-large-documents-445d3f02b01b

https://www.pinecone.io/learn/chunking-strategies/

# Directory

In [4]:
# Pin the working directory to the absolute path the kernel is currently running in
from pathlib import Path
import sys

notebook_location = Path.cwd()
os.chdir(notebook_location)

# Echo the resulting working directory as the cell output
current_directory = os.getcwd()
current_directory

'/Users/ignasipascual/GitHub/LawGPT'

# Libraries

In [3]:
# General libraries
import pandas as pd
import numpy as np
import time
import yaml
import csv
import sys
import gc
import os

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from nltk.corpus import stopwords
import torch
import spacy
import nltk

# Summarizer
from transformers import BertTokenizerFast, EncoderDecoderModel

# Cloud
from google.cloud import storage

# Other
from tqdm.notebook import tqdm

# Local
# Temporarily add the parent directory (relative to the current working
# directory) to sys.path so `functions` can be imported, then put the original
# search path back.
original_sys_path = sys.path.copy()
sys.path.append('../')
# NOTE(review): star import — helpers used later without a visible definition
# (e.g. list_csv_files) presumably come from here; confirm against functions.py.
from functions import * 
sys.path = original_sys_path

# Warnings
# Suppresses every warning for the rest of the run, including deprecation notices.
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Start timing the notebook run
# The final cell subtracts this timestamp from the end time to report the elapsed time.
start_time = time.time()

# Platform login

In [7]:
# HF Key
# Read HF_KEY from the environment; the result is None when the variable is not set.
# NOTE(review): presumably the Hugging Face access token — confirm.
hf_key = os.environ.get('HF_KEY')
# Keep the print disabled so the key is not written into the notebook output.
# print(hf_key)

In [8]:
# Jupyter / Colab
# notebook_login()

# VS Code
# Run huggingface-cli login in console

In [5]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU
backend_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(backend_name)
print('Using device:', device)
print()

# On CUDA, report the name of device 0 and its memory figures scaled by 1024**3
if device.type == 'cuda':
    bytes_per_unit = 1024**3
    allocated = round(torch.cuda.memory_allocated(0) / bytes_per_unit, 1)
    reserved = round(torch.cuda.memory_reserved(0) / bytes_per_unit, 1)
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', allocated, 'GB')
    print('Cached:   ', reserved, 'GB')

Using device: cpu



In [10]:
# Clean memory
# Empty PyTorch's CUDA cache, then run Python's garbage collector.
# gc.collect() is the last expression, so the cell displays its return value.
torch.cuda.empty_cache()
gc.collect()

60

# Load data

In [8]:
# Params
# File names the loading cell below will look for, and the folder they are read from.
file_list = ["boe_data.csv"]
local_folder_path = "../raw_data/"

# List CSV files locally
# NOTE(review): list_csv_files is not defined in this notebook — presumably it is
# provided by `from functions import *`; confirm its return type (the loading
# cell tests membership with `file_name in local_csv_files`).
local_csv_files = list_csv_files(local_folder_path)
local_csv_files

['boe_ids.csv', 'boe_filtered_ids.csv', 'boe_data.csv']

In [12]:
# Load every file in file_list that is available locally and combine them into df_txt.
# Frames are collected in a list and concatenated once at the end, instead of
# growing a DataFrame inside the loop.
loaded_frames = []

# Loop by files
for file_name in file_list:
    try:
        if file_name not in local_csv_files:
            # The GCS fallback below is still disabled, so there is nothing to load
            # for this file. Skip it explicitly: falling through would either hit an
            # undefined `df` or re-append the previous file's data a second time.
            # source_blob_name = f'raw_data/{file_name}'
            # destination_file_name = os.path.join(local_folder_path, file_name)
            # download_from_gcs(os.environ.get('GOOGLE_BUCKET_NAME'), source_blob_name, destination_file_name)
            # df = pd.read_csv(destination_file_name)
            print(f"Skipping {file_name}: not found locally and the GCS download is disabled.")
            continue

        # File found locally: load it and normalise the id column to strings
        local_file_path = os.path.join(local_folder_path, file_name)
        df = pd.read_csv(local_file_path)
        df['id'] = df['id'].astype(str)
        print(f"Loaded {file_name} from local storage.")
        loaded_frames.append(df)

    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files
        print(f"Error processing {file_name}: {e}")

# Single concatenation; fall back to an empty frame when nothing could be loaded
df_txt = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# Show the resulting DataFrame
df_txt.head()

Loaded boe_data.csv from local storage.


Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text
0,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,\nJUAN CARLOS I\nREY DE ESPAÑA\nA todos los qu...


In [13]:
# Number of rows loaded into df_txt
df_txt.shape[0]

1

# Clean data

In [14]:
# Character count of each 'text' value; missing values count as 0
text_column = df_txt['text']
character_counts = text_column.astype(str).str.len().where(text_column.notna(), 0)

# Keep only the rows whose text has at least one character
df_txt = df_txt[character_counts > 0].copy()

In [15]:
# Number of rows left in df_txt after dropping empty texts
df_txt.shape[0]

1

# Parameters

In [16]:
# Load parameters from YAML file
# The file is opened explicitly as UTF-8 so that parsing does not depend on the
# platform's default text encoding.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [17]:
# Chunking limits read from the config: maximum chunk size and the overlap
# value, both handed to the splitter built in the next section
max_chunk_size, chunk_overlap_size = config['max_chunk_size'], config['chunk_overlap']

# Separators handed to the splitter
separators = ["\n\n", "\n", ". ", " ", ""]

# Recursive Splitter

In [18]:
# Build the recursive splitter from the parameters above; length is measured with len()
splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=max_chunk_size,
    chunk_overlap=chunk_overlap_size,
    length_function=len,
)

In [19]:
# Auxiliary function
def recursive_text_splitter(text):
    """Split ``text`` with the notebook-level ``splitter``.

    Args:
        text: The text to split.

    Returns:
        The value returned by ``splitter.split_text(text)``; the chunking cell
        iterates over it to get the individual chunks.
    """
    return splitter.split_text(text)

In [20]:
# Document-level columns copied unchanged onto every chunk row
metadata_columns = ['id', 'url', 'title', 'date', 'legislative_origin', 'department', 'rang']

# Split every document text into its list of chunks
df_txt['chunks'] = df_txt['text'].apply(recursive_text_splitter)

# One record per chunk: the document metadata, a 1-based chunk identifier and the chunk text
chunks_list = [
    {
        **{column: row[column] for column in metadata_columns},
        'text_id': f"{row['id']}_chunk{position}",
        'text': piece,
    }
    for _, row in df_txt.iterrows()
    for position, piece in enumerate(row['chunks'], start=1)
]

# The helper column is no longer needed on the document-level frame
df_txt = df_txt.drop(columns=['chunks'])

# Chunk-level DataFrame
splitted_df_v1 = pd.DataFrame(chunks_list)

# Show
splitted_df_v1.head()

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk1,JUAN CARLOS I\nREY DE ESPAÑA\nA todos los que ...
1,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk2,A partir de los distintos intentos de reforma ...
2,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk3,"En segundo lugar, se ha afrontado la antinomia..."
3,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk4,"En tercer lugar, se ha dado especial relieve a..."
4,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk5,"En cuarto lugar, y en consonancia con el objet..."


In [21]:
# Number of chunks produced by the splitter
splitted_df_v1.shape[0]

382

# Summarize texts

In [22]:
# Checkpoint identifier of the summarizer, taken from the YAML config
summarizer_checkpoint = config['summarizer_model']

# Tokenizer for that checkpoint
tokenizer = BertTokenizerFast.from_pretrained(summarizer_checkpoint)

# Encoder-decoder model for that checkpoint, moved to the selected device
model = EncoderDecoderModel.from_pretrained(summarizer_checkpoint).to(device)

The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


In [23]:
# Function to generate summary
def generate_summary(text, max_length=512):
    """Summarize one text with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The text to summarize.
        max_length: Token length the input is padded or truncated to before it
            is passed to the model. Defaults to 512, the value that was
            previously hard-coded.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # A single-item batch, padded/truncated to exactly max_length tokens
    inputs = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Inputs are moved to the notebook-level device before generation
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    output = model.generate(input_ids, attention_mask=attention_mask)

    # Only one text was submitted, so only the first sequence is decoded
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [24]:

# New object from previous splitting process
splitted_df_v1_summary = splitted_df_v1.copy()

# Create empty column
splitted_df_v1_summary['text_summary'] = ''

# Clean memory once every this many summaries
gc_interval = 1000

# Pair each text with its actual index label: `.at` is label-based, so using the
# label keeps the assignment on the right row whatever index the frame carries.
labelled_texts = zip(splitted_df_v1_summary.index, splitted_df_v1_summary['text'])

# Loop by row to generate one summary per text. Each summary is written as soon
# as it is produced, so an interrupted run keeps the summaries finished so far.
for done, (row_label, text) in enumerate(
    tqdm(labelled_texts, total=len(splitted_df_v1_summary)), start=1
):
    # Generate and assign summary
    splitted_df_v1_summary.at[row_label, 'text_summary'] = generate_summary(text)

    # Check if it's time to perform garbage collection
    if done % gc_interval == 0:
        # Clean memory
        torch.cuda.empty_cache()
        gc.collect()

# Show
splitted_df_v1_summary.head()

  0%|          | 0/382 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
# Number of rows in the summarized frame
splitted_df_v1_summary.shape[0]

382

In [26]:
# Save splitted & summarized data
path = '../prepared_data/'
csv_file_name_v4 = f'{path}splitted_input_summary.csv'

# Create the output folder if it is missing; to_csv does not create directories
# and would otherwise fail on a fresh checkout.
os.makedirs(path, exist_ok=True)

# Write the DataFrame to a CSV file (without the index column)
splitted_df_v1_summary.to_csv(csv_file_name_v4, index = False)

# Clean

In [None]:
# On CUDA, report the name of device 0 and its memory figures scaled by 1024**3
if device.type == 'cuda':
    bytes_per_unit = 1024**3
    allocated = round(torch.cuda.memory_allocated(0) / bytes_per_unit, 1)
    reserved = round(torch.cuda.memory_reserved(0) / bytes_per_unit, 1)
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', allocated, 'GB')
    print('Cached:   ', reserved, 'GB')

In [None]:
# Clean memory
# Empty PyTorch's CUDA cache, then run Python's garbage collector.
# gc.collect() is the last expression, so the cell displays its return value.
torch.cuda.empty_cache()
gc.collect()

# Runtime

In [None]:
# Seconds elapsed since start_time was recorded by the timing cell
end_time = time.time()
elapsed_time = end_time - start_time

# Break the elapsed seconds into whole hours and the leftover whole minutes
whole_hours, leftover_seconds = divmod(elapsed_time, 3600)
hours = int(whole_hours)
minutes = int(leftover_seconds // 60)

# Report the runtime
print(f"Time elapsed: {hours} hours and {minutes} minutes.")