# Transform data

# Setup

In [1]:
# Report which Python interpreter is running this notebook
import sys
print(sys.version)

In [2]:
# Environment Variables
from dotenv import load_dotenv
import yaml
import os

# Load env
# load_dotenv (python-dotenv) is called with no arguments, so the lookup of the
# .env file relies on the library defaults; the recorded output shows it returned True.
load_dotenv()

True

In [None]:
# Torch config
from torch import cuda, bfloat16, float16
import torch

# Torch options
# Switch off the memory-efficient and the flash scaled-dot-product-attention (SDP) backends.
# NOTE(review): the reason for disabling them is not recorded in this notebook — confirm it is still required.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
# Jupyter extensions
!jupyter nbextension enable --py widgetsnbextension

# Parameters

The parameters used throughout this notebook are loaded from a YAML file (`config.yaml`).

In [None]:
# Read the notebook settings from config.yaml into `config`
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# References

https://medium.com/@ayhamboucher/llm-based-context-splitter-for-large-documents-445d3f02b01b

https://www.pinecone.io/learn/chunking-strategies/

# Directory

In [4]:
# Set directory to file location
from pathlib import Path
import sys

# Path.cwd() is the kernel's current working directory, i.e. the same path
# that os.path.abspath('') resolves to
notebook_location = Path.cwd()
os.chdir(notebook_location)

# Keep the working directory as a plain string and display it
current_directory = os.getcwd()
current_directory

'/notebooks/LawGPT'

# Libraries

In [5]:
# General libraries
import pandas as pd
import numpy as np
import time
import yaml
import csv
import gc
import os

# Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import pipeline
import transformers
import accelerate

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from nltk.corpus import stopwords
import torch
import spacy
import nltk

# Tokenizer
import tiktoken

# Optimization
import xformers

# Other
from tqdm.notebook import tqdm

# Local
# NOTE(review): the wildcard import hides which helpers this notebook depends on.
# list_csv_files, generate_summary and count_tokens are called below without any
# other visible definition, so they presumably come from here — confirm and
# consider importing them by name.
from functions import *

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [None]:
# Warnings
import warnings
# Blanket filter: no category or module is given, so every warning is hidden from here on
warnings.filterwarnings("ignore")

In [6]:
# Start timing the notebook run
# Wall-clock reference in seconds; the final cell subtracts it from a fresh
# time.time() to report the total run time
start_time = time.time()

# Device

In [7]:
# Select the compute device: GPU when CUDA is usable, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Report the name of GPU 0 and its memory figures
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')

    # torch reports bytes; dividing by 1024**3 expresses every figure in GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    cached_memory = torch.cuda.memory_reserved(0) / (1024**3)
    allocated_memory = torch.cuda.memory_allocated(0) / (1024**3)

    # "Available" is the total minus the reserved (cached) memory
    available_memory = total_memory - cached_memory

    for label, gigabytes in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(gigabytes, 1), 'GB')

In [8]:
# Clean memory
# Run the garbage collector first so unreachable Python objects (and any CUDA
# tensors they own) are released, then return PyTorch's cached, now-unused GPU
# blocks. In the opposite order, memory freed by the collector would stay cached.
gc.collect()
torch.cuda.empty_cache()

# Load data

In [11]:
# Params
file_list = ["boe_data.csv"]  # files loaded by the next cell
local_folder_path = "raw_data/"  # relative folder that holds the raw CSV files

# List CSV files locally
# list_csv_files is not defined in this notebook — presumably provided by
# `from functions import *`; verify.
local_csv_files = list_csv_files(local_folder_path)
local_csv_files

['boe_ids.csv', 'boe_filtered_ids.csv', 'boe_data.csv']

In [12]:
# Load every file named in file_list and stack the rows into one DataFrame (df_txt).
# Frames are collected in a list and concatenated once after the loop: growing a
# DataFrame with pd.concat on every iteration re-copies all previous rows each
# time, and concatenating onto an empty DataFrame is deprecated in recent pandas.
loaded_frames = []

# Loop by files
for file_name in file_list:
    try:
        # Load the file from local storage
        local_file_path = os.path.join(local_folder_path, file_name)
        df = pd.read_csv(local_file_path)

        # Force ids to strings
        df['id'] = df['id'].astype(str)
        print(f"Loaded {file_name} from local storage.")

        # Keep the frame for the single concatenation below
        loaded_frames.append(df)

    except Exception as e:
        # Best effort: report the file that failed and carry on with the others
        print(f"Error processing {file_name}: {e}")

# pd.concat raises on an empty list, so fall back to an empty DataFrame
# when nothing could be loaded
df_txt = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# Show
df_txt.head()

Loaded boe_data.csv from local storage.


Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text
0,1978-7433,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 17/1978, de 15 de marzo, sobre modificació...",1978-03-18,Estatal,Jefatura del Estado,Ley,\nDe conformidad con la Ley aprobada por las C...
1,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,\nDe conformidad con la Ley aprobada por las C...
2,1978-18551,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 31/1978, de 17 de julio, de modificación d...",1978-07-20,Estatal,Jefatura del Estado,Ley,\nDe conformidad con la Ley aprobada por las C...
3,1978-25564,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 45/1978, de 7 de octubre, por la que se mo...",1978-10-11,Estatal,Jefatura del Estado,Ley,\nDe conformidad con la Ley aprobada por las C...
4,1978-31079,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Real Decreto 3033/1978, de 15 de diciembre, po...",1978-12-25,Estatal,Ministerio de Sanidad y Seguridad Social,Real Decreto,\nLa Ley cuarenta y cinco/mil novecientos sete...


In [13]:
# Number of rows (one per document) loaded into df_txt
len(df_txt)

389

# Filter empty texts

In [None]:
# Character count of each 'text' value; missing values count as zero
text_lengths = df_txt['text'].map(lambda value: 0 if pd.isna(value) else len(str(value)))

# Keep only the rows that actually contain text, as an independent DataFrame
df_txt = df_txt.loc[text_lengths > 0].copy()

In [None]:
# Number of rows left in df_txt after dropping the empty texts
len(df_txt)

388

# Transformation Parameters

In [None]:
# Maximum length of a text
# (a character count: the splitter built from it uses length_function = len)
max_chunk_size = config['max_chunk_size']

# Chunk overlap
# (passed to the splitter as chunk_overlap)
chunk_overlap_size = config['chunk_overlap']

# Separators
# Listed from coarsest to finest: blank line, newline, ". ", single space, empty string
separators = ["\n\n", "\n", ". ", " ", ""]

# Recursive Splitter

In [None]:
# Build the recursive character splitter from the transformation parameters
splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_chunk_size,
    chunk_overlap=chunk_overlap_size,
    length_function=len,  # sizes are therefore measured in characters
    separators=separators,
)

In [None]:
# Auxiliary function
def recursive_text_splitter(text):
    """Split `text` with the notebook-level `splitter` and return the result of `split_text`."""
    return splitter.split_text(text)

In [None]:
# Columns copied unchanged from each source document onto every one of its chunks
metadata_columns = ['id', 'url', 'title', 'date', 'legislative_origin', 'department', 'rang']

# Split each text in the 'text' column into chunks recursively.
# The lists of chunks live in their own Series, so df_txt is never modified by
# this cell (no temporary column to add and drop, nothing left behind on failure).
chunks_per_document = df_txt['text'].apply(recursive_text_splitter)

# One record per chunk: the document metadata, a unique text_id made of the
# document id plus the 1-based chunk position, and the chunk text itself
chunks_list = [
    {**metadata, 'text_id': f"{metadata['id']}_chunk{position}", 'text': chunk}
    for metadata, chunks in zip(df_txt[metadata_columns].to_dict('records'), chunks_per_document)
    for position, chunk in enumerate(chunks, start=1)
]

# Create a new DataFrame for the chunks.
# Naming the columns keeps the expected schema even when no chunk was produced.
splitted_df_v1 = pd.DataFrame(chunks_list, columns=metadata_columns + ['text_id', 'text'])

# Show
splitted_df_v1.head()

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1978-7433,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 17/1978, de 15 de marzo, sobre modificació...",1978-03-18,Estatal,Jefatura del Estado,Ley,1978-7433_chunk1,De conformidad con la Ley aprobada por las Cor...
1,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk1,De conformidad con la Ley aprobada por las Cor...
2,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk2,\nTercera. La pena de multa de diez mil a cinc...
3,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk3,Cuarta. La pena de multa de diez mil a cien mi...
4,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk4,Quinta. La pena de multa de diez mil a doscien...


In [None]:
# Number of chunks (rows) produced by the split
len(splitted_df_v1)

3309

In [None]:
# Save splitted data
path = 'prepared_data/'
csv_file_name_v1 = f'{path}splitted_input_base.csv'

# to_csv does not create missing folders, so make sure the target folder exists
os.makedirs(path, exist_ok=True)

# Write the DataFrame to a CSV file (the index is not written)
splitted_df_v1.to_csv(csv_file_name_v1, index = False)

## Extract core from text

In [None]:
# Use text summary
# Flag read from the YAML config; the next cell only runs the model-based
# extraction when it is truthy
extract_core_text = config["extract_core_text"]

# Show
extract_core_text

In [None]:
# Conditional generation: when enabled, run a text-generation model over every
# chunk and store the generated "core" text next to the original chunk.
# NOTE(review): core_model_id, use_quantization and core_prompt are not assigned
# in any visible cell — presumably they come from `from functions import *` or
# should be read from `config`; confirm before enabling extract_core_text.
if extract_core_text:

    # Tokenizer
    core_tokenizer = transformers.AutoTokenizer.from_pretrained(
        core_model_id
    )

    # Set BNB configuration if quantization is enabled (4-bit NF4, double quantization)
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    ) if use_quantization else None

    # Set model
    core_model = transformers.AutoModelForCausalLM.from_pretrained(
        core_model_id,
        trust_remote_code = True,
        quantization_config = bnb_config,
        device_map = "auto"
    )

    # Pipeline (generation settings come from the YAML config)
    core_pipeline = pipeline(
        model = core_model,
        tokenizer = core_tokenizer,
        task = 'text-generation',
        model_kwargs = {"torch_dtype": torch.bfloat16},
        return_full_text = config["core_return_full_text"],
        max_new_tokens = config["core_max_new_tokens"],
        repetition_penalty = config["core_repetition_penalty"],
        temperature = config["core_temperature"],
        pad_token_id = core_tokenizer.eos_token_id,
        truncation = True,
        batch_size = 1
    )

    # Create object for loop
    splitted_df_v1_summary = splitted_df_v1.copy()

    # Time this stage with its own variables: the notebook-wide `start_time`
    # is read again by the final runtime cell and must not be overwritten here
    core_start_time = time.time()

    # Iteration counter
    iteration_counter = 0

    # Loop by row to generate one summary per text.
    # Series.items() yields the actual index label of every row, so the .at
    # assignment below hits the right row whatever the index looks like.
    for row_label, core_context in tqdm(splitted_df_v1_summary['text'].items(), total=len(splitted_df_v1_summary)):

        # Generate summary
        summarized_text = generate_summary(core_pipeline, core_tokenizer, core_prompt, core_context)

        # Assign summary
        splitted_df_v1_summary.at[row_label, 'core'] = summarized_text

        # Increment the iteration counter
        iteration_counter += 1

        # Every 1000 rows, release memory: collect Python garbage first so the
        # GPU blocks it frees can then be dropped from PyTorch's cache
        if iteration_counter % 1000 == 0:
            gc.collect()
            torch.cuda.empty_cache()

    # Report how long the extraction itself took
    core_elapsed_time = time.time() - core_start_time
    print(f"Core extraction took {core_elapsed_time:.1f} seconds.")

    # Final format: the generated text becomes 'text', the chunk is kept as 'original_text'
    splitted_df_v1_summary = splitted_df_v1_summary.rename(
        columns={'text': 'original_text', 'core': 'text'}
    )

    # Save splitted & summarized data
    path = 'prepared_data/'
    csv_file_name_v1_summary = f'{path}splitted_input_core.csv'

    # to_csv does not create missing folders, so make sure the target folder exists
    os.makedirs(path, exist_ok=True)

    # Write the DataFrame to a CSV file
    splitted_df_v1_summary.to_csv(csv_file_name_v1_summary, index=False)

    # Show: a bare expression is only auto-displayed at the top level of a cell,
    # so inside this `if` the preview has to be displayed explicitly
    display(splitted_df_v1_summary.head())

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1978-7433,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 17/1978, de 15 de marzo, sobre modificació...",1978-03-18,Estatal,Jefatura del Estado,Ley,1978-7433_chunk1,"Ley 17/1978, de 15 de marzo, sobre modificació..."
1,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk1,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
2,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk2,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
3,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk3,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
4,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk4,"Ley 20/1978, de 8 de mayo, sobre modificación ..."


# Clean

In [None]:
# CUDA information: name of GPU 0 and its memory figures
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')

    # torch reports bytes; dividing by 1024**3 expresses every figure in GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    cached_memory = torch.cuda.memory_reserved(0) / (1024**3)
    allocated_memory = torch.cuda.memory_allocated(0) / (1024**3)

    # "Available" is the total minus the reserved (cached) memory
    available_memory = total_memory - cached_memory

    for label, gigabytes in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(gigabytes, 1), 'GB')

In [None]:
# Clean memory
# Run the garbage collector first so unreachable Python objects (and any CUDA
# tensors they own) are released, then return PyTorch's cached, now-unused GPU
# blocks. In the opposite order, memory freed by the collector would stay cached.
gc.collect()
torch.cuda.empty_cache()

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1978-7433,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 17/1978, de 15 de marzo, sobre modificació...",1978-03-18,Estatal,Jefatura del Estado,Ley,1978-7433_chunk1,De conformidad con la Ley aprobada por las Cor...
1,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk1,De conformidad con la Ley aprobada por las Cor...
2,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk2,entenderá en lo sucesivo de mil a veinte mil p...
3,1978-18551,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 31/1978, de 17 de julio, de modificación d...",1978-07-20,Estatal,Jefatura del Estado,Ley,1978-18551_chunk1,De conformidad con la Ley aprobada por las Cor...
4,1978-25564,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 45/1978, de 7 de octubre, por la que se mo...",1978-10-11,Estatal,Jefatura del Estado,Ley,1978-25564_chunk1,De conformidad con la Ley aprobada por las Cor...


# Runtime

In [None]:
# Count the tokens of every chunk, then add the per-chunk counts together
tokens_per_chunk = splitted_df_v1['text'].apply(count_tokens)
total_tokens_processed = tokens_per_chunk.sum()

# Show
print("Total number of tokens in the 'text' column:", total_tokens_processed)

In [None]:
# Wall-clock time elapsed since start_time was taken
end_time = time.time()
elapsed_time = end_time - start_time

# Break the elapsed seconds down into whole hours and the leftover whole minutes
whole_hours, leftover_seconds = divmod(elapsed_time, 3600)
hours = int(whole_hours)
minutes = int(leftover_seconds // 60)

# Print the result
print(f"Time elapsed: {hours} hours and {minutes} minutes.")