# Transform data

# Setup

In [1]:
# %pip install -r requirements.txt

In [2]:
# Environment variables: os exposes os.environ, python-dotenv reads .env files
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# The bare call is the cell's last expression, so its return value is displayed.
load_dotenv()

True

# Google

In [3]:
# Point the Google client libraries at the service-account key file.
# setdefault keeps a value already supplied by the shell or the .env file,
# so the hard-coded file name is only a fallback.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "lawgpt-410122-140fba6fba7b.json")

# Bucket params: prefer GOOGLE_BUCKET_NAME (the variable the GCS download code reads),
# falling back to the project's default bucket
bucket_name = os.environ.get('GOOGLE_BUCKET_NAME', 'lawgpt_madrid_bucket_1')

# Reference

https://medium.com/@ayhamboucher/llm-based-context-splitter-for-large-documents-445d3f02b01b

https://www.mongodb.com/developer/products/atlas/gemma-mongodb-huggingface-rag/

https://www.pinecone.io/learn/chunking-strategies/

# Directory

In [4]:
from pathlib import Path
import sys

# Resolve the folder the kernel is running in and make it the working directory
notebook_location = Path.cwd()
os.chdir(notebook_location)

# Keep the resulting path as a string and display it as the cell output
current_directory = os.getcwd()
current_directory

'/notebooks/LawGPT'

# Libraries

In [5]:
# General libraries
import pandas as pd
import numpy as np
import time
import yaml
import csv
import gc
import os

# Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import transformers
import accelerate

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from nltk.corpus import stopwords
import torch
import spacy
import nltk

# Torch
from torch import cuda, bfloat16, float16
import torch

# Cloud
from google.cloud import storage

# Other
from tqdm.notebook import tqdm

# Local
from functions import *

# Warnings
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Record the wall-clock start (seconds, from time.time()) so the final cell
# can report the total runtime of the notebook
start_time = time.time()

# Platform login

In [9]:
# Prefer the GPU when torch can see one; otherwise run on the CPU
gpu_available = torch.cuda.is_available()
device = torch.device('cuda') if gpu_available else torch.device('cpu')
print('Using device:', device)
print()

# On CUDA, report the card name and its current memory figures in GB
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for label, read_bytes in (('Allocated:', torch.cuda.memory_allocated),
                              ('Cached:   ', torch.cuda.memory_reserved)):
        print(label, round(read_bytes(0)/1024**3,1), 'GB')

Using device: cuda

Quadro P5000
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [10]:
# Clean memory: release the CUDA memory cached by torch, then run Python's garbage collector.
# gc.collect() is the last expression, so the cell displays its return value.
torch.cuda.empty_cache()
gc.collect()

248

# Load data

In [11]:
# Params: the files this notebook needs and the folder they are expected in
file_list = ["boe_data.csv"]
local_folder_path = "raw_data/"

# List CSV files locally
# NOTE(review): list_csv_files is not defined in this notebook; presumably it is provided by
# `from functions import *` and returns bare file names (the loading cell tests
# `file_name in local_csv_files`) — confirm against the helper
local_csv_files = list_csv_files(local_folder_path)
local_csv_files

['boe_ids.csv', 'boe_filtered_ids.csv', 'boe_data.csv']

In [12]:
# Load every requested file and collect the frames so they are combined once at the end
loaded_frames = []

# Loop by files
for file_name in file_list:
    if file_name not in local_csv_files:
        # The GCS fallback is currently disabled, so there is nothing to load for this file.
        # Report it honestly and move on instead of reusing a stale or undefined `df`.
        # TODO: re-enable the download once it is wired up again:
        # source_blob_name = f'raw_data/{file_name}'
        # destination_file_name = os.path.join(local_folder_path, file_name)
        # download_from_gcs(os.environ.get('GOOGLE_BUCKET_NAME'), source_blob_name, destination_file_name)
        # df = pd.read_csv(destination_file_name)
        print(f"Skipping {file_name}: not found in {local_folder_path} and GCS download is disabled.")
        continue

    try:
        # File found locally: read it and normalise the id column to strings
        local_file_path = os.path.join(local_folder_path, file_name)
        df = pd.read_csv(local_file_path)
        df['id'] = df['id'].astype(str)
    except Exception as e:
        # Best effort: a broken file is reported and skipped, the remaining files still load
        print(f"Error processing {file_name}: {e}")
        continue

    loaded_frames.append(df)
    print(f"Loaded {file_name} from local storage.")

# Combine everything that was loaded; an empty frame when nothing could be read
df_txt = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# Show the resulting DataFrame
df_txt.head()

Loaded boe_data.csv from local storage.


Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text
0,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,\nJUAN CARLOS I\nREY DE ESPAÑA\nA todos los qu...


In [13]:
# Number of rows in df_txt after loading
len(df_txt)

1

# Clean data

In [14]:
# Measure every text in characters; missing values count as zero
text_lengths = df_txt['text'].map(lambda value: 0 if pd.isna(value) else len(str(value)))

# Keep only the rows that actually carry text, as an independent copy
df_txt = df_txt.loc[text_lengths > 0].copy()

In [15]:
# Number of rows left in df_txt after dropping rows with empty text
len(df_txt)

1

# Parameters

In [16]:
# Load parameters from YAML file.
# The encoding is explicit because the config holds accented Spanish text, which would be
# mis-decoded on platforms whose default encoding is not UTF-8.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [17]:
# Maximum size of a chunk; counted in characters because the splitter is built with length_function = len
max_chunk_size = config['max_chunk_size']

# Overlap between consecutive chunks, in the same unit as max_chunk_size
chunk_overlap_size = config['chunk_overlap']

# Separators, listed from the coarsest break (blank line) down to the empty string
separators = ["\n\n", "\n", ". ", " ", ""]

# Recursive Splitter

In [18]:
# Set splitter: a recursive character splitter configured from the parameters above.
# length_function = len means chunk_size and chunk_overlap are measured in characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size = max_chunk_size,
    chunk_overlap  = chunk_overlap_size,
    length_function = len,
    separators = separators
)

In [19]:
# Auxiliary function
def recursive_text_splitter(text):
    """Split ``text`` with the notebook-level ``splitter`` and return the resulting chunks."""
    return splitter.split_text(text)

In [20]:
# Document-level fields that are copied unchanged onto every chunk
metadata_columns = ['id', 'url', 'title', 'date', 'legislative_origin', 'department', 'rang']

# Split each document and emit one record per chunk; text_id numbers the chunks from 1
chunks_list = [
    {
        **{column: document[column] for column in metadata_columns},
        'text_id': f"{document['id']}_chunk{position}",
        'text': piece,
    }
    for _, document in df_txt.iterrows()
    for position, piece in enumerate(recursive_text_splitter(document['text']), start=1)
]

# Create a new DataFrame for the chunks
splitted_df_v1 = pd.DataFrame(chunks_list)

# Show
splitted_df_v1.head()

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk1,JUAN CARLOS I\nREY DE ESPAÑA\nA todos los que ...
1,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk2,A partir de los distintos intentos de reforma ...
2,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk3,"En segundo lugar, se ha afrontado la antinomia..."
3,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk4,"En cuarto lugar, y en consonancia con el objet..."
4,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk5,"En quinto lugar, se ha procurado avanzar en el..."


In [21]:
# Number of chunks produced by the recursive splitter
len(splitted_df_v1)

307

# Summarize texts

In [22]:
# Summarizer: the value stored under 'extract_core_topics_model' in the loaded config
# NOTE(review): `model` holds this config value here; the model-loading cell later rebinds
# the same name to the object returned by AutoModelForCausalLM.from_pretrained
model = config['extract_core_topics_model']

# Show model
model

'google/gemma-2b-it'

In [23]:
# Summarizer: model identifier taken from the config
model_id = config['extract_core_topics_model']

# Quantization: 4-bit NF4 weights with double quantization, computing in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

# Tokenizer
# NOTE(review): add_eos_token = True presumably appends an EOS token to every encoded prompt —
# confirm that this is wanted for generation
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_eos_token = True
)

# Model: the quantization settings are passed as `quantization_config`;
# device_map = "auto" lets the loader decide where the weights are placed
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    device_map = "auto"
)

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [38]:
# Function to generate text using the specified LLM model
def generate_text(prompt, max_length = 512, cuda = True):
    """
    Generate text based on prompt and LLM model.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
    - prompt (str): The input prompt for text generation.
    - max_length (int): Forwarded to ``model.generate`` as ``max_length``.
    - cuda (bool): If True, use GPU for computation when one is available. Only
      applied to models this function is free to move; a model that was placed by
      its loader (quantized or loaded with a device map) stays where it is.

    Returns:
    - str: The first generated sequence, decoded with special tokens skipped.
    """

    # A quantized model, or one loaded with a device map, has already been placed by the
    # loader. Calling .to() on it is unnecessary and can raise for bitsandbytes weights,
    # so in that case the inputs follow the model instead of the other way round.
    loader_placed = any(
        getattr(model, flag, None)
        for flag in ("is_quantized", "is_loaded_in_4bit", "is_loaded_in_8bit", "hf_device_map")
    )
    if loader_placed:
        device = model.device
    else:
        device = "cuda" if cuda and torch.cuda.is_available() else "cpu"
        model.to(device)

    # Encode the prompt on the same device as the model
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate text without tracking gradients
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length = max_length, num_return_sequences = 1)

    # Decode generated text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Return
    return generated_text

In [41]:
# Get prompt: the instruction text stored under "extract_core_prompt" in the config
extract_core_prompt = config["extract_core_prompt"]

# Show
extract_core_prompt

'Genera un resumen de las partes más importantes del siguiente texto, haciendo referencia continuada a los artículos mencionados y los puntos más relevantes'

In [None]:
# New object from previous splitting process, so the chunk frame itself stays untouched
splitted_df_v1_summary = splitted_df_v1.copy()

# Create empty column
splitted_df_v1_summary['text_summary'] = ''

# Free cached GPU memory and run the garbage collector every this many rows
gc_every_n_rows = 1000

# Initialize a counter for iterations (stays 0 when there is nothing to summarize)
iteration_counter = 0

# Loop by row to generate one summary per text.
# Index labels are paired with the texts so that .at writes to the right row
# whatever index the frame carries.
rows = zip(splitted_df_v1_summary.index, splitted_df_v1_summary['text'])
for iteration_counter, (row_label, text) in enumerate(
    tqdm(rows, total = len(splitted_df_v1_summary)), start = 1
):

    # Prepend the prompt to the text, separated by a blank line so the instruction
    # does not run into the first word of the chunk
    text_with_prompt = f"{extract_core_prompt}\n\n{text}"

    # Generate summary from the prompted text (not from the bare chunk)
    summarized_text = generate_text(text_with_prompt)

    # Assign summary
    splitted_df_v1_summary.at[row_label, 'text_summary'] = summarized_text

    # Check if it's time to perform garbage collection
    if iteration_counter % gc_every_n_rows == 0:
        # Clean memory
        torch.cuda.empty_cache()
        gc.collect()

# Show the frame that holds the summaries
splitted_df_v1_summary.head()

In [None]:
# Number of rows in the chunk frame (splitted_df_v1, the frame the summaries were copied from)
len(splitted_df_v1)

In [None]:
# Save splitted & summarized data
path = 'prepared_data/'
csv_file_name_v4 = f'{path}splitted_input_gemma.csv'

# Make sure the output folder exists; to_csv does not create missing directories
os.makedirs(path, exist_ok = True)

# Write the DataFrame to a CSV file
splitted_df_v1_summary.to_csv(csv_file_name_v4, index = False)

# Clean

In [None]:
# On CUDA, report the card name and its current memory figures in GB
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for label, read_bytes in (('Allocated:', torch.cuda.memory_allocated),
                              ('Cached:   ', torch.cuda.memory_reserved)):
        print(label, round(read_bytes(0)/1024**3,1), 'GB')

In [None]:
# Clean memory: release the CUDA memory cached by torch, then run Python's garbage collector.
# gc.collect() is the last expression, so the cell displays its return value.
torch.cuda.empty_cache()
gc.collect()

# Runtime

In [None]:
# Measure how long the notebook has been running since start_time was taken
end_time = time.time()
elapsed_time = end_time - start_time

# Break the elapsed seconds down into whole hours and the whole minutes left over
hours, leftover_seconds = divmod(elapsed_time, 3600)
hours = int(hours)
minutes = int(leftover_seconds // 60)

# Print the result
print(f"Time elapsed: {hours} hours and {minutes} minutes.")