# Transform data

### This notebook reproduces the methodology described in Section 4.1.2, Data Transformation, specifically Section 4.1.2.1, Recursive Text Splitter.

Let's first install the libraries we will use during this step:

In [None]:
# Install the third-party packages used during this step
pip install pyyaml pandas torch tqdm python-dotenv beautifulsoup4 tiktoken langchain

Note: you may need to restart the kernel to use updated packages.


# Setup

In [None]:
# Report which Python interpreter this kernel is running
import sys
print(sys.version)

3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]


We will need to use the stored environment variables:

In [3]:
# Environment variables
from dotenv import load_dotenv
import yaml
import os

# Load variables from a .env file into the process environment.
# NOTE(review): the recorded output below is False, which suggests no .env file was
# found or no variables were set — confirm the file is present if values are expected.
load_dotenv()

False

# Parameters

We load the needed parameters from a yaml file.

In [7]:
# Load parameters from YAML file
# NOTE: the project root below is hard-coded; adjust it when running on another machine.
os.chdir('/notebooks/TFM_LAW_LLM')
# Read as UTF-8 explicitly (as the other file reads in this notebook do) so the parsed
# values do not depend on the platform's default text encoding.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Reference Documents

https://medium.com/@ayhamboucher/llm-based-context-splitter-for-large-documents-445d3f02b01b

https://www.pinecone.io/learn/chunking-strategies/

# Directory

In [8]:
# Set directory
from pathlib import Path
import sys

# os.path.abspath('') resolves to the current working directory, so the chdir below
# re-asserts that directory rather than moving the kernel somewhere else.
notebook_location = Path(os.path.abspath(''))
os.chdir(notebook_location)
current_directory = os.getcwd()
# Last expression of the cell: shown as the cell output
current_directory

'C:\\Users\\polri\\Desktop\\Git_TFM\\TFM_LAW_LLM'

# Libraries

In [9]:
# General libraries

# Time data management libraries
import time

# Garbage Collecting library: automatically freeing up memory occupied by objects that are no longer in use by the program
import gc

# Local assets: Utils contains functions we will later use
from utils import *

# Data manipulation
import pandas as pd

# Deep learning tasks
import torch

# Progress bars
import tqdm
from tqdm.notebook import tqdm

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Warnings
import warnings
# Suppress every warning from this point on to keep the cell outputs clean
warnings.filterwarnings("ignore")

In [24]:
# Start timing the notebook run: wall-clock timestamp in seconds, which the
# final cell subtracts from its own timestamp to report the elapsed time
start_time = time.time()

# Device

This is how we make sure we use the GPU when one is available.

In [10]:
# Prefer the GPU when PyTorch can reach one; otherwise run on the CPU
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
print('Using device:', device)
print()

# GPU name and memory figures, reported in GB
if cuda_ready:
    device = torch.device("cuda")
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # "Available" is the total device memory minus what PyTorch has reserved
    available_memory = total_memory - cached_memory
    for label, gigabytes in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(gigabytes, 1), 'GB')

Using device: cpu



We clear the CUDA cache to free up GPU memory.

In [11]:
# Clean memory: release the unused GPU memory cached by PyTorch, then run Python's garbage collector
torch.cuda.empty_cache()
# gc.collect() returns the number of unreachable objects it found; as the last expression it is shown as the cell output
gc.collect()

20

# Load data

We load the data that we extracted and saved in the 1_extract_data.ipynb notebook.

In [12]:
# Params
local_folder_path = "/notebooks/TFM_LAW_LLM/raw_data/txt_files"
file_list = ["boe.txt"]

# List the CSV files available locally. This was meant for another use case, not the one in this thesis.
# NOTE(review): list_csv_files is not defined in this notebook — presumably it comes from `utils`; confirm.
local_csv_files = list_csv_files(local_folder_path)
local_csv_files

[]

We now open the BOE .txt file, which was downloaded from the official site of the Spanish Government, and convert it into a DataFrame.

In [14]:
def read_txt_file(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Params (these repeat the values already set in the previous cell)
local_folder_path = "/notebooks/TFM_LAW_LLM/raw_data/txt_files"
file_list = ["boe.txt"]

Loaded boe.txt from local storage.
                                                text
0  LEY ORGÁNICA 10/1995, DE 23 DE NOVIEMBRE, DEL\...


In [None]:
# Accumulates one {'text': ...} record per file that was read successfully
text_data = []

# Go through every listed file; a failure is reported and the loop moves on
for file_name in file_list:
    try:
        source_path = os.path.join(local_folder_path, file_name)
        contents = read_txt_file(source_path)
        print(f"Loaded {file_name} from local storage.")
        text_data.append(dict(text=contents))
    except Exception as err:
        print(f"Error processing {file_name}: {err}")

# One row per loaded file
df_txt = pd.DataFrame(text_data)
print(df_txt.head())

We print the length of the .txt file, in characters.

In [16]:
# Read the raw text again, reusing read_txt_file and the path parameters defined
# earlier instead of repeating the hard-coded path and the open/read logic.
# NOTE: from here on df_txt is a plain string holding the whole document, not a DataFrame.
df_txt = read_txt_file(os.path.join(local_folder_path, file_list[0]))

# len() of a str counts characters
file_length = len(df_txt)
print("Length of the file:", file_length)

Length of the file: 731024


# Transformation Parameters

We set the maximum number of characters a chunk can have and the chunk overlap size. These parameters are stored in the configuration yaml.

In [17]:
# Maximum length of a chunk, in characters (read from the loaded config)
max_chunk_size = config['max_chunk_size']

# Chunk overlap (read from the loaded config)
chunk_overlap_size = config['chunk_overlap']

# Separators, listed from coarsest (blank line) to finest (empty string, i.e. any character boundary)
separators = ["\n\n", "\n", ". ", " ", ""]

# Recursive Splitter

We will use the recursive splitter, as it has been shown to be the best-performing splitter for problems like ours.

In [18]:
# Set splitter: chunk size and overlap are measured with len(), i.e. in characters
splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_chunk_size,
    chunk_overlap=chunk_overlap_size,
    length_function=len,
    separators=separators
)

# Auxiliary function
def recursive_text_splitter(text):
    """Split ``text`` with the notebook-level ``splitter`` and return the resulting chunks."""
    return splitter.split_text(text)

# Run the splitter over the whole document
chunks = recursive_text_splitter(df_txt)

# One record per chunk, with 1-based identifiers (chunk_1, chunk_2, ...)
chunks_list = [
    {'text_id': f"chunk_{position}", 'text': chunk}
    for position, chunk in enumerate(chunks, start=1)
]

# Tabular view of the chunks; head() is shown as the cell output
splitted_df_v1 = pd.DataFrame(chunks_list)
splitted_df_v1.head()

Unnamed: 0,text_id,text
0,chunk_1,"LEY ORGÁNICA 10/1995, DE 23 DE NOVIEMBRE, DEL\..."
1,chunk_2,JUAN CARLOS I\nRey de España\nA todos los que ...
2,chunk_3,negativa». El Código Penal ha de tutelar los v...
3,chunk_4,"sucinto, los criterios en que se inspira, aunq..."
4,chunk_5,"menos básicos, y, de otra, introduce cambios e..."


In [19]:
# Number of chunks produced by the splitter (one DataFrame row per chunk)
len(splitted_df_v1)

1068

We now save the split data in the prepared_data folder. We will use it again later.

In [20]:
# Save splitted data
path = 'prepared_data/'
# Create the output folder when it is missing: to_csv does not create directories
# and would otherwise fail on a fresh checkout.
os.makedirs(path, exist_ok=True)
csv_file_name_v1 = f'{path}splitted_input_base.csv'
# index=False keeps the row index out of the file
splitted_df_v1.to_csv(csv_file_name_v1, index = False)

# Clean

In [21]:
# Clean memory: release the unused GPU memory cached by PyTorch, then run Python's garbage collector
torch.cuda.empty_cache()
# gc.collect() returns the number of unreachable objects it found; as the last expression it is shown as the cell output
gc.collect()

0

# Token Count & Runtime

Let's calculate the total number of tokens that we have:

In [22]:
# Count the tokens of each chunk, then add the per-chunk counts together
tokens_per_chunk = splitted_df_v1['text'].apply(count_tokens)
total_tokens_processed = tokens_per_chunk.sum()

# Show
print("Total number of tokens in the 'text' column:", total_tokens_processed)

Total number of tokens in the 'text' column: 238500


We calculate the runtime elapsed since the timer was started.

In [25]:
# End time of notebook run
end_time = time.time()
elapsed_time = end_time - start_time

# Break the elapsed seconds down into hours, minutes and seconds.
# Seconds are reported too, so that short runs do not show up as "0 hours and 0 minutes".
hours, remainder = divmod(int(elapsed_time), 3600)
minutes, seconds = divmod(remainder, 60)

# Print the result
print(f"Time elapsed: {hours} hours, {minutes} minutes and {seconds} seconds.")

Time elapsed: 0 hours and 0 minutes.
