# Transform data

In [1]:
pip install pyyaml pandas torch tqdm python-dotenv beautifulsoup4 tiktoken langchain

Note: you may need to restart the kernel to use updated packages.


# Setup

In [2]:
# Report the interpreter version this notebook is running on
import sys

print(sys.version)

3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]


In [3]:
# Environment variables
from dotenv import load_dotenv
import yaml
import os

# Load variables from a .env file into the process environment.
# load_dotenv() is the last expression, so its return value is displayed as the cell
# output (False in the recorded run -- presumably no .env file was found; verify).
load_dotenv()

False

# Parameters

We load the needed parameters from a yaml file.

In [4]:
# Load parameters from YAML file
import os

# Move into the project root so that relative paths (config.yaml, prepared_data/, ...)
# resolve no matter where the kernel was started. The location can be overridden with
# the TFM_PROJECT_ROOT environment variable; when it is unset the original default is used.
project_root = os.environ.get('TFM_PROJECT_ROOT', '/notebooks/TFM/TFM_LAW_LLM')
os.chdir(project_root)

import yaml
# safe_load only builds plain Python objects. The encoding is given explicitly so the
# parsed values do not depend on the platform's default locale.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Reference Documents

https://medium.com/@ayhamboucher/llm-based-context-splitter-for-large-documents-445d3f02b01b

https://www.pinecone.io/learn/chunking-strategies/

# Directory

In [5]:
# Set directory to file location
from pathlib import Path
import sys
# os.path.abspath('') resolves to the current working directory, so this chdir keeps the
# kernel where it already is. `os` is not imported in this cell; it comes from an earlier one.
notebook_location = Path(os.path.abspath(''))
os.chdir(notebook_location)

# Get the current working directory
current_directory = os.getcwd()
# Bare last expression so the path is displayed as the cell output
current_directory

'C:\\Users\\polri\\Desktop\\Git_TFM\\TFM_LAW_LLM'

# Libraries

In [6]:
# General libraries

# Time data management libraries
import time

# Garbage collection library: frees memory occupied by objects that are no longer in use by the program
import gc

# Local assets: utils contains functions we will use later
# NOTE(review): the star import hides which names come from utils. list_csv_files and
# count_tokens are called later without any other visible definition, so they presumably
# come from here -- confirm.
from utils import *

# Data manipulation
import pandas as pd

# Deep learning tasks
import torch

# Progress bars
# NOTE: the second import rebinds the name `tqdm` from the module to the object exported by tqdm.notebook.
import tqdm
from tqdm.notebook import tqdm

# Directory management
import os

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
# Warnings
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Start timing the notebook run; the final cell subtracts this timestamp from its own end time
start_time = time.time()

# Device

In [9]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# When CUDA is available, report the name and memory figures of GPU 0
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')

    bytes_per_gb = 1024**3  # the torch queries below report bytes
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # "Available" here means total capacity minus what the allocator has reserved
    available_memory = total_memory - cached_memory

    memory_report = (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    )
    for label, gigabytes in memory_report:
        print(label, round(gigabytes, 1), 'GB')

Using device: cpu



In [10]:
# Clean memory: empty PyTorch's CUDA cache, then force a garbage-collection pass.
# gc.collect() is the last expression, so its return value is shown as the cell output.
torch.cuda.empty_cache()
gc.collect()

20

# Load data

We load the data that we extracted and saved in the 1_extract_data.ipynb notebook.

In [11]:
# Params
# NOTE(review): absolute path tied to one machine; a later cell redefines
# local_folder_path with a different root.
local_folder_path = "/notebooks/TFM/TFM_LAW_LLM/raw_data/txt_files"
file_list = ["boe.txt"]

# List CSV files locally to see what we have
# NOTE(review): list_csv_files is not defined in this notebook (presumably provided by
# `from utils import *`). The folder name suggests it holds .txt files, which may be why
# the recorded output is an empty list -- confirm.
local_csv_files = list_csv_files(local_folder_path)
local_csv_files

[]

In [12]:
import os
import pandas as pd
from tqdm import tqdm

# Function to read a text file
def read_txt_file(file_path):
    """Return the entire contents of the UTF-8 encoded text file at `file_path`."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Params
local_folder_path = "C:/TFM/TFM_LAW_LLM/raw_data/txt_files"
file_list = ["boe.txt"]

# One {'text': ...} record is collected per file that loads successfully
text_data = []

# Try each listed file in turn; a failure is reported and the loop moves on
for file_name in file_list:
    try:
        local_file_path = os.path.join(local_folder_path, file_name)
        text = read_txt_file(local_file_path)
        print(f"Loaded {file_name} from local storage.")
        text_data.append(dict(text=text))
    except Exception as error:
        print(f"Error processing {file_name}: {error}")

# Turn the collected records into a DataFrame and print a preview of it
df_txt = pd.DataFrame(text_data)
print(df_txt.head())

Loaded boe.txt from local storage.
                                                text
0  LEY ORGÁNICA 10/1995, DE 23 DE NOVIEMBRE, DEL\...


In [14]:
# Read the whole BOE text file as UTF-8.
# Note: from here on df_txt is a plain string (the file's contents), not a DataFrame.
boe_txt_path = '/notebooks/TFM/TFM_LAW_LLM/raw_data/txt_files/boe.txt'
with open(boe_txt_path, mode='r', encoding='utf-8') as file:
    df_txt = file.read()

# Report how many characters were read
file_length = len(df_txt)
print("Length of the file:", file_length)

Length of the file: 731024


# Transformation Parameters

We set the maximum size a chunk can have and the chunk overlap size. Note that the splitter below measures length with `len`, so both values are counted in characters, not tokens.

In [17]:
# Maximum length of a text chunk (read from the loaded config.yaml)
max_chunk_size = config['max_chunk_size']

# Chunk overlap (read from the loaded config.yaml)
chunk_overlap_size = config['chunk_overlap']

# Separators, listed from coarsest (blank line) to finest (empty string)
separators = ["\n\n", "\n", ". ", " ", ""]

# Recursive Splitter

We will use a recursive splitter, as it has proven to be the best-performing splitter for problems like ours.

In [18]:
# Set splitter
# length_function=len means chunk size and overlap are measured in characters
splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_chunk_size,
    chunk_overlap=chunk_overlap_size,
    length_function=len,
    separators=separators
)

# Auxiliary function
def recursive_text_splitter(text):
    """Split `text` with the module-level `splitter` and return the resulting chunks."""
    return splitter.split_text(text)

# Break the loaded text into chunks
chunks = recursive_text_splitter(df_txt)

# Pair every chunk with a 1-based identifier ("chunk_1", "chunk_2", ...)
chunks_list = [
    {'text_id': f"chunk_{position}", 'text': chunk}
    for position, chunk in enumerate(chunks, start=1)
]

# One DataFrame row per chunk
splitted_df_v1 = pd.DataFrame(chunks_list)

# Preview the first rows as the cell output
splitted_df_v1.head()

We have now split the BOE into smaller chunks.

In [20]:
# Number of rows in the chunk DataFrame (one row per chunk)
len(splitted_df_v1)

TypeError: string indices must be integers, not 'str'

We now save the already splitted data.

In [None]:
# Save splitted data
path = 'prepared_data/'
csv_file_name_v1 = f'{path}splitted_input_base.csv'

# Make sure the output folder exists: to_csv does not create missing directories
os.makedirs(path, exist_ok=True)

# Write the DataFrame to a CSV file (without the index column)
splitted_df_v1.to_csv(csv_file_name_v1, index = False)

Above, we can see the individual chunks of the BOE, along with their format.

# Clean

In [None]:
# Clean memory: empty PyTorch's CUDA cache, then force a garbage-collection pass.
# gc.collect() is the last expression, so its return value is shown as the cell output.
torch.cuda.empty_cache()
gc.collect()

3837

# Runtime

We count the tokens produced by the recursive splitter and calculate the total runtime of the notebook.

In [None]:
# Count the tokens of every chunk, then add the per-chunk counts together
tokens_per_chunk = splitted_df_v1['text'].apply(count_tokens)
total_tokens_processed = tokens_per_chunk.sum()

# Show
print("Total number of tokens in the 'text' column:", total_tokens_processed)

Total number of tokens in the 'text' column: 238500


In [None]:
# Stop the clock and measure the whole run in seconds
end_time = time.time()
elapsed_time = end_time - start_time

# Break the seconds down into whole hours and the whole minutes left over
whole_hours, leftover_seconds = divmod(elapsed_time, 3600)
hours = int(whole_hours)
minutes = int(leftover_seconds // 60)

# Print the result
print(f"Time elapsed: {hours} hours and {minutes} minutes.")

Time elapsed: 0 hours and 24 minutes.
