# Encode data

# Setup

In [1]:
# Report which Python interpreter the kernel is running
import sys
print(sys.version)

In [2]:
# Environment Variables
from dotenv import load_dotenv
import yaml
import os

# Load env: read key/value pairs from a .env file into os.environ.
# The bare call is the cell's last expression, so its return value is
# displayed as the cell output.
# NOTE(review): presumably this is where PINECONE_API_KEY comes from - confirm.
load_dotenv()

True

In [None]:
# Torch config
from torch import cuda, bfloat16, float16
import torch

# Torch options: switch off the memory-efficient and flash
# scaled-dot-product-attention backends for CUDA.
# NOTE(review): the reason for disabling them is not recorded here - confirm
# whether this is still required for the embedding model in use.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
# Jupyter extensions: shell out to enable the widgetsnbextension notebook extension.
# NOTE(review): presumably needed so the tqdm.notebook progress bar renders - confirm.
!jupyter nbextension enable --py widgetsnbextension

# Parameters

In [None]:
# Load parameters from YAML file.
# `config` is read later for the "embedding_model" and "batch_size" keys.
# The encoding is pinned to UTF-8 so the file parses the same way regardless
# of the platform's default locale encoding.
with open('config.yaml', 'r', encoding='utf-8') as file:
    # safe_load only builds plain Python objects (no arbitrary object construction)
    config = yaml.safe_load(file)

# Reference

- https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/generation/llm-field-guide/llama-2/llama-2-13b-retrievalqa.ipynb#scrollTo=yjs-uPXBrnQs
- https://github.com/langchain-ai/langchain/blob/master/cookbook/RAPTOR.ipynb
- https://www.youtube.com/watch?v=LhnCsygAvzY&ab_channel=JamesBriggs
- https://huggingface.co/spaces/mteb/leaderboard

# Directory

In [3]:
# Set directory to file location
# NOTE(review): os.path.abspath('') resolves to the *current working directory*,
# not to this notebook's file path, so the chdir below re-selects the directory
# the kernel is already in. It only matches the notebook location when the
# kernel was started there - confirm that is the intent.
from pathlib import Path
import sys
notebook_location = Path(os.path.abspath(''))
os.chdir(notebook_location)
# Get the current working directory (displayed as the cell output)
current_directory = os.getcwd()
current_directory

'/notebooks/LawGPT'

# Libraries

In [4]:
# General
import numpy as np
import pandas as pd
import pinecone
import time
import yaml
import gc
import os

# Plots
import matplotlib.pyplot as plt

# HuggingFace
from huggingface_hub import notebook_login

# Transformers
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from datasets import load_dataset
import transformers

# Pinecone
from pinecone import Pinecone

# Optimization
import xformers

# Tiktoken
import tiktoken

# Other
from tqdm.notebook import tqdm

# Local
from functions import *

  from tqdm.autonotebook import tqdm


In [None]:
# Warnings: install a blanket "ignore" filter, silencing every Python warning
# raised from this point on in the kernel session.
# NOTE(review): this also hides deprecation and pandas warnings that may point
# at real problems; consider narrowing the filter by category or module.
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Start timing the notebook run.
# Wall-clock timestamp (seconds since the epoch); the final cell subtracts it
# from a second time.time() reading to report the elapsed time.
start_time = time.time()

# Device

In [6]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print('Using device:', device)
print()

# CUDA information: device name plus a memory summary for GPU 0, in GB
if has_cuda:
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor used to convert byte counts to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # "Available" is what is left once the reserved (cached) pool is subtracted
    available_memory = total_memory - cached_memory
    memory_report = (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    )
    for label, gigabytes in memory_report:
        print(label, round(gigabytes, 1), 'GB')

In [8]:
# Clean memory: ask torch to empty its CUDA cache, then run a full Python
# garbage-collection pass. gc.collect() is the last expression, so its
# return value is shown as the cell output.
torch.cuda.empty_cache()
gc.collect()

Using device: cpu



# Load data

In [10]:
# Path pieces for the chunked input CSV
folder_path = "prepared_data/"
file_name = "splitted_input_base.csv"
file_path = folder_path + file_name

# Load the CSV and cast both identifier columns to string in one step
df_txt = pd.read_csv(file_path).astype({'id': str, 'text_id': str})

# Preview the first rows
df_txt.head()

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1978-7433,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 17/1978, de 15 de marzo, sobre modificació...",1978-03-18,Estatal,Jefatura del Estado,Ley,1978-7433_chunk1,"Ley 17/1978, de 15 de marzo, sobre modificació..."
1,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk1,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
2,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk2,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
3,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk3,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
4,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk4,"Ley 20/1978, de 8 de mayo, sobre modificación ..."


In [11]:
# Length of file: number of rows in the loaded frame
# (each row carries one text chunk, identified by text_id)
len(df_txt)

3309

# Filter for Pinecone insert

## Number of chunks

In [14]:
# Maximum number of chunks: upper bound on how many rows of df_txt are kept
# for insertion (applied with DataFrame.head in the following cell)
max_number_chunks = 100000

In [15]:
# Limit the DataFrame size: keep only the first max_number_chunks rows.
# head(n) returns the whole frame when it has fewer than n rows.
# Note that df_txt is rebound in place, so the unfiltered frame is no longer
# reachable after this cell runs.
df_txt = df_txt.head(max_number_chunks)

In [16]:
# Show the first rows after applying the row cap
df_txt.head()

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1978-7433,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 17/1978, de 15 de marzo, sobre modificació...",1978-03-18,Estatal,Jefatura del Estado,Ley,1978-7433_chunk1,"Ley 17/1978, de 15 de marzo, sobre modificació..."
1,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk1,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
2,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk2,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
3,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk3,"Ley 20/1978, de 8 de mayo, sobre modificación ..."
4,1978-12341,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley 20/1978, de 8 de mayo, sobre modificación ...",1978-05-09,Estatal,Jefatura del Estado,Ley,1978-12341_chunk4,"Ley 20/1978, de 8 de mayo, sobre modificación ..."


In [17]:
# Length of file: row count after applying the row cap
len(df_txt)

3309

## Byte size per vector

In [18]:
# Maximum bytes: threshold handed to filter_df_by_byte_count in the next cell.
# NOTE(review): presumably chosen to respect Pinecone's per-vector metadata
# size limit - confirm against the current Pinecone limits.
max_bytes = 40000

In [19]:
# Filter df_txt with the filter_df_by_byte_count helper (brought in by
# `from functions import *`), passing max_bytes as the threshold.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'sys' is not defined". `sys` is imported in this notebook,
# so the missing import is presumably inside the functions module - confirm
# and add `import sys` there.
df_txt = filter_df_by_byte_count(df_txt, max_bytes)

NameError: name 'sys' is not defined

In [None]:
# Length of file: row count after the byte-size filter
len(df_txt)

## Tokens per vector

In [None]:
# Check tokens count per vector.
# Work on a copy so df_txt itself is left untouched by this diagnostic.
df_tokens_check = df_txt.copy()

# Apply function: count_tokens (from the local functions module) is applied
# to every chunk's text
df_tokens_check['token_count'] = df_tokens_check['text'].apply(count_tokens)

# Sort so the longest chunks come first
df_tokens_check = df_tokens_check.sort_values(by='token_count', ascending=False)

# Filter: keep only the columns relevant for the inspection.
# The selection must be assigned back; a bare `df_tokens_check[cols]`
# expression in the middle of a cell is evaluated and thrown away.
cols = ['id', 'url', 'title', 'date', 'text_id', 'text', 'token_count']
df_tokens_check = df_tokens_check[cols]

# Show
df_tokens_check.head()

In [None]:
# Calculate average token count
average_token_count = df_tokens_check['token_count'].mean()

# Create a larger figure
plt.figure(figsize=(10, 5))

# Create histogram of tokens per chunk
plt.hist(df_tokens_check['token_count'], bins=20, color='darkblue', edgecolor='black')

# Add vertical line for average token count.
# The label is attached to the line itself so the legend entry is guaranteed
# to describe the dashed line; passing a bare list of labels to plt.legend()
# assigns them by artist order and can end up labelling the histogram bars.
plt.axvline(
    average_token_count,
    color='red',
    linestyle='dashed',
    linewidth=1,
    label='Average Token Count',
)

# Add labels and title
plt.xlabel('Token Count')
plt.ylabel('Frequency')
plt.title('Histogram of Token Count')

# Add legend (built from the labelled artists only)
plt.legend()

# Show histogram
plt.show()

# Pinecone

In [None]:
# Init pinecone client with the API key taken from the environment.
# The client is bound to `pc` rather than `pinecone` so it does not shadow
# the `pinecone` module imported in the libraries cell.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Connect to the target index
index_name = 'lawgpt-unstructured-db'
index = pc.Index(index_name)

# Index stats (displayed as the cell output)
index.describe_index_stats()

# Embedding model

In [None]:
# Model ID: name of the embedding model, read from the "embedding_model"
# key of the YAML config
embed_model_id = config["embedding_model"]

# Show
embed_model_id

In [None]:
# Embed model: LangChain HuggingFaceEmbeddings wrapper around embed_model_id.
# `device` (selected in the Device section) is passed both when loading the
# model and when encoding; texts are encoded with a batch size of 32.
# NOTE(review): this encode batch size (32) is separate from the upsert
# batch_size read from config further down - confirm both are intended.
embed_model = HuggingFaceEmbeddings(
    model_name = embed_model_id,
    model_kwargs = {'device': device},
    encode_kwargs = {'device': device, 'batch_size': 32}
)

# Show
embed_model

In [None]:
# CUDA information: device name plus a memory summary for GPU 0, in GB
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor used to convert byte counts to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # "Available" is what is left once the reserved (cached) pool is subtracted
    available_memory = total_memory - cached_memory
    memory_report = (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    )
    for label, gigabytes in memory_report:
        print(label, round(gigabytes, 1), 'GB')

# Delete existing data

In [None]:
# Delete from index.
# WARNING: destructive - delete_all=True is passed, so the existing contents
# are wiped before the re-insert below. Running this cell without running the
# insert loop afterwards leaves the index without the data.
index.delete(delete_all = True)

In [None]:
# Stats: index statistics after the delete, to check the index was emptied
index.describe_index_stats()

# Insert to DB

In [None]:
# Define batch size: number of rows embedded and upserted per loop iteration,
# read from the "batch_size" key of the YAML config
batch_size = config['batch_size']

In [None]:
# Embed and insert loop.
# For each batch of rows: embed the chunk texts, pair every embedding with its
# text_id and a metadata dict, and upsert the triples into the Pinecone index.

# Columns stored as metadata alongside each vector
metadata_cols = ['id', 'url', 'title', 'date', 'legislative_origin',
                 'department', 'rang', 'text_id', 'text']

for i in tqdm(range(0, len(df_txt), batch_size), desc='Processing batches'):

    # Batch processing: iloc slicing clamps at the end of the frame, so the
    # last batch is simply shorter. Selecting the columns and casting with
    # astype(str) builds a new frame, which avoids assigning into a slice of
    # df_txt (chained assignment) and turns any float/NaN values into strings
    # in a single pass.
    batch = df_txt.iloc[i:i + batch_size][metadata_cols].astype(str)

    # Vector ids and the texts to embed
    text_ids = batch['text_id'].tolist()
    texts = batch['text'].tolist()

    # Embed texts
    embeds = embed_model.embed_documents(texts)

    # Get metadata to store in DB: one {column: value} dict per row
    metadata = batch.to_dict(orient='records')

    # Materialise the (id, embedding, metadata) triples as a list so the
    # payload is a concrete sequence rather than a one-shot iterator
    data_to_upsert = list(zip(text_ids, embeds, metadata))

    # Upsert to Pinecone
    index.upsert(vectors=data_to_upsert)

In [None]:
# Stats: index statistics after the insert loop, to check the upserted
# vectors are reported by the index
index.describe_index_stats()

# Clean

In [None]:

# CUDA information: device name plus a memory summary for GPU 0, in GB
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor used to convert byte counts to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # "Available" is what is left once the reserved (cached) pool is subtracted
    available_memory = total_memory - cached_memory
    memory_report = (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    )
    for label, gigabytes in memory_report:
        print(label, round(gigabytes, 1), 'GB')

In [None]:
# Clean memory: ask torch to empty its CUDA cache, then run a full Python
# garbage-collection pass. gc.collect() is the last expression, so its
# return value is shown as the cell output.
torch.cuda.empty_cache()
gc.collect()

# Runtime

In [None]:
# Take the closing timestamp and measure the span since start_time
end_time = time.time()
elapsed_time = end_time - start_time

# Split the elapsed seconds into whole hours and the whole minutes left over
whole_hours, leftover_seconds = divmod(elapsed_time, 3600)
hours = int(whole_hours)
minutes = int(leftover_seconds // 60)

# Report the total runtime
print(f"Time elapsed: {hours} hours and {minutes} minutes.")