# 0. Imports

In [1]:
import os
import re

import langchain
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
import chromadb
from langchain_openai import AzureOpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

import pandas as pd
pd.set_option("display.max_rows", 100)
import itertools
from collections import Counter
import string
import torch
# Report whether PyTorch can see a CUDA device in this kernel.
print(torch.cuda.is_available())

# Parent directory of the current working directory; later cells build
# their data and database paths relative to it.
main_path = os.path.dirname(os.getcwd())

True


## 0.1. Batch configuration

In [2]:
# Settings swept when building the vector databases: one pair of
# collections is created per chunk size, using the first embeddings model.
Batch_retriever = {
    # Maximum number of characters per chunk handed to the text splitter.
    "Chunk-size": [3000, 7000],
    # Hugging Face model identifiers used to embed the chunks.
    "Embeddings model": ["Alibaba-NLP/gte-large-en-v1.5"],
}

## 0.2. Data

In [3]:
# Load the final dataset. The path is assembled with os.path.join so the
# notebook also runs on non-Windows systems (a hard-coded "\\" separator is
# only a path separator on Windows). The "Unnamed: 0" column is dropped.
df_db = pd.read_excel(
    os.path.join(main_path, "2_Data", "1_Final", "Dataset_Final.xlsx")
).drop("Unnamed: 0", axis=1)

In [4]:
# Frequency of every ASCII punctuation character across all texts.
todos_os_textos = " ".join(df_db['Texto'])
pontuacoes = list(filter(lambda ch: ch in string.punctuation, todos_os_textos))
contador_pontuacoes = Counter(pontuacoes)

# One row per punctuation character, in order of first appearance.
df_pontuacoes = pd.DataFrame(
    list(contador_pontuacoes.items()),
    columns=['Punctuation', 'Frequency'],
)

# Display the table with the most frequent characters first.
df_pontuacoes.sort_values(by="Frequency", ascending=False, ignore_index=True)

Unnamed: 0,Punctuation,Frequency
0,.,116839
1,",",91281
2,-,77779
3,:,58874
4,/,26480
5,_,26147
6,=,18330
7,&,11730
8,+,3740
9,?,2976


In [5]:
def limpar_texto(texto):
    """Return a lower-cased copy of ``texto`` with selected punctuation removed.

    Steps, applied in this order:
      1. lower-case the text;
      2. delete "(", ")" and ","; turn newlines and non-breaking spaces
         into plain spaces;
      3. replace ". ", "! ", '" ' and ": " with a single space, and drop a
         double quote that directly precedes a word;
      4. collapse double spaces (one pass only, so longer runs are merely
         shortened) and replace " - " with a space;
      5. strip a trailing "'s" from words.

    Args:
        texto: The string to clean.

    Returns:
        The cleaned string.
    """
    limpo = texto.lower()

    # Plain character substitutions.
    for alvo, novo in (("(", ""), (")", ""), (",", ""), ("\n", " "), ("\xa0", " ")):
        limpo = limpo.replace(alvo, novo)

    # Regex substitutions; the order matters because the closing-quote rule
    # must run before the opening-quote rule.
    substituicoes = (
        (r"\. ", " "),
        (r"\! ", " "),
        (r"\" ", " "),
        (r'"(\w+)', r'\1'),  # drop only the quote in front of a word
        (r"\: ", " "),
    )
    for padrao, novo in substituicoes:
        limpo = re.sub(padrao, novo, limpo)

    limpo = limpo.replace("  ", " ").replace(" - ", " ")

    # Remove only a possessive 's at the end of a word.
    return re.sub(r"'s\b", "", limpo)

# Apply the cleaning function to every entry of the "Texto" column.
df_db["Texto_lower"] = list(map(limpar_texto, df_db["Texto"].to_list()))

# 1. Vector database builder

In [6]:
def DB(db_name, df_page_doc, Series_metadata, chunk_size, embedding_model, path, device=None):
    """Split texts into chunks, embed them and store them in a Chroma collection.

    Every text in ``df_page_doc`` is split into chunks of at most
    ``chunk_size`` characters (the splitter's default overlap is used). Each
    chunk is tagged with the source of the text it came from, embedded with a
    Hugging Face model and added to a persistent Chroma collection configured
    for cosine distance.

    Args:
        db_name: Name of the Chroma collection to create or reuse.
        df_page_doc: pandas Series holding the texts to index.
        Series_metadata: Sources aligned by position with ``df_page_doc``
            (a pandas Series or any iterable of the same length).
        chunk_size: Maximum number of characters per chunk.
        embedding_model: Hugging Face model identifier used for embeddings.
        path: Directory in which the Chroma database is persisted.
        device: Device the embedding model runs on. Defaults to "cuda" when
            available, otherwise "cpu".

    Returns:
        A ``langchain_chroma.Chroma`` vector store bound to the collection.

    Raises:
        ValueError: If the texts and the sources differ in length.
    """
    page_texts = df_page_doc.to_list()
    # Pair texts and sources by position; indexing the Series with the loop
    # counter would be a label lookup and break on a non-default index.
    sources = list(Series_metadata)
    if len(page_texts) != len(sources):
        raise ValueError(
            f"Got {len(page_texts)} texts but {len(sources)} sources; "
            "they must be aligned one-to-one."
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # Max number of characters per chunk
    )

    # Split every text and attach the source of its row to each chunk.
    split_docs = []
    for page_text, source in zip(page_texts, sources):
        for doc in text_splitter.create_documents([page_text]):
            doc.metadata = {"Source": source}
            split_docs.append(doc)

    texts = [str(doc.page_content) for doc in split_docs]
    metadata = [doc.metadata for doc in split_docs]
    # Chroma requires string ids; chunks are numbered in insertion order.
    ids = [str(i) for i in range(len(split_docs))]

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        model_kwargs={"device": device, "trust_remote_code": True},
    )

    client = chromadb.PersistentClient(path=path)
    collection = client.get_or_create_collection(
        name=db_name, metadata={"hnsw:space": "cosine"}
    )
    print("db created")

    # Embed the chunk text itself. str(Document) renders the whole object
    # (page_content plus metadata), which would make the stored vectors
    # disagree with the stored documents. Chunks are embedded one at a time,
    # as before, to keep the memory footprint of long chunks unchanged.
    embeddings_final = [embeddings.embed_documents([text])[0] for text in texts]
    print("embeddings finished")

    # Nothing to add when the input produced no chunks.
    # NOTE(review): ids restart at "0" on every call, so re-running against
    # an existing collection reuses the same ids - confirm that is intended.
    if texts:
        collection.add(
            documents=texts,
            embeddings=embeddings_final,
            metadatas=metadata,
            ids=ids,
        )

    vector_store = Chroma(
        collection_name=db_name,
        persist_directory=path,
        embedding_function=embeddings,
    )

    return vector_store

# 2. Create the vector databases

In [7]:
# Build two collections per configured chunk size: one from the raw texts
# and one from the cleaned ("treated") texts.
embedding_model_name = Batch_retriever["Embeddings model"][0]

# os.path.join keeps the notebook portable; the trailing "" preserves the
# trailing separator of the original path string.
db_dir = os.path.join(main_path, "2_Data", "2_DBs", "")

# (collection-name prefix, text column) for each variant to index.
text_variants = (
    ("DB_Porto_Final_alibaba_embeddings_", "Texto"),
    ("DB_Porto_Final_alibaba_embeddings_treated_", "Texto_lower"),
)

for chunk_size in Batch_retriever["Chunk-size"]:
    print(chunk_size)
    for name_prefix, text_column in text_variants:
        vectordb_name = name_prefix + str(chunk_size)
        DB(vectordb_name,
           df_db[text_column],
           df_db["Source"],
           chunk_size,
           embedding_model=embedding_model_name,
           path=db_dir)
    

3000


  from tqdm.autonotebook import tqdm, trange


db created
embeddings finished
db created
embeddings finished
7000
db created
embeddings finished
db created
embeddings finished
