# 0. Imports

In [1]:
import re
import pandas as pd
import os

from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import langchain
from langchain_chroma import Chroma
import chromadb
from sentence_transformers import CrossEncoder



from langchain_huggingface import HuggingFaceEmbeddings
import itertools


from langchain.memory import ConversationBufferMemory
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_text_splitters import RecursiveCharacterTextSplitter



from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker


from evaluate import load


import torch
print(torch.cuda.is_available())
pd.set_option("display.max_rows", 100)


main_path = os.path.dirname(os.getcwd())

  from tqdm.autonotebook import tqdm, trange


True


## 0.1. Functions

In [2]:
def limpar_texto(texto):
    # Transformar em minúsculas
    texto = texto.lower()
    # Remover caracteres específicos
    texto = texto.replace("(", "").replace(")", "").replace(",", "").replace("\n", " ").replace("\xa0"," ")
    texto = re.sub(r"\. ", " ", texto)
    texto = re.sub(r"\! ", " ", texto)
    texto = re.sub(r"\" ", " ", texto)
    texto = re.sub(r'"(\w+)', r'\1', texto)  # Remove apenas a aspas antes da palavra
    texto = re.sub(r"\: ", " ", texto)
    texto = texto.replace("  ", " ")
    texto = texto.replace(" - ", " ")
    texto = re.sub(r"'s\b", "", texto)  # Remove apenas 's no final de palavras
    
    return texto

def limpar_texto_testset(texto):
    # Transformar em minúsculas
    texto = texto.lower()
    # Remover caracteres específicos
    texto = texto.replace("(", "").replace(")", "").replace(",", "").replace("\n", " ").replace("\xa0"," ")
    texto = re.sub(r"\. ", " ", texto)
    texto = re.sub(r"\! ", " ", texto)
    texto = re.sub(r"\" ", " ", texto)
    texto = re.sub(r'"(\w+)', r'\1', texto)  # Remove apenas a aspas antes da palavra
    texto = re.sub(r"\: ", " ", texto)
    texto = texto.replace("  ", " ")
    texto = texto.replace(" - ", " ")
    texto = re.sub(r"'s\b", "", texto)  # Remove apenas 's no final de palavras

    texto = re.sub(r"\?", " ", texto)

    return texto

def BM25TextPreparation(data,Series_metadata, chunk_size):
    text_splitter =  RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,  # Max number of characters per chunk
    # chunk_overlap=chunk_overlap  # Overlap between chunks for better context retention
    )

    # Step 2: Prepare an empty list to store documents with their associated metadata
    split_docs = []

    # Step 3: Loop through each record in `data["Final"]` and split it, keeping the source ("Fonte")
    for i, final_text in enumerate(data):
        # Split the text into chunks
        docs = text_splitter.create_documents([final_text])
        
        # Add the "Fonte" to the metadata of each document
        for doc in docs:
        
            doc.metadata = {"Source":Series_metadata[i]}  # Associate each chunk with the correct source
            split_docs.append(doc)  # Store the document with the metadata

    texts = [str(doc.page_content) for doc in split_docs]

    return split_docs



# 1. Data

## 1.1 Dataset

In [3]:
# need to import the Corpus again to create the sparse vector for BM25Retriever
df_db = pd.read_excel(main_path+"\\2_Data\\1_Final\\Dataset_Final.xlsx").drop("Unnamed: 0", axis=1)
# Clean text in order to have exactly the same text that is stored in the vector databases. Otherwise the Reciprocal Ranking Fusion formula will not work.
df_db["Texto_lower"] = [limpar_texto(texto) for texto in df_db["Texto"].to_list()]
# list with the documents that compõe o Corpus with 7000 chunk-size
texts_3000 = BM25TextPreparation(df_db["Texto_lower"].to_list(),df_db["Source"],3000)

## 1.2. Test Set

In [4]:
df_test = pd.read_excel(main_path+"\\2_Data\\3_TestSet\\FAQ.xlsx",sheet_name="Question-Context-Answer")
df_test['Context'] = df_test['Context'].str.replace(r'\\n', '\n', regex=True)
df_test = df_test.groupby('Question')[['Answer']].agg("first").reset_index()
df_test["Question_lower"] = [limpar_texto_testset(texto) for texto in df_test["Question"].to_list()]


# 2. Batch

In [5]:
RAG_Batch = {
    "Vector Store": "DB_Porto_Final_alibaba_embeddings_treated_3000",
    "Top-N" : [50],
    "Top-k": 8,
    "Embeddings_model" : "Alibaba-NLP/gte-large-en-v1.5",
    "Generator model" : ["llama3.2:3b","llama3.1:8b"]
}

# 3. Vector Database

In [6]:
model_kwargs = {'device': 'cuda',"trust_remote_code":True}

embeddings = HuggingFaceEmbeddings(
    model_name=RAG_Batch["Embeddings_model"],
    model_kwargs=model_kwargs,
)


vector_store = Chroma(collection_name=RAG_Batch["Vector Store"],
                      persist_directory=main_path+"\\2_Data\\2_DBs\\",
                      embedding_function=embeddings)

# 4. Chatbot

In [None]:
bm25_retriever = BM25Retriever.from_documents(texts_3000)
bm25_retriever.k = RAG_Batch["Top-N"][0]
retriever = vector_store.as_retriever(search_kwargs={"k": RAG_Batch["Top-N"][0]})

ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.5, 0.5])
CrossEncoder_model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", model_kwargs = {'device': 'cuda'})
compressor = CrossEncoderReranker(model=CrossEncoder_model, top_n=RAG_Batch["Top-k"])
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=ensemble_retriever)

def Chatbot(model, query):  
    system_prompt = (
    """
    System: This is a Chatbot that only answers to questions related to Porto (Portugal) Tourism. More specifically, to topics related to attractions, accessibility, amenities, activities, available packages, and Ancillary Services. 
    When not specified by the user assume the question is related to Porto.
    If the question is not about Porto Tourism just write: "I am sorry, but my knowledge only allows me to help you with Porto Tourism topics. Can I help you with something related to Porto Tourism?"
    
    Answer to the user's question objectively, using correct syntax and based on context written below: 
    {context}\n

    User: {input}
    """
    )

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("system", "{context}"),
            ("human", "{input}"),
        ]
    )


    generator_model = ChatOllama(
                    model=model,
                    temperature=0,
                    num_gpu=1,
                    num_ctx=50000
                )
        
        
    # Chain que monta o prompt para passar no modelo
    question_answer_chain = create_stuff_documents_chain(generator_model, prompt)
    
    # chain que pega no contexto e adiciona ao pompt para passar no modelo
    rag_chain = create_retrieval_chain(compression_retriever, question_answer_chain)
    
    response = rag_chain.invoke(
    {"input":query})

    return response["answer"]

In [8]:
Chatbot(RAG_Batch["Generator model"][0], "is porto people friendly")

"Yes, Porto is a very people-friendly city. The locals are known for their warm hospitality and welcoming nature, making visitors feel at ease and comfortable in the city.\n\nPortuguese culture places a strong emphasis on social interaction and community, which contributes to the city's friendly atmosphere. You'll often find that locals are eager to chat with tourists and share recommendations or advice about the best places to visit or eat.\n\nAdditionally, Porto has a lively downtown area with many cafes, restaurants, and bars where you can meet new people and make friends. The city also hosts various festivals and events throughout the year, which bring together locals and visitors alike.\n\nSome of the safest and most beautiful neighborhoods to visit in Porto are Foz do Douro, Boavista, Matosinhos, and Cedofeita, which offer a great balance of charm, culture, and people-watching opportunities."

# 5. GridSearch

In [9]:
bertscore = load("bertscore")
SAS = CrossEncoder('cross-encoder/stsb-roberta-large')

resultados = {
    "Generator Model": [],
    "Question" : [],
    "Reference" : [],
    "Prediction" : [],
    "Recall": [],
    "Precision" : [],
    "F1-Score" : [],
    "SAS" : []
}
i=1
for llm in RAG_Batch["Generator model"]:
    for question,answer in zip(df_test["Question_lower"],df_test["Answer"]):
        print(i,question)
        i=i+1
        question_cleand = limpar_texto(question)
        prediction = Chatbot(
                            model=llm,
                            query=question)

        results = bertscore.compute(predictions=[prediction], references=[answer], model_type="distilbert-base-uncased")
        sas_score = SAS.predict([(answer, prediction)])


        resultados["Generator Model"].append(llm)
        resultados["Question"].append(question)
        resultados["Reference"].append(answer)
        resultados["Prediction"].append(prediction)
        resultados["Recall"].append(round(results["recall"][0],2))
        resultados["Precision"].append(round(results["precision"][0],2))
        resultados["F1-Score"].append(round(results["f1"][0],2))
        resultados["SAS"].append(round(sas_score[0],2))


##### Testes aos Modelos Baseline
print("baseline models started")

llamma3B_model = ChatOllama(
    model=RAG_Batch["Generator model"][0],
    temperature=0,
    num_gpu=1,
    num_ctx=50000
)

llam8b_model = ChatOllama(
    model=RAG_Batch["Generator model"][1],
    temperature=0,
    num_gpu=1,
    num_ctx=50000
)

i=1
for question,answer in zip(df_test["Question"],df_test["Answer"]):
    i=i+1
    print(i,question)
    # Llama3b model baseline
    prediction = llamma3B_model.invoke(question).content

    results = bertscore.compute(predictions=[prediction], references=[answer], model_type="distilbert-base-uncased")
    sas_score = SAS.predict([(answer, prediction)])
        
    resultados["Generator Model"].append("llama3.2-3B - baseline")
    resultados["Question"].append(question)
    resultados["Reference"].append(answer)
    resultados["Prediction"].append(prediction)
    resultados["Recall"].append(round(results["recall"][0],2))
    resultados["Precision"].append(round(results["precision"][0],2))
    resultados["F1-Score"].append(round(results["f1"][0],2))
    resultados["SAS"].append(round(sas_score[0],2))

    # Llama8b model baseline
    prediction = llam8b_model.invoke(question).content
    results = bertscore.compute(predictions=[prediction], references=[answer], model_type="distilbert-base-uncased")
    sas_score = SAS.predict([(answer, prediction)])
        
    resultados["Generator Model"].append("Llama3.1-8B - baseline")
    resultados["Question"].append(question)
    resultados["Reference"].append(answer)
    resultados["Prediction"].append(prediction)
    resultados["Recall"].append(round(results["recall"][0],2))
    resultados["Precision"].append(round(results["precision"][0],2))
    resultados["F1-Score"].append(round(results["f1"][0],2))
    resultados["SAS"].append(round(sas_score[0],2))
    

df_resultados = pd.DataFrame(resultados)
df_resultados.head()

1 are there any festivals or events happening in porto 
2 are there any guided tours available in porto 
3 best wineries to visit in vila nova de gaia
4 can you recommend any traditional portuguese restaurants in porto 
5 can you recommend any wine tours in porto 
6 give me suggestions of restaurants in matosinhos i heard it has good fish
7 how much is a metro ticket more or less 
8 how much is the price of a taxi from the airport to the city centre 
9 i am going to porto in june is there any main events happening there during this time 
10 i want to have lunch in a portuguese restaurant with a view to the river or to the sea what do you suggest 
11 is porto card worth it or is not necessary 
12 is porto a walkable city 
13 is porto safe  are there any places that is not recommended to go 
14 is são bento train station a nice place to visit 
15 is ibis hotel in lisbon near the city center 
16 is porto people friendly 
17 is there any museum to learn about porto wine 
18 tell me about a

Unnamed: 0,Generator Model,Question,Reference,Prediction,Recall,Precision,F1-Score,SAS
0,llama3.2:3b,are there any festivals or events happening in...,"Yes, Porto has events that happen every year s...","Yes, Porto has a vibrant festival scene throug...",0.81,0.73,0.77,0.48
1,llama3.2:3b,are there any guided tours available in porto,"Yes, there are guided tours through Porto. The...","Yes, there are many guided tours available in ...",0.92,0.87,0.89,0.77
2,llama3.2:3b,best wineries to visit in vila nova de gaia,"Ferreira, Sandeman, Grahams and Cálem Cellars ...",Vila Nova de Gaia is a must-visit destination ...,0.88,0.71,0.79,0.6
3,llama3.2:3b,can you recommend any traditional portuguese r...,Porto's most traditional restaurants are: Cas...,Some of the most traditional Portuguese restau...,0.92,0.76,0.83,0.54
4,llama3.2:3b,can you recommend any wine tours in porto,"Yes, some wines tours that I can recommedn are...",The best porto guided tours that you can exper...,0.79,0.72,0.75,0.54


In [10]:
df_resultados.to_excel(main_path+"\\3_Results\\1_Tabelas\\2_Resultados_RAGModel.xlsx")

pivot_table = df_resultados.pivot_table(
    index=["Generator Model"],
    values=["F1-Score","Precision","Recall","SAS"],
    aggfunc="mean"
)

styled_pivot_table = (
    pivot_table.style
    .format({"F1-Score": "{:.2f}", "Precision": "{:.2f}", "Recall": "{:.2f}", "SAS": "{:.2f}"})  # Arredondar a 2 casas decimais
    .set_table_attributes('style="text-align: center;"')  # Centralizar
)

styled_pivot_table.to_excel(main_path+"\\3_Results\\1_Tabelas\\3_Resultados_Final_Sumario.xlsx")

styled_pivot_table


Unnamed: 0_level_0,F1-Score,Precision,Recall,SAS
Generator Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Llama3.1-8B - baseline,0.72,0.66,0.79,0.52
llama3.1:8b,0.82,0.77,0.87,0.64
llama3.2-3B - baseline,0.72,0.66,0.78,0.54
llama3.2:3b,0.8,0.76,0.86,0.61
