In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install -U langchain-community bitsandbytes chromadb datasets unidecode evaluate bert_score

In [2]:
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.chains import RetrievalQA
import os
from huggingface_hub import login
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from datasets import load_dataset
from unidecode import unidecode
import re
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import utils as chromautils
from evaluate import load
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

Loading txt files using TextLoader from LangChain to create Document objects

In [4]:
def load_data(path_dir_data):
    """Load every ``.txt`` file located directly inside a directory.

    Args:
        path_dir_data: Path of the directory to scan (sub-directories are
            not visited).

    Returns:
        A list with one entry per ``.txt`` file, in ``os.scandir`` order.
        Each entry is whatever ``TextLoader(file_path).load()`` returned
        for that file.
    """
    loaded_docs = []

    # The context manager releases the directory handle as soon as the scan ends.
    with os.scandir(path_dir_data) as entries:
        for entry in entries:
            # Skip sub-directories, even ones whose name ends in ".txt".
            if not entry.is_file():
                continue

            # splitext inspects only the final suffix of the file name, so
            # names with several dots, dotted parent directories and
            # extension-less files are all handled (split('.')[1] was not).
            _, file_extension = os.path.splitext(entry.name)
            if file_extension == '.txt':
                loaded_docs.append(TextLoader(entry.path).load())

    return loaded_docs

Calculating the average and the maximum sentence length to guide the choice of chunk size and overlap

In [5]:
def obtain_avg_max_len(path_dir_data):
    """Compute sentence-length statistics over the ``.txt`` files of a directory.

    A "sentence" is any piece obtained by splitting a file's text on ``'.'``;
    lengths are measured in characters.

    Args:
        path_dir_data: Path of the directory to scan (sub-directories are
            not visited).

    Returns:
        A tuple ``(max_length_sentence, avg_length)``: the length of the
        longest piece and the floor of the mean piece length. ``(0, 0)`` is
        returned when the directory holds no ``.txt`` file.
    """
    counter_sentences = 0
    sum_length_sentences = 0
    max_length_sentence = 0

    with os.scandir(path_dir_data) as entries:
        for entry in entries:
            # Only regular files whose final suffix is ".txt" are measured.
            if not entry.is_file() or os.path.splitext(entry.name)[1] != '.txt':
                continue

            # Explicit UTF-8 keeps the character counts independent of the
            # platform's default locale encoding.
            with open(entry.path, 'r', encoding='utf-8') as file:
                lengths = [len(sentence) for sentence in file.read().split('.')]

            # str.split always yields at least one piece, so `lengths` is never empty.
            counter_sentences += len(lengths)
            sum_length_sentences += sum(lengths)
            max_length_sentence = max(max_length_sentence, max(lengths))

    # No .txt file was found: avoid dividing by zero.
    if counter_sentences == 0:
        return 0, 0

    avg_length = sum_length_sentences // counter_sentences
    return max_length_sentence, avg_length

In [None]:
# Sentence-length statistics of the corpus; the tuple is echoed as the cell output.
max_len, avg_len = obtain_avg_max_len(path_dir_data='/kaggle/input/pentrurag')
(max_len, avg_len)

Splitting the text using RecursiveCharacterTextSplitter

In [7]:
def recursive_char_splitter(loaded_docs, chunk_size, chunk_overlap):
    """Split the loaded documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        loaded_docs: Iterable whose items are each handed to
            ``split_documents`` (one item per loaded file).
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        A flat list with the chunks of every item, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    all_chunks = []
    for doc in loaded_docs:
        all_chunks.extend(splitter.split_documents(doc))
    return all_chunks

Query preprocessing (only a few techniques are applied, because the texts come from a journalistic source)

In [8]:
class QueryPreprocessor:
    """Normalise a query string through a fixed sequence of text clean-ups.

    Every step rewrites ``self.query`` in place and returns ``None``; only
    ``apply_all_techniques`` returns the resulting string.
    """

    # Anything that is not a space, an ASCII letter, a digit or '/' is removed.
    _SPECIAL_CHARACTERS = re.compile(r'[^ A-Za-z0-9/]+')

    def __init__(self, query):
        self.query = query

    def lowercase_query(self):
        """Lower-case the stored query."""
        lowered = self.query.lower()
        self.query = lowered

    def remove_diacritics_from_query(self):
        """Replace the stored query with the output of ``unidecode``."""
        stripped = unidecode(self.query)
        self.query = stripped

    def remove_special_characters(self):
        """Drop every character outside spaces, ASCII letters, digits and '/'."""
        cleaned = self._SPECIAL_CHARACTERS.sub('', self.query)
        self.query = cleaned

    def apply_all_techniques(self):
        """Run lowercasing, diacritics removal and special-character removal, in that order.

        Returns:
            The fully preprocessed query string.
        """
        steps = (
            self.lowercase_query,
            self.remove_diacritics_from_query,
            self.remove_special_characters,
        )
        for step in steps:
            step()
        return self.query


Loading and splitting the data for retrieval; the BASELINE configuration was used for an initial experiment

In [None]:
# Chunk size used for the current experiment (the BASELINE used 50).
CHUNK_SIZE = 256

loaded_docs = load_data('/kaggle/input/pentrurag')
# The overlap is 20% of the longest sentence (`max_len` comes from the
# sentence-statistics cell). It is capped below CHUNK_SIZE because the splitter
# rejects a chunk_overlap larger than chunk_size.
ratio_overlap = min(int(0.2 * max_len), CHUNK_SIZE - 1)
split_documents_list = recursive_char_splitter(loaded_docs, CHUNK_SIZE, ratio_overlap)
# BASELINE: split_documents_list = recursive_char_splitter(loaded_docs, 50, 20)


Preparing the vector database and the embeddings; again, the BASELINE configuration was used for an initial experiment

In [None]:
# Sentence-embedding model used to vectorise every chunk.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Index the chunks in a Chroma vector store.
# NOTE(review): re-running this cell in the same kernel may add the chunks a
# second time to the same default collection - confirm and restart the kernel if so.
vector_database = Chroma.from_documents(
    documents=split_documents_list,
    embedding=embeddings,
)

# Retriever restricted to results whose similarity score reaches the threshold.
retriever_model = vector_database.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.3},
)
# BASELINE: retriever_model = vector_database.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.6})

Plotting the document embeddings to get a better understanding of the data

In [None]:
def plot_embeddings(doc_embeddings):
    """Scatter-plot the embeddings after reducing them to two PCA components.

    Args:
        doc_embeddings: The embedding vectors, passed as-is to
            ``PCA.fit_transform``.
    """
    projected = PCA(n_components=2).fit_transform(doc_embeddings)
    first_component, second_component = projected[:, 0], projected[:, 1]

    _, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(first_component, second_component, c='blue', alpha=0.7)
    ax.set_title('Embeddings visualization 2D')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.grid(True)
    plt.show()

# Read the stored vectors from the collection behind the vector store and plot them.
# NOTE(review): `_collection` is a private attribute of the store object and
# may change between library versions.
doc_embeddings = vector_database._collection.get(
    include=['embeddings'],
)['embeddings']
plot_embeddings(doc_embeddings)

Preprocessing a manually defined golden standard

In [12]:
def preprocess_golden_standard(golden_standard):
    """Return a copy of the golden standard whose queries are preprocessed.

    Args:
        golden_standard: Mapping from raw query text to its reference value.

    Returns:
        A new dict keyed by ``QueryPreprocessor(query).apply_all_techniques()``
        with the original values. When two queries preprocess to the same
        key, the later one wins.
    """
    return {
        QueryPreprocessor(raw_query).apply_all_techniques(): reference
        for raw_query, reference in golden_standard.items()
    }

Obtaining the relevant files from the retriever

In [13]:
def obtain_relevant_files(query_model):
    """Return the distinct source files of the chunks retrieved for a query.

    Args:
        query_model: Query text handed to the module-level ``retriever_model``.

    Returns:
        The ``metadata['source']`` values of the retrieved documents, each
        listed once, in order of first appearance.
    """
    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in LangChain.
    retrieved_chunks = retriever_model.invoke(query_model)

    # dict.fromkeys drops duplicates while keeping first-seen order, without
    # the quadratic `x not in list` scan.
    return list(dict.fromkeys(chunk.metadata['source'] for chunk in retrieved_chunks))

Evaluating the retriever with precision, recall and F1.

In [None]:
def calculate_precision_recall_for_retrieval(ground_truth, retrieved_files, query_model):
    """Score one retrieval result against its ground-truth files.

    Args:
        ground_truth: Files that should have been retrieved.
        retrieved_files: Files that were actually retrieved.
        query_model: The query the files belong to; unused, kept so existing
            call sites keep working.

    Returns:
        A list ``[precision, recall, f1]``. All three are ``0.0`` when
        nothing relevant was retrieved (this also covers an empty
        ``retrieved_files``).
    """
    true_positives = [file for file in retrieved_files if file in ground_truth]

    # Without a true positive every metric is zero; returning here also avoids
    # a 0/0 in the F1 formula. The result is a list, like the regular case.
    if not true_positives:
        return [0.0, 0.0, 0.0]

    false_positives = [file for file in retrieved_files if file not in ground_truth]
    false_negatives = [file for file in ground_truth if file not in retrieved_files]

    precision = len(true_positives) / (len(true_positives) + len(false_positives))
    recall = len(true_positives) / (len(true_positives) + len(false_negatives))
    f1 = 2 * (precision * recall) / (precision + recall)

    return [precision, recall, f1]

# Retrieval ground truth: maps each evaluation query to the list of source
# file paths that are expected to be retrieved for it.
ground_truth_files = {
    'Ce vârstă are Anamaria Federica Oana?' : ['/kaggle/input/pentrurag/file_3.txt'],
    'Cine este liderul campionatului de fotbal al Italiei?' : ['/kaggle/input/pentrurag/file_2 (1).txt'],
    'Cine este Lewis Hamilton?' : ['/kaggle/input/pentrurag/file_4.txt', '/kaggle/input/pentrurag/file_15.txt'],
    'Cine s-a confruntat pe Goodison Park?' : ['/kaggle/input/pentrurag/file_12.txt'],
    'Împotriva cui a debutat Rapid în noul an?' : ['/kaggle/input/pentrurag/file_14.txt'],
    'Cu ce notă a fost notat Andrei Rațiu de Sofascore?' : ['/kaggle/input/pentrurag/file_6.txt'],
    'Pe ce loc a terminat Unirea Urziceni Grupa G în sezonul 2009/10?' : ['/kaggle/input/pentrurag/file_9.txt'],
    'Cat va plăti Zenit St. Petersburg pentru Luiz Henrique?' : ['/kaggle/input/pentrurag/file_1 (1).txt'],
    'Câte victorii a strâns Tag Heuer din postura de sponsor principal?' : ['/kaggle/input/pentrurag/file_10.txt'],
    'Ce ar vrea PSG?' : ['/kaggle/input/pentrurag/file_5.txt'],
    'De ce spune Ralf Schumacher că Lewis Hamilton nu va avea impact la Ferrari?' : ['/kaggle/input/pentrurag/file_15.txt'],
    'Ce vârstă are Cahill, fostul antrenor al Simonei Halep?' : ['/kaggle/input/pentrurag/file_13.txt'],
    'Cum era pe vremuri jurnalismul?' : ['/kaggle/input/pentrurag/file_11.txt'],
    'Ce record a stabilit Andreas Almgren?' : ['/kaggle/input/pentrurag/file_7.txt'],
    'Ce naționalitate are Beatrice Chebet?' : ['/kaggle/input/pentrurag/file_8.txt'],
    'Spune-mi despre Radu Drăgușin' : ['/kaggle/input/pentrurag/file_12.txt', '/kaggle/input/pentrurag/file_5.txt']
}
# Running totals of the per-query metrics.
sum_prec = sum_recall = sum_f1 = 0

# Optional experiment: evaluate with preprocessed queries instead of the raw ones.
# ground_truth_files = preprocess_golden_standard(ground_truth_files)

for query, relevant_docs in ground_truth_files.items():
    retrieved_docs = obtain_relevant_files(query)
    metrics = calculate_precision_recall_for_retrieval(relevant_docs, retrieved_docs, query)

    query_precision, query_recall, query_f1 = metrics
    sum_prec += query_precision
    sum_recall += query_recall
    sum_f1 += query_f1

# Averages over all queries, printed in the order: precision, recall, F1.
number_of_queries = len(ground_truth_files)
print(sum_prec / number_of_queries)
print(sum_recall / number_of_queries)
print(sum_f1 / number_of_queries)

Defining the LLM

In [None]:
# Checkpoint used as the generator of the RAG pipeline.
model_name = "OpenLLM-Ro/RoLlama2-7b-Instruct"

# 4-bit NF4 quantisation with double quantisation; computations run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantised model and its matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Retrieval Augmented Generation

In [None]:
def rag_testing(query_model, temp, max_len):
    """Answer a query with the LLM, grounded on chunks fetched by the retriever.

    Relies on the module-level ``retriever_model``, ``tokenizer``, ``model``
    and ``pipeline``.

    Args:
        query_model: The question to answer.
        temp: Sampling temperature forwarded to the generation pipeline.
        max_len: Maximum number of new tokens to generate.

    Returns:
        The generated answer with surrounding whitespace stripped.
    """
    # Stop on EOS, and on "<|eot_id|>" only when the tokenizer actually knows
    # that token: an unknown token is typically mapped to the unknown-token id
    # (or None), which must not be registered as a terminator.
    terminators = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if eot_id is not None and eot_id != tokenizer.unk_token_id and eot_id not in terminators:
        terminators.append(eot_id)

    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in LangChain.
    retrieved_chunks = retriever_model.invoke(query_model)
    # Feed the model the text of the chunks, not the repr of the Document list
    # (which would inject metadata and escaped newlines into the prompt).
    context = "\n\n".join(chunk.page_content for chunk in retrieved_chunks)

    # NOTE(review): the template uses "<|start_header_id|>"/"<|eot_id|>" markers
    # while the notebook loads "OpenLLM-Ro/RoLlama2-7b-Instruct" - confirm that
    # this is the chat format the checkpoint expects.
    prompt_template = """
<|start_header_id|>user<|end_header_id|>
"Ești un asistent folositor, respectuos și onest. Te rog să răspunzi strict pe baza informațiilor oferite în Context. 
Dacă informațiile din Context nu sunt suficiente pentru a răspunde la întrebare sau răspunsul nu se află EXPLICIT în Context, spune clar acest lucru fără a inventa răspunsuri."
Question: {question}
Context: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    prompt = prompt_template.format(question=query_model, context=context)

    pipeline_generated_answer = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=temp,
        do_sample=True,
        return_full_text=False,
        max_new_tokens=max_len,
        eos_token_id=terminators,
        top_p = 0.9
    )

    llm_answer = pipeline_generated_answer(prompt)[0]['generated_text'].strip()

    return llm_answer


In [18]:
# Manually written golden standard: maps each evaluation query to its
# reference answer, used as the reference when scoring the LLM answers.
golden_standard = {
    'Ce vârstă are Anamaria Federica Oana?' : '17 ani.',
    'Cine este liderul campionatului de fotbal al Italiei?' : 'Napoli este liderul campionatului de fotbal al Italiei.',
    'Cine este Lewis Hamilton?' : 'Lewis Hamilton este un pilot de curse britanic care a câștigat șapte campionate mondiale de Formula 1, fiind cel mai de succes pilot din istoria sportului. El este cunoscut pentru stilul său de conducere agresiv și pentru abilitatea sa de a câștiga curse în condiții dificile.',
    'Cine s-a confruntat pe Goodison Park?' : 'Everton și Tottenham s-au confruntat pe Goodison Park.',
    'Împotriva cui a debutat Rapid în noul an?' : 'Rapid debutează în noul an împotriva celor de la Poli Iași, luni seara, pe Giulești, de la 20:00.',
    'Cu ce notă a fost notat Andrei Rațiu de Sofascore?' : 'Andrei Rațiu a fost notat cu 7,3 de Sofascore',
    'Pe ce loc a terminat Unirea Urziceni Grupa G în sezonul 2009/10?' : 'În sezonul 2009/10, Unirea Urziceni a terminat Grupa G pe locul trei, cu opt puncte, deși adversare i-au fost FC Sevilla, VfB Stuttgart și Glasgow Rangers.',
    'Cat va plăti Zenit St. Petersburg pentru Luiz Henrique?' : 'Zenit St. Petersburg va plăti 35 de milioane de euro pentru Luiz Henrique',
    'Câte victorii a strâns Tag Heuer din postura de sponsor principal?' : 'Tag Heuer a strâns 230 de victorii',
    'Ce ar vrea PSG?' : 'PSG ar vrea un fundaș, mai ales că se va despărți de Milan Skriniar',
    'De ce spune Ralf Schumacher că Lewis Hamilton nu va avea impact la Ferrari?' : 'Ralf Schumacher spune că Lewis Hamilton nu va avea impactul așteptat la Ferrari, pentru că se află pe final de carieră.',
    'Ce vârstă are Cahill, fostul antrenor al Simonei Halep?' : 'Cahill are 59 de ani.',
    'Cum era pe vremuri jurnalismul?' : 'Pe vremuri, jurnalismul, inclusiv cel sportiv, a implicat să ai o conduită ireproșabilă în fața publicului',
    'Ce record a stabilit Andreas Almgren?' : 'Andreas Almgren, a stabilit un nou record european la 10 km, parcurgând distanţa în 26 de minute şi 53 de secunde.',
    'Ce naționalitate are Beatrice Chebet?' : 'Beatrice Chebet este de naționalitate kenyană.',
    'Spune-mi despre Radu Drăgușin' : 'Radu Drăgușin este un fotbalist român care joacă pentru echipa Tottenham Hotspur. În timpul unui meci, recent, jucătorul Calvert-Lewin a trimis o lovitură cu cotul lui Drăgușin, accidentându-l.'
    }



Generating answers with either several temperatures or a single fixed temperature, to observe how the LLM's answers and their BERTScore F1 vary

In [19]:
def max_bertscore_answers(golden_standard, temp, max_len):
    """Generate answers for every query and keep the best one by BERTScore F1.

    For each query, one answer is generated per temperature with
    ``rag_testing``; each distinct answer is scored against the query's
    reference and the highest-scoring answer is kept.

    Args:
        golden_standard: Mapping from query to reference answer.
        temp: A single temperature, or a list/tuple of temperatures.
        max_len: Maximum number of new tokens, forwarded to ``rag_testing``.

    Returns:
        A tuple ``(answers_max_f1, max_f1)``: for each query, in order, the
        best answer and its BERTScore F1.

    Raises:
        ValueError: If ``temp`` is an empty list/tuple while there are
            queries to answer.
    """
    bertscore = load("bertscore")
    # Accept a scalar as well as a list or tuple of temperatures.
    temperatures = list(temp) if isinstance(temp, (list, tuple)) else [temp]

    answers_max_f1 = []
    max_f1 = []

    for query_rag, reference in golden_standard.items():
        # Scores of the distinct answers generated for this query only.
        prediction_scores_dictionary = {}

        for t in temperatures:
            predicted = rag_testing(query_rag, t, max_len)

            # An answer that was already scored is not scored a second time.
            if predicted in prediction_scores_dictionary:
                continue

            result = bertscore.compute(predictions=[predicted], references=[reference], lang="ro")
            prediction_scores_dictionary[predicted] = result['f1'][0]

        if not prediction_scores_dictionary:
            raise ValueError("temp must contain at least one temperature")

        # On ties, max() keeps the answer that was generated first.
        answer_with_max_f1 = max(prediction_scores_dictionary, key=prediction_scores_dictionary.get)
        answers_max_f1.append(answer_with_max_f1)
        max_f1.append(prediction_scores_dictionary[answer_with_max_f1])

    return answers_max_f1, max_f1

Plotting the BERTScore F1 of the answers as a histogram to get a visual representation of the results

In [None]:
def plot_answers_histogram(answers, f1_scores, threshold):
    """Draw a histogram of the F1 scores with a vertical line at the threshold.

    Args:
        answers: The answers the scores belong to; not used by the plot.
        f1_scores: The BERTScore F1 values to bin (10 bins).
        threshold: x position of the dashed red reference line.
    """
    _, ax = plt.subplots(figsize=(8, 5))

    ax.hist(f1_scores, bins=10, color="skyblue", edgecolor="black", alpha=0.7)
    ax.axvline(x=threshold, color="red", linestyle="--", label=f"Threshold ({threshold})")

    ax.set_title("Histogram of BERTScore F1 for LLM Outputs", fontsize=14)
    ax.set_xlabel("BERTScore F1", fontsize=12)
    ax.set_ylabel("Frequency", fontsize=12)
    ax.legend()
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    plt.tight_layout()
    plt.show()

# One answer per query at temperature 0.1 (at most 128 new tokens), then the
# score histogram with the reference line at 0.75.
answers_max_f1, max_f1 = max_bertscore_answers(golden_standard, temp=[0.1], max_len=128)
plot_answers_histogram(answers_max_f1, max_f1, threshold=0.75)