# Installs
Restart after installing

In [None]:
%%time

from IPython.display import clear_output

! pip install -qq langchain-huggingface
! pip install -qq langchain-community
#! pip install -qq langchain
! pip install -qq rouge_score
! pip install -qq bitsandbytes
#! pip install -qq accelerate
! pip install faiss-gpu-cu12
#! pip install -qq peft
#! pip install -qq torch
! pip install -qq evaluate
! pip install -qq trl

clear_output()

# Imports

In [1]:
%%time

from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time
import pandas as pd
from datasets import Dataset
import evaluate
from tqdm import tqdm
import re
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType

import langchain

### loaders
from langchain.document_loaders import DirectoryLoader, TextLoader

### splits
from langchain.text_splitter import SpacyTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
    BitsAndBytesConfig, pipeline, GenerationConfig, TrainingArguments,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
)
from trl import SFTTrainer

clear_output()

CPU times: user 18.2 s, sys: 3.32 s, total: 21.5 s
Wall time: 28.7 s


# CFG

In [2]:
class CFG:
    # LLMs
    model_name = 'NousResearch/Llama-2-7b-chat-hf' # NousResearch/Llama-2-7b-chat-hf, google/flan-t5-base
    task_type = TaskType.CAUSAL_LM # TaskType.SEQ_2_SEQ_LM for flan-t5, TaskType.CAUSAL_LM for llama2
    fine_tune_with_LoRA = True
    lora_target_modules = ['up_proj', 'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj']
    # Flan-T5 ["q", "v"], Llama ['up_proj', 'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj']
    
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 300 # set 300 for Llama; 200 for flan-t5
    split_overlap = 0

    # similar passages
    k = 4
    
    # Vector Database Embedding
    embedding_model = 'sentence-transformers/all-mpnet-base-v2'
    
    # paths
    DOCs_path = '/kaggle/input/questionanswer-dataset/text_data/text_data'
    Output_folder = './rag-vectordb'

# Preprocess Data

In [3]:
df_S08 = pd.read_csv('/kaggle/input/questionanswer-dataset/S08_question_answer_pairs.txt', sep='\t')
df_S08.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4


In [4]:
print(f"Before removing NULL values: {df_S08.shape}")

df_S08 = df_S08.dropna()

print(f"After removing NULL values: {df_S08.shape}")

Before removing NULL values: (1715, 6)
After removing NULL values: (1148, 6)


In [5]:
df_S08 = df_S08.drop_duplicates(subset=['Question'])

print(f"After removing duplicates: {df_S08.shape}")

After removing duplicates: (602, 6)


In [6]:
df_S08['Question'] = df_S08.apply(
    lambda x: x['Question'] if all(word in x['Question'] for word in x['ArticleTitle'].replace('_', ' ').split()) 
    else x['ArticleTitle'].replace('_', ' ') + ". " + x['Question'], 
    axis=1
)

# Loader

In [7]:
loader = DirectoryLoader(
    CFG.DOCs_path,
    glob="S08*.txt.clean",
    loader_cls=TextLoader,
    show_progress=True,
    use_multithreading=True,
    loader_kwargs={"encoding": "ISO-8859-1"}
)

documents = loader.load()

100%|██████████| 40/40 [00:00<00:00, 1011.42it/s]


In [8]:
print(f'We have {len(documents)} pages in total')

We have 40 pages in total


In [9]:
print(documents[0].page_content[:600])

polar bear



The polar bear (Ursus maritimus) is a bear native to the Arctic. Polar bears and Kodiak bears are the world's largest land carnivores, with most adult males weighing 300-600 kg (660-1320 lb); adult females are about half the size of males. Its fur is hollow and translucent, but usually appears as white or cream colored, thus providing the animal with effective camouflage.  Its skin is actually black in color. Its thick blubber and fur insulate it against the cold. The bear has a short tail and small ears that help reduce heat loss, as well as a relatively small head and long, tap


In [10]:
# Function to clean text
def clean_text(text):
    # Replace multiple newlines with a single newline
    text = re.sub(r'\n+', '\n', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean each document
for doc in documents:
    doc.page_content = clean_text(doc.page_content)

print(documents[0].page_content[:600])

polar bear The polar bear (Ursus maritimus) is a bear native to the Arctic. Polar bears and Kodiak bears are the world's largest land carnivores, with most adult males weighing 300-600 kg (660-1320 lb); adult females are about half the size of males. Its fur is hollow and translucent, but usually appears as white or cream colored, thus providing the animal with effective camouflage. Its skin is actually black in color. Its thick blubber and fur insulate it against the cold. The bear has a short tail and small ears that help reduce heat loss, as well as a relatively small head and long, tapered


# Splitter

In [11]:
text_splitter = SpacyTextSplitter(
    chunk_size=CFG.split_chunk_size,
    chunk_overlap=CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 5384 chunks from 40 pages


# Create Embeddings

In [12]:
%%time

vectordb = FAISS.from_documents(
    texts,
    HuggingFaceEmbeddings(model_name=CFG.embedding_model)
)

### persist vector database
vectordb.save_local(f"{CFG.Output_folder}/faiss_index_rag")
#vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_rag", HuggingFaceEmbeddings(model_name=CFG.embedding_model), allow_dangerous_deserialization=True)

clear_output()

CPU times: user 19 s, sys: 3.45 s, total: 22.4 s
Wall time: 23.9 s


# Define model

In [13]:
retriever = vectordb.as_retriever(search_kwargs={"k": CFG.k, "search_type": "similarity"})

# Initialize an empty list to store contexts
contexts = []

# Loop through each question and fetch its context
for question in tqdm(df_S08['Question'], desc="Fetching contexts"):

    results = retriever.invoke(question)
    
    # Extract page contents from results and join them as a single string
    context = " ".join([doc.page_content for doc in results])
    
    # Append the context to the list
    contexts.append(context)

# Add the contexts list as a new column to the dataframe
df_S08['Context'] = contexts

# Display the dataframe to verify
df_S08[['Question', 'Context']].head()

Fetching contexts: 100%|██████████| 602/602 [00:08<00:00, 75.14it/s]


Unnamed: 0,Question,Context
0,Was Abraham Lincoln the sixteenth President of...,"Abraham Lincoln Abraham Lincoln (February 12, ..."
2,Abraham Lincoln. Did Lincoln sign the National...,Also included was the creation of the system o...
4,Abraham Lincoln. Did his mother die of pneumonia?,"Lincoln was only nine when his mother, then th..."
6,Abraham Lincoln. How many long was Lincoln's f...,"While in New Orleans, he may have witnessed a ..."
8,Abraham Lincoln. When did Lincoln begin his po...,Young Abraham Lincoln Lincoln began his politi...


In [14]:
%%time

model_repo = CFG.model_name

device_map = "auto"
        
tokenizer = AutoTokenizer.from_pretrained(model_repo)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

if CFG.task_type == TaskType.CAUSAL_LM:

    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
    
    def base_model_init():
        model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config=bnb_config,
        device_map=device_map
        )
        model.config.use_cache = False
        model.enable_input_require_grads()

        return model
        
    base_model = base_model_init()

    #max_len = base_model.config.max_position_embeddings
    max_len = 2048

    tokenizer.model_max_length = max_len

elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:

    def base_model_init():
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_repo, 
            torch_dtype=torch.bfloat16, 
            device_map = device_map
        )
        return model
    
    base_model = base_model_init()
    
    max_len = base_model.config.n_positions

    tokenizer.model_max_length = max_len

clear_output()

CPU times: user 26.9 s, sys: 31 s, total: 57.9 s
Wall time: 1min 9s


# Prompt

In [15]:
prompt_template = """
Use only the following pieces of context to answer the question.

{context}

Question: {question}
Answer: """

if CFG.task_type == TaskType.CAUSAL_LM:
    # Add tags for Llama2
    prompt_template = f"[INST]\n{prompt_template.strip()} [/INST]"

PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

# Cross-validation

In [16]:
def tokenize_function_SEQ2SEQ(row):
    formatted_prompt = PROMPT.format(question=row['Question'], context=row['Context'])
    inputs = tokenizer(formatted_prompt, max_length=max_len, truncation=True)
    labels = tokenizer(row["Answer"], max_length=max_len, truncation=True)

    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': labels.input_ids
    }
    
def tokenize_function_CAUSAL_LM(row):
    formatted_prompt = PROMPT.format(question=row['Question'], context=row['Context'])
    combined_text = f"<s>{formatted_prompt} {row['Answer']} </s>"
    inputs = tokenizer(combined_text, max_length=max_len, truncation=True)
    
    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': inputs.input_ids
    }

In [17]:
hf_dataset = Dataset.from_pandas(df_S08)

if CFG.fine_tune_with_LoRA:
    
    if CFG.task_type == TaskType.CAUSAL_LM:
        hf_dataset = hf_dataset.map(
            tokenize_function_CAUSAL_LM,
        )
    
    elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        hf_dataset = hf_dataset.map(
            tokenize_function_SEQ2SEQ,
        )

train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Shapes of the datasets:")
print(f"Training: {len(train_dataset)} samples")
print(f"Validation: {len(eval_dataset)} samples")

Map:   0%|          | 0/602 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: 481 samples
Validation: 121 samples


# Perform PEFT with LoRA

In [18]:
%%time

peft_model = None

lora_config = LoraConfig(
    r=8, # Rank
    lora_alpha=8,
    target_modules=CFG.lora_target_modules,
    lora_dropout=0.1,
    bias="none",
    task_type=CFG.task_type
)

if CFG.fine_tune_with_LoRA:
    base_model = base_model_init()
    peft_model = get_peft_model(base_model, lora_config)

    clear_output()

CPU times: user 11.5 s, sys: 10.9 s, total: 22.4 s
Wall time: 22.3 s


In [19]:
if CFG.task_type == TaskType.CAUSAL_LM:
    batch_size = 1
    gradient_accumulation_steps = 4
    gradient_checkpointing = True
    fp16_full_eval = True,
    fp16 = True,
    max_grad_norm = 0.3
    optim = "paged_adamw_32bit"
    logging_steps = 60
    
elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
    batch_size = 8
    gradient_accumulation_steps = 1
    gradient_checkpointing = False
    fp16_full_eval = False
    fp16 = False
    max_grad_norm = 1.0
    optim = "adamw_torch"
    logging_steps = 15

common_args = {
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "gradient_checkpointing": gradient_checkpointing,
    "fp16_full_eval": fp16_full_eval,
    "fp16": fp16,
    "max_grad_norm": max_grad_norm,
    "num_train_epochs": 1,
    "learning_rate": 1e-3,
    "optim": optim,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "logging_steps": logging_steps,
    "report_to": "none",
}

In [20]:
def create_peft_training_args(output_dir, common_args):
    """Helper function to create appropriate training arguments."""
    if CFG.task_type == TaskType.CAUSAL_LM:
        return TrainingArguments(**common_args, output_dir=output_dir)
    elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        return Seq2SeqTrainingArguments(**common_args, output_dir=output_dir)

def create_peft_trainer(args):
    """Helper function to create the appropriate trainer."""
    
    if CFG.task_type == TaskType.CAUSAL_LM:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False  # Causal LM doesn't use masked LM
        )
        return SFTTrainer(
            model=peft_model,
            args=args,
            peft_config=lora_config,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            processing_class=tokenizer,
            data_collator=data_collator
        )
    elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        data_collator = DataCollatorForSeq2Seq(tokenizer)
        return Seq2SeqTrainer(
            model=peft_model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            processing_class=tokenizer,
        )

In [21]:
if CFG.fine_tune_with_LoRA:
    output_dir = f'./peft-qa-training-{str(int(time.time()))}'

    peft_training_args = create_peft_training_args(output_dir, common_args)
    peft_trainer = create_peft_trainer(peft_training_args)
    peft_trainer.train()

Truncating train dataset:   0%|          | 0/481 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/121 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
60,1.5728
120,1.1891


# 🤗 Pipeline & Generation

In [22]:
generation_config = GenerationConfig(
    max_new_tokens=64,
    temperature=CFG.temperature,
    top_p=CFG.top_p,
    repetition_penalty=CFG.repetition_penalty,
)

if CFG.fine_tune_with_LoRA:
    peft_model.eval()
else:
    base_model.eval()

if CFG.task_type == TaskType.CAUSAL_LM:
    task = "text-generation"
elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
    task = "text2text-generation"

pipe = pipeline(
    task = task,
    model = peft_model if CFG.fine_tune_with_LoRA else base_model,
    tokenizer = tokenizer,
    device_map = device_map,
    generation_config = generation_config
)

llm = HuggingFacePipeline(pipeline = pipe)

clear_output()

In [23]:
def check_llm_response(dataset, indx):
    query = dataset[indx]['Question']
    context = dataset[indx]['Context']

    # Format the prompt using the question
    formatted_prompt = PROMPT.format(question=query, context=context)
    
    # Use the formatted prompt with the LLM
    llm_response = llm.invoke(formatted_prompt)
    
    if CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        llm_response = formatted_prompt + llm_response
    elif CFG.task_type == TaskType.CAUSAL_LM:
        llm_response = llm_response.replace("[INST]", "").replace("[/INST]", "").strip()
        llm_response = re.sub(r" +", " ", llm_response)

    print(llm_response)
    print(f"\nCorrect Answer: {dataset[indx]['Answer']}")

#check_llm_response(wrong_ans, 0) # use wrong_ans you find later to check better pipeline args
check_llm_response(eval_dataset, 2)

Use only the following pieces of context to answer the question.

The population density, , is among the lowest in the world.

The most densely populated part of the country is the Quebec City-Windsor Corridor along the Great Lakes and Saint Lawrence River in the southeast. A similar proportion live in urban areas concentrated in the Quebec City-Windsor Corridor (notably: the Greater Golden Horseshoe anchored around Toronto, Montreal, Ottawa, and their environs), the BC Lower Mainland (Vancouver and environs), and the Calgary-Edmonton Corridor in Alberta. Since the mid 1990s, Canada's federal government has posted annual budgetary surpluses and has steadily paid down the national debt.

Toronto, Ontario skyline with the CN tower.

Toronto is Canada's most populous metropolitan area with 5,113,149 people. The Great Lakes feed the St. Lawrence River (in the southeast) where lowlands host much of Canada's population.

Question: Where is the most densely populated part of Canada?
Answer: Q

# Retriever chain

In [24]:
retriever = vectordb.as_retriever(search_kwargs = {"k": CFG.k, "search_type" : "similarity"})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

# Post-process outputs

In [25]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + (' - page: ' + str(source.metadata['page']) if 'page' in source.metadata else '')
            + (f'\nContent: {source.page_content}' if CFG.task_type != TaskType.CAUSAL_LM else '')
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    return ans

def get_retrieval_data(llm_response):
    
    sources_used = ', '.join([
        source.metadata['source'].split('/')[-1] 
        for source in llm_response['source_documents']
    ])
    sources_used = sources_used.replace('.txt.clean', '')

    context_used = '\n'.join(
        [
            source.page_content
                for source in llm_response['source_documents']
        ]
    )

    return sources_used, context_used

In [26]:
def llm_ans(query):
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    if CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        ans = f"Question: {query}\nLLM Answer: " + ans
    elif CFG.task_type == TaskType.CAUSAL_LM:
        ans = ans.replace("[INST]", "").replace("[/INST]", "").strip()
        ans = re.sub(r" +", " ", ans)
    
    return ans

# Evaluations
- Check model on a single sample
- Calculate average score on validation dataset
- Check wrong answers

In [27]:
# Load the ROUGE metric
metric = evaluate.load("rouge")

def rouge_recall(correct_answer, prediction):
    answer_words = correct_answer.lower().split()
    context_words = set(prediction.lower().split())

    # Avoid division by zero
    if not answer_words:
        return 0.0

    match_count = sum(1 for word in answer_words if word in context_words)

    return match_count / len(answer_words)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Retrieval Evaluation

In [28]:
def evaluate_retrieval(dataset, indx):
    query = dataset[indx]['Question']
    correct_answer = dataset[indx]['Answer']
    article = dataset[indx]['ArticleFile']

    llm_response = qa_chain.invoke(query)
    pred_sources, pred_context = get_retrieval_data(llm_response)

    print(f"Predicted Article: {pred_sources}")
    print(f"Correct Article: {article}")

    print(f"\nPredicted Context: {pred_context}")
    print(f"Answer: {correct_answer}")

    score = rouge_recall(correct_answer, pred_context)
    print(f"\nROUGE Recall Scores: {score:.3f}")

evaluate_retrieval(eval_dataset, 4)

Predicted Article: S08_set3_a1, S08_set3_a1, S08_set3_a1, S08_set3_a1
Correct Article: S08_set3_a1

Predicted Context: During the presidential campaign of 1796 Adams was the presidential candidate of the Federalist Party and Thomas Pinckney, the Governor of South Carolina, his running mate.

The federalists wanted Adams as their presidential candidate to crush Thomas Jefferson's bid.
Adams' opponents were former Secretary of State Thomas Jefferson of Virginia, who was joined by Senator Aaron Burr of New York on the Democratic-Republican ticket.

As was customary, Adams stayed in his home town of Quincy rather than actively campaign for the Presidency.
Because of Adams's seniority and the need for a northern president, he was elected as the Federalist nominee for president in 1796, over Thomas Jefferson, the leader of the opposition Democratic-Republican Party.
Ferling (1992) ch 19; Ferling (2004)

In the election of 1800 John Adams and his running mate, Charles Cotesworth Pinckney went

In [29]:
def evaluate_all_retrievals(dataset):
    total_score = 0.0
    correct_article_matches = 0
    count_for_rouge = 0

    for i in tqdm(range(len(dataset)), desc="Evaluating"):
        query = dataset[i]['Question']
        correct_answer = dataset[i]['Answer']
        article = dataset[i]['ArticleFile']

        llm_response = qa_chain.invoke(query)
        pred_sources, pred_context = get_retrieval_data(llm_response)

        # Count matches
        pred_list = [
            src.strip() for src in pred_sources.split(',') if src.strip()
        ]
        if article in pred_list:
            correct_article_matches += 1

        # Only compute ROUGE recall if answer is not "yes" or "no"
        if not any(word in correct_answer.lower() for word in ("yes", "no")):
            score = rouge_recall(correct_answer, pred_context)
            total_score += score
            count_for_rouge += 1

    avg_score = total_score / count_for_rouge if count_for_rouge > 0 else 0
    match_accuracy = correct_article_matches / len(dataset)

    print(f"\nAverage ROUGE Recall Score (excluding yes/no): {avg_score:.3f}")
    print(f"Article Match Accuracy: {correct_article_matches}/{len(dataset)} ({match_accuracy:.2%})")

evaluate_all_retrievals(eval_dataset)

Evaluating:   7%|▋         | 8/121 [00:11<02:56,  1.57s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating: 100%|██████████| 121/121 [02:31<00:00,  1.25s/it]


Average ROUGE Recall Score (excluding yes/no): 0.630
Article Match Accuracy: 121/121 (100.00%)





## Answer Evaluation

In [30]:
def extract_prediction(llm_output):
    return llm_output.split("Answer:")[1].split("Sources:")[0].strip()

def evaluate_answer(dataset, indx):
    # Get the question and correct answer from the DataFrame
    query = dataset[indx]['Question']
    correct_answer = dataset[indx]['Answer']

    # Get the predicted answer from the language model
    pred_ans = llm_ans(query)

    print(pred_ans)
    print(f"\nCorrect Answer: {correct_answer}")

    # Compute ROUGE scores
    rouge_score = metric.compute(
        predictions=[extract_prediction(pred_ans)],
        references=[correct_answer],
        use_stemmer=True
    )

    print(f"\nROUGE Scores: {rouge_score}")

evaluate_answer(eval_dataset, 5)

Use only the following pieces of context to answer the question.

Retrieved 28 August 2007.

Kangaroos have few natural predators.

The Thylacine, considered by palaeontologists to have once been a major natural predator of the kangaroo, is now extinct.

Other extinct predators included the Marsupial Lion, Megalania and the Wonambi.

European settlers cut down forests to create vast grasslands for sheep and cattle grazing, added stock watering points in arid areas, and have substantially reduced the number of dingoes.

Kangaroos are shy and retiring by nature, and in normal circumstances present no threat to humans.

Along with dingos and other canids, introduced species like foxes and feral cats also pose a threat to kangaroo populations.

Kangaroos and wallabies are adept swimmers, and often flee into waterways if presented with the option.

Wedge-tailed Eagles and other raptors usually eat kangaroo carrion.

Goannas and other carnivorous reptiles also pose a danger to smaller kangar

In [31]:
# Define a function to make predictions
def predict(batch, llm_ans, metric):
    queries = batch['Question']
    
    # Get the predicted answers from the model for the entire batch
    pred_ans = [llm_ans(query) for query in queries]
    
    # Extract predictions from LLM output
    extracted_preds = [extract_prediction(ans) for ans in pred_ans]
    
    # Initialize lists to store ROUGE scores
    scores = []
    
    # Calculate ROUGE score for each prediction and store recalls
    for pred, ref in zip(extracted_preds, batch['Answer']):
        result = metric.compute(predictions=[pred], references=[ref])
        scores.append(result['rouge1'])
    
    # Return predictions, references, and low recall indices
    return {
        'prediction': extracted_preds, 
        'reference': batch['Answer'],
        'scores': scores
    }

# Apply the function to all rows in the dataset
predicted_dataset = eval_dataset.map(
    lambda batch: predict(batch, llm_ans=llm_ans, metric=metric),
    batched=True,
    batch_size=16,
    desc="Processing predictions"
)



Processing predictions:   0%|          | 0/121 [00:00<?, ? examples/s]

In [32]:
# Compute the ROUGE score for the entire dataset
rouge_score = metric.compute(
    predictions=predicted_dataset['prediction'],
    references=predicted_dataset['reference'],
    use_aggregator=True,
    use_stemmer=True
)

rr_scores = [
    rouge_recall(ref, pred)
    for ref, pred in zip(predicted_dataset['reference'], predicted_dataset['prediction'])
]
average_rouge_recall = sum(rr_scores) / len(rr_scores)

print(f"ROUGE Recall Score: {average_rouge_recall}")
print(f"ROUGE F1 Scores: {rouge_score}")

ROUGE Recall Score: 0.6525168038537025
ROUGE F1 Scores: {'rouge1': 0.7125967068612542, 'rouge2': 0.19958677685950416, 'rougeL': 0.7091875548635078, 'rougeLsum': 0.7135553558557723}


In [33]:
# Collect the low indices based on rouge1 < 0.5
low_rouge_indices = [i for i, rouge1 in enumerate(predicted_dataset['scores']) if rouge1 < 0.5]
wrong_ans = eval_dataset.select(low_rouge_indices)

# Display the filtered dataframe
percentage_wrong = round((len(wrong_ans) / len(eval_dataset)) * 100)
print(f"Number of wrong answers: {len(wrong_ans)} ({percentage_wrong}%)")

Number of wrong answers: 37 (31%)


In [34]:
evaluate_answer(wrong_ans, 0)

Use only the following pieces of context to answer the question.

Any spots on the flanks and limbs that have not merged into the mass of swirls and stripes are unusually small and discrete, rather than forming rosettes.

The face and underparts are paler and dappled like those of ordinary spotted leopards.

When making a threat, leopards stretch their backs, depress their ribcages between their shoulder blades so they stick out, and lower their heads (similar to domestic cats).

During the day they may lie in bush, on rocks, or in a tree with their tails hanging below the treetops and giving them away.

Head and body length is between 90 and 190 cm, the tail reaches 60 to 110cm.

Shoulder height is 45 to 80 cm.

Males are considerably larger than females and weigh 37 to 90 kg compared to 28 to 60 kg for females.

Ronald M. Nowak: Walker's Mammals of the World.

The leopard is an agile and graceful predator.

Although smaller than the other members of Panthera, the leopard is still abl

# Conclusions

- Things I found had the most impact on models output quality in my experiments:
    - Splitting: chunk size, overlap
    - Search: k
    - Pipeline parameters (temperature, top_p, penalty)
    - Embeddings function
    - Question with or without title