# Installs
Restart after installing

In [None]:
%%time

from IPython.display import clear_output

! pip install -qq langchain-huggingface
! pip install -qq langchain-community
! pip install -qq langchain
! pip install -qq rouge_score
! pip install -qq bitsandbytes
! pip install -qq accelerate
! pip install -qq faiss-gpu
! pip install -qq peft
! pip install -qq torch
! pip install -qq evaluate
! pip install -qq trl

clear_output()

# Imports

In [1]:
%%time

from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

import os
import glob
import textwrap
import time
import pandas as pd
from datasets import Dataset
import evaluate
from tqdm import tqdm
import re
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType

import langchain

### loaders
from langchain.document_loaders import DirectoryLoader, TextLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM,
    BitsAndBytesConfig, pipeline, GenerationConfig, TrainingArguments,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
)
from trl import SFTTrainer

clear_output()

CPU times: user 7.69 s, sys: 1.05 s, total: 8.73 s
Wall time: 8.36 s


# CFG

In [2]:
class CFG:
    # LLMs
    model_name = 'NousResearch/Llama-2-7b-chat-hf' # NousResearch/Llama-2-7b-chat-hf, google/flan-t5-base
    task_type = TaskType.CAUSAL_LM # TaskType.SEQ_2_SEQ_LM for flan-t5, TaskType.CAUSAL_LM for llama2
    fine_tune_with_LoRA = True
    lora_target_modules = ['up_proj', 'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj']
    # Flan-T5 ["q", "v"], Llama ['up_proj', 'q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj']
    
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 400
    split_overlap = 0

    # similar passages
    k = 2
    
    # Vector Database Embedding
    embedding_model = 'sentence-transformers/all-mpnet-base-v2'
    
    # paths
    DOCs_path = '/kaggle/input/questionanswer-dataset/text_data/text_data'
    Output_folder = './rag-vectordb'

# Preprocess Data

In [3]:
df_S08 = pd.read_csv('/kaggle/input/questionanswer-dataset/S08_question_answer_pairs.txt', sep='\t')
df_S08.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4


In [4]:
print(f"Before removing NULL values: {df_S08.shape}")

df_S08 = df_S08.dropna()

print(f"After removing NULL values: {df_S08.shape}")

Before removing NULL values: (1715, 6)
After removing NULL values: (1148, 6)


In [5]:
df_S08 = df_S08.drop_duplicates(subset=['Question'])

print(f"After removing duplicates: {df_S08.shape}")

After removing duplicates: (602, 6)


In [6]:
df_S08['Question'] = df_S08.apply(
    lambda x: x['Question'] if all(word in x['Question'] for word in x['ArticleTitle'].replace('_', ' ').split()) 
    else x['ArticleTitle'].replace('_', ' ') + ". " + x['Question'], 
    axis=1
)

# Loader

In [7]:
loader = DirectoryLoader(
    CFG.DOCs_path,
    glob="S08*.txt.clean",
    loader_cls=TextLoader,
    show_progress=True,
    use_multithreading=True,
    loader_kwargs={"encoding": "ISO-8859-1"}
)

documents = loader.load()

100%|██████████| 40/40 [00:00<00:00, 2663.22it/s]


In [8]:
print(f'We have {len(documents)} pages in total')

We have 40 pages in total


In [9]:
print(documents[0].page_content[:600])

otter



Otters are amphibious (or in one case aquatic) carnivorous mammals.  The otter subfamily Lutrinae forms part of the family Mustelidae, which also includes weasels, polecats, badgers, as well as others. With 13 species in 7 genera, otters have an almost worldwide distribution.

An otter's den is called a holt.  Male otters are dog-otters, females are bitches and babies are cubs or pups.  The collective noun romp is sometimes used for a group of otters, being descriptive of their often playful nature.




Otters have long, slim bodies and relatively short limbs, with webbed paws. Most h


In [10]:
# Function to clean text
def clean_text(text):
    # Replace multiple newlines with a single newline
    text = re.sub(r'\n+', '\n', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()  # Optional: Strip leading/trailing whitespace

# Clean each document
for doc in documents:
    doc.page_content = clean_text(doc.page_content)

print(documents[0].page_content[:600])

otter Otters are amphibious (or in one case aquatic) carnivorous mammals. The otter subfamily Lutrinae forms part of the family Mustelidae, which also includes weasels, polecats, badgers, as well as others. With 13 species in 7 genera, otters have an almost worldwide distribution. An otter's den is called a holt. Male otters are dog-otters, females are bitches and babies are cubs or pups. The collective noun romp is sometimes used for a group of otters, being descriptive of their often playful nature. Otters have long, slim bodies and relatively short limbs, with webbed paws. Most have sharp c


# Splitter

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CFG.split_chunk_size,
    chunk_overlap = CFG.split_overlap
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')

We have created 3162 chunks from 40 pages


# Create Embeddings

In [14]:
%%time

vectordb = FAISS.from_documents(
    texts,
    HuggingFaceEmbeddings(model_name=CFG.embedding_model)
)

### persist vector database
vectordb.save_local(f"{CFG.Output_folder}/faiss_index_rag")
#vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_rag", HuggingFaceEmbeddings(model_name=CFG.embedding_model), allow_dangerous_deserialization=True)

clear_output()

CPU times: user 22.8 s, sys: 219 ms, total: 23 s
Wall time: 21.5 s


In [15]:
retriever = vectordb.as_retriever(search_kwargs={"k": CFG.k, "search_type": "similarity"})

# Initialize an empty list to store contexts
contexts = []

# Loop through each question and fetch its context
for question in tqdm(df_S08['Question'], desc="Fetching contexts"):

    results = retriever.invoke(question)
    
    # Extract page contents from results and join them as a single string
    context = " ".join([doc.page_content for doc in results])
    
    # Append the context to the list
    contexts.append(context)

# Add the contexts list as a new column to the dataframe
df_S08['Context'] = contexts

# Display the dataframe to verify
df_S08[['Question', 'Context']].head()

Fetching contexts: 100%|██████████| 602/602 [00:09<00:00, 65.62it/s]


Unnamed: 0,Question,Context
0,Was Abraham Lincoln the sixteenth President of...,"Abraham Lincoln Abraham Lincoln (February 12, ..."
2,Abraham Lincoln. Did Lincoln sign the National...,"Transcontinental Railroad, which was completed..."
4,Abraham Lincoln. Did his mother die of pneumonia?,born. Theodore Roosevelt's mother Mittie died ...
6,Abraham Lincoln. How many long was Lincoln's f...,"a frequent visitor to Kentucky, he would have ..."
8,Abraham Lincoln. When did Lincoln begin his po...,"not like killing animals, even for food. Thoug..."


# Define model

In [16]:
%%time

model_repo = CFG.model_name

device_map = "auto"
        
tokenizer = AutoTokenizer.from_pretrained(model_repo)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

if CFG.task_type == TaskType.CAUSAL_LM:

    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
    
    def base_model_init():
        model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config=bnb_config,
        device_map=device_map
        )
        model.config.use_cache = False
        model.enable_input_require_grads()

        return model
        
    base_model = base_model_init()

    #max_len = base_model.config.max_position_embeddings
    max_len = 1024

    tokenizer.model_max_length = max_len

elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:

    def base_model_init():
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_repo, 
            torch_dtype=torch.bfloat16, 
            device_map = device_map
        )
        return model
    
    base_model = base_model_init()
    
    max_len = base_model.config.n_positions

    tokenizer.model_max_length = max_len

clear_output()

CPU times: user 18.9 s, sys: 22.8 s, total: 41.7 s
Wall time: 1min 14s


# Prompt

In [21]:
prompt_template = """
Use only the following pieces of context to answer the question.

{context}

Question: {question}
Answer: """

if CFG.task_type == TaskType.CAUSAL_LM:
    # Add tags for Llama2
    prompt_template = f"[INST]\n{prompt_template.strip()} [/INST]"

PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)

# Cross-validation

In [30]:
def tokenize_function_SEQ2SEQ(row):
    formatted_prompt = PROMPT.format(question=row['Question'], context=row['Context'])
    inputs = tokenizer(formatted_prompt, max_length=max_len, truncation=True)
    labels = tokenizer(row["Answer"], max_length=max_len, truncation=True)

    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': labels.input_ids
    }
    
def tokenize_function_CAUSAL_LM(row):
    formatted_prompt = PROMPT.format(question=row['Question'], context=row['Context'])
    combined_text = f"<s>{formatted_prompt} {row['Answer']} </s>"
    inputs = tokenizer(combined_text, max_length=max_len, truncation=True)
    
    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': inputs.input_ids
    }

In [31]:
hf_dataset = Dataset.from_pandas(df_S08)

if CFG.fine_tune_with_LoRA:
    
    if CFG.task_type == TaskType.CAUSAL_LM:
        hf_dataset = hf_dataset.map(
            tokenize_function_CAUSAL_LM,
        )
    
    elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        hf_dataset = hf_dataset.map(
            tokenize_function_SEQ2SEQ,
        )

train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Shapes of the datasets:")
print(f"Training: {len(train_dataset)} samples")
print(f"Validation: {len(eval_dataset)} samples")

Map:   0%|          | 0/602 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: 481 samples
Validation: 121 samples


# Perform PEFT with LoRA

In [32]:
%%time

peft_model = None

lora_config = LoraConfig(
    r=8, # Rank
    lora_alpha=32,
    target_modules=CFG.lora_target_modules,
    lora_dropout=0.1,
    bias="none",
    task_type=CFG.task_type
)

if CFG.fine_tune_with_LoRA:
    base_model = base_model_init()
    peft_model = get_peft_model(base_model, lora_config)

    clear_output()

CPU times: user 5.7 s, sys: 1.29 s, total: 6.99 s
Wall time: 6.75 s


In [33]:
if CFG.task_type == TaskType.CAUSAL_LM:
    batch_size = 1
    gradient_accumulation_steps = 4
    gradient_checkpointing = True
    fp16_full_eval = True,
    fp16 = True,
    max_grad_norm = 0.3
    optim = "paged_adamw_32bit"
    logging_steps = 60
    
elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
    batch_size = 8
    gradient_accumulation_steps = 1
    gradient_checkpointing = False
    fp16_full_eval = False
    fp16 = False
    max_grad_norm = 1.0
    optim = "adamw_torch"
    logging_steps = 15

common_args = {
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "gradient_checkpointing": gradient_checkpointing,
    "fp16_full_eval": fp16_full_eval,
    "fp16": fp16,
    "max_grad_norm": max_grad_norm,
    "num_train_epochs": 1,
    "learning_rate": 1e-3,
    "optim": optim,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "evaluation_strategy": "steps",
    "logging_steps": logging_steps,
    "report_to": "none",
}

In [34]:
def create_peft_training_args(output_dir, common_args):
    """Helper function to create appropriate training arguments."""
    if CFG.task_type == TaskType.CAUSAL_LM:
        return TrainingArguments(**common_args, output_dir=output_dir)
    elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        return Seq2SeqTrainingArguments(**common_args, output_dir=output_dir)

def create_peft_trainer(args):
    """Helper function to create the appropriate trainer."""
    
    if CFG.task_type == TaskType.CAUSAL_LM:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False  # Causal LM doesn't use masked LM
        )
        return SFTTrainer(
            model=peft_model,
            args=args,
            peft_config=lora_config,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            processing_class=tokenizer,
            data_collator=data_collator
        )
    elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        data_collator = DataCollatorForSeq2Seq(tokenizer)
        return Seq2SeqTrainer(
            model=peft_model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            processing_class=tokenizer,
        )

In [35]:
if CFG.fine_tune_with_LoRA:
    output_dir = f'./peft-qa-training-{str(int(time.time()))}'

    peft_training_args = create_peft_training_args(output_dir, common_args)
    peft_trainer = create_peft_trainer(peft_training_args)
    peft_trainer.train()

Step,Training Loss,Validation Loss
60,1.7174,1.423171
120,1.2655,1.251027


# 🤗 Pipeline & Generation

In [36]:
generation_config = GenerationConfig(
    max_new_tokens=64,
    temperature=CFG.temperature,
    top_p=CFG.top_p,
    repetition_penalty=CFG.repetition_penalty,
)

if CFG.fine_tune_with_LoRA:
    peft_model.eval()
else:
    base_model.eval()

if CFG.task_type == TaskType.CAUSAL_LM:
    task = "text-generation"
elif CFG.task_type == TaskType.SEQ_2_SEQ_LM:
    task = "text2text-generation"

pipe = pipeline(
    task = task,
    model = peft_model if CFG.fine_tune_with_LoRA else base_model,
    tokenizer = tokenizer,
    device_map = device_map,
    generation_config = generation_config
)

llm = HuggingFacePipeline(pipeline = pipe)

clear_output()

In [39]:
def check_llm_response(dataset, indx):
    query = dataset[indx]['Question']
    context = dataset[indx]['Context']

    # Format the prompt using the question
    formatted_prompt = PROMPT.format(question=query, context=context)
    
    # Use the formatted prompt with the LLM
    llm_response = llm.invoke(formatted_prompt)
    
    if CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        llm_response = formatted_prompt + llm_response
    elif CFG.task_type == TaskType.CAUSAL_LM:
        llm_response = llm_response.replace("[INST]", "").replace("[/INST]", "").strip()
        llm_response = re.sub(r" +", " ", llm_response)

    print(llm_response)
    print(f"\nCorrect Answer: {dataset[indx]['Answer']}")

#check_llm_response(wrong_ans, 0) # use wrong_ans you find later to check better pipeline args
check_llm_response(eval_dataset, 0)

Use only the following pieces of context to answer the question.

tail reaches 60 to 110cm. Shoulder height is 45 to 80 cm. Males are considerably larger than females and weigh 37 to 90 kg compared to 28 to 60 kg for females. Ronald M. Nowak: Walker's Mammals of the World. Johns Hopkins University Press, 1999 ISBN 0-8018-5789-9 One of many spotted cats, a leopard may be mistaken for a cheetah or a jaguar. The leopard has rosettes rather than cheetah's simple puá¹á¸Ã¡rÄ«ka ("tiger", among other things), then borrowed into Greek. The leopard is an agile and graceful predator. Although smaller than the other members of Panthera, the leopard is still able to take large prey given a massive skull that well utilizes powerful jaw muscles. Its body is comparatively long for a cat and its legs are short. Head and body length is between 90 and 190 cm, the

Question: How long is a leopard's tail?
Answer: 60 to 110 cm

Correct Answer: 60 to 110cm


# Retriever chain

In [40]:
retriever = vectordb.as_retriever(search_kwargs = {"k": CFG.k, "search_type" : "similarity"})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

# Post-process outputs

In [41]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])
    
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4]
            + (' - page: ' + str(source.metadata['page']) if 'page' in source.metadata else '')
            + (f'\nContent: {source.page_content}' if CFG.fine_tune_with_LoRA else '')
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    return ans

In [42]:
def llm_ans(query):
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    if CFG.task_type == TaskType.SEQ_2_SEQ_LM:
        ans = f"Question: {query}\nLLM Answer: " + ans
    elif CFG.task_type == TaskType.CAUSAL_LM:
        ans = ans.replace("[INST]", "").replace("[/INST]", "").strip()
        ans = re.sub(r" +", " ", ans)
    
    return ans

# Evaluations
- Check model on a single sample
- Calculate average recall score on validation dataset
- Check wrong answers

In [43]:
# Load the ROUGE metric
metric = evaluate.load("rouge")

In [44]:
def extract_prediction(llm_output):
    return llm_output.split("Answer:")[1].split("Sources:")[0].strip()

def evaluate_answer(dataset, indx):
    # Get the question and correct answer from the DataFrame
    query = dataset[indx]['Question']
    correct_answer = dataset[indx]['Answer']

    # Get the predicted answer from the language model
    pred_ans = llm_ans(query)

    print(pred_ans)
    print(f"\nCorrect Answer: {correct_answer}")

    # Compute ROUGE scores
    rouge_score = metric.compute(
        predictions=[extract_prediction(pred_ans)],
        references=[correct_answer],
        use_stemmer=True
    )

    print(f"\nROUGE Recall Scores: {rouge_score}")

evaluate_answer(eval_dataset, 6)

Use only the following pieces of context to answer the question.

republic, which could have risked the loss of the Southwest or dominance of the Northeast. Monroe placed faith in a strong presidency and the system of checks and balances. In the 1790s he fretted over an aging George Washington being too heavily influenced by close advisers like Hamilton who was too close to Britain. Monroe opposed the Jay Treaty and was humiliated when Washington criticized for

negotiate the Louisiana Purchase. Monroe was then appointed Minister to the Court of St. James (Britain) from 1803 to 1807. In 1806 he negotiated a treaty with Britain to replace the Jay Treaty of 1794, but Jefferson rejected it as unsatisfactory, as the treaty contained no ban on the British practice of impressment of American sailors. As a result, the two nations moved closer toward the War of

Question: James Monroe. What was the result of the rejection of the Jay Treaty?
Answer: The two nations moved closer toward war

Sour

In [45]:
# Define a function to make predictions
def predict(batch, llm_ans, metric):
    queries = batch['Question']
    
    # Get the predicted answers from the model for the entire batch
    pred_ans = [llm_ans(query) for query in queries]
    
    # Extract predictions from LLM output
    extracted_preds = [extract_prediction(ans) for ans in pred_ans]
    
    # Initialize lists to store ROUGE scores
    recalls = []
    
    # Calculate ROUGE score for each prediction and store recalls
    for pred, ref in zip(extracted_preds, batch['Answer']):
        result = metric.compute(predictions=[pred], references=[ref])
        recalls.append(result['rouge1'])
    
    # Return predictions, references, and low recall indices
    return {
        'prediction': extracted_preds, 
        'reference': batch['Answer'],
        'recalls': recalls
    }

# Apply the function to all rows in the dataset
predicted_dataset = eval_dataset.map(
    lambda batch: predict(batch, llm_ans=llm_ans, metric=metric),
    batched=True,
    batch_size=16,
    desc="Processing predictions"
)



Processing predictions:   0%|          | 0/121 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [46]:
# Compute the ROUGE score for the entire dataset
rouge_score = metric.compute(
    predictions=predicted_dataset['prediction'],
    references=predicted_dataset['reference'],
    use_aggregator=True,
    use_stemmer=True
)

print(f"ROUGE Recall Scores: {rouge_score}")

ROUGE Recall Scores: {'rouge1': 0.6879041979198757, 'rouge2': 0.18036733749235295, 'rougeL': 0.6873187776070315, 'rougeLsum': 0.6877324688225266}


In [47]:
# Collect the low recall indices based on recalls < 0.5
low_recall_indices = [i for i, recall in enumerate(predicted_dataset['recalls']) if recall < 0.5]
wrong_ans = eval_dataset.select(low_recall_indices)

# Display the filtered dataframe
percentage_wrong = round((len(wrong_ans) / len(eval_dataset)) * 100)
print(f"Number of wrong answers: {len(wrong_ans)} ({percentage_wrong}%)")

Number of wrong answers: 38 (31%)


In [50]:
evaluate_answer(wrong_ans, 2)

Use only the following pieces of context to answer the question.

James_Monroe James Monroe (April 28, 1758 â July 4, 1831) was the fifth President of the United States (1817-1825). His administration was marked by the acquisition of Florida (1819); the Missouri Compromise (1820), in which Missouri was declared a slave state; and the profession of the Monroe Doctrine (1823), declaring U.S. opposition to European interference in the Americas. The Presidentâs

founded by the American Colonization Society, in 1822, as a haven for freed slaves. * Monroe was (arguably) the last president to have fought in the Revolutionary War, although Andrew Jackson served as a 13-year-old courier in the Continental Army and was taken as a prisoner of war by the British. * Monroe is considered to be the president who was in the most paintings; throughout the 1800s he

Question: James Monroe. What is the first number on the page?
Answer: April

Sources: 
S08_set3_a2.txt.c
Content: James_Monroe James Mo

# Conclusions

- Things I found had the most impact on models output quality in my experiments:
    - Splitting: chunk size, overlap
    - Search: k
    - Pipeline parameters (temperature, top_p, penalty)
    - Embeddings function
    - Question with or without title