# Install crucial libraries

In [1]:
# !pip install langchain-community
# !pip install langchain
# !pip install pypdf
# !pip install faiss-cpu
# !pip install langchain-chroma

In [None]:
# !pip install bitsandbytes

# Imports

In [2]:
from langchain_community.document_loaders import PyPDFLoader
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from typing import Optional, List, Tuple
from langchain_chroma import Chroma
from transformers import pipeline
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
pd.set_option("display.max_colwidth", None)


## **Part 0: Choose and Test Your Topic Without a Knowledge Base**

Before you load any external documents, you must **verify that your chosen topic needs a knowledge base** to improve answers. This ensures your RAG system solves a real gap in the model’s knowledge.

###  **Steps:**

1. **Choose a Topic (Tentative)**

   * Pick a topic from 2024 or 2025 that you think is recent or under-documented.
   * Example topics:

     * A political decision (e.g., "European Union climate laws in 2024")
     * A cultural trend (e.g., "Music trends in early 2025")

2. **Formulate Question**

   * Write down one factual, clear question about the topic.
   * Aim for question that require up-to-date or specific knowledge.

3. **Query the Model Directly**

   * Use your LLM pipeline (without RAG) to ask this question.
   * Collect the model’s answer and evaluate their quality:

     * Are the answers incomplete?
     * Are they outdated?
     * Are they confident but wrong?
     * Do they say *"I don’t know"*?

---

Why This Matters:

This step ensures your RAG project is solving a **real information gap**, not just repeating what the model already knows.


In [3]:
# The topic chosen is: The Festival of Sanremo 2025

In [26]:
# The question chosen is:
question1 = "In 2025 which number edition of the Sanremo Festival was conducted?"
question2= "Who was the winner of the first night of the 2025 Sanremo Festival??"
question3= "Who was the presenter of the 2025 Sanremo Festival?"

# **Part 1: Load a Custom PDF Knowledge Base**

Find blog posts or wikipedia page with your topic and save information about it to a PDF file, and load it using `PyPDFLoader`. You may use other loaders not only pdf, but pdf loader is exactly the same as we used during lab.

- Find informative content on your topic (Wikipedia page, blog post, article, etc.)
- Save the page as a PDF file (you can use your browser’s print-to-PDF feature)

In [19]:
file_path = "./Sanremo_2025_Wikipedia.pdf"
loader = PyPDFLoader(file_path)
RAW_KNOWLEDGE_BASE = loader.load()

In [20]:
RAW_KNOWLEDGE_BASE[0].metadata

{'producer': 'Skia/PDF m136',
 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
 'creationdate': '2025-05-29T13:46:05+00:00',
 'title': 'Sanremo Music Festival 2025 - Wikipedia',
 'moddate': '2025-05-29T13:46:05+00:00',
 'source': './Sanremo_2025_Wikipedia.pdf',
 'total_pages': 21,
 'page': 0,
 'page_label': '1'}


# **Part 2: Repeat the Lab with Your Own Knowledge Base + RAG Tuning**

## **Goal:**

Practice building a **RAG pipeline** and explore how **chunk size** and **chunk overlap** affect the quality of LLM answers to different questions.

---

## **What You Need to Do:**

1. **Repeat the Lab Using Your PDF Knowledge Base**

   * Use the PDF file you selected and loaded in Part 1.

2. **Create 3 Different Questions**

   * Design **three meaningful, specific questions** based on your topic.
   * Each question must be clearly related to the content of your PDF.

3. **Run RAG for Each Question with 3 Different Settings:**
   For each question:

   * Run the RAG pipeline **three times** using different settings for:

     * `chunk_size` (e.g., 100, 300, 500)
     * `chunk_overlap` (e.g., 0, 20, 50, 100)
   * This means you will run a total of **9 tests** (3 questions × 3 settings each).


4. **Answer Each Question Using an LLM**

   * Use the loaded chunks and a retriever to find relevant parts.
   * Pass the retrieved context to the LLM and generate an answer.
   * You can use similar tools as we used in the Lab

5. **Explain Your Results**
   For each of the 3 questions:

   * Write a short **description of the question** and **why you chose it**.
   * **Compare the answers** you got using different settings.
   * Reflect on:

     * How answer quality changed with different `chunk_size` and `chunk_overlap`
     * Which setting gave the most useful or accurate result
     * Why you think it performed better/worse

---

## **Deliverables:**

* Python code used for RAG pipeline (with different chunking settings)
* PDF file from Part 1
* A JSON file named rag_report_last_name_name_id.json containing your results:

  * 3 questions with explanations
  * Generated answers for each setting
  * Comparison and reflection on the results

---


In [21]:
def split_documents(chunk_size: int, chunk_overlap: int, knowledge_base: List, tokenizer_name: str) -> List:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    return docs_processed

In [22]:
EMBEDDING_MODEL_NAME = "thenlper/gte-small"
docs_processed = split_documents(
    chunk_size=100,
    chunk_overlap=20,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

READER_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)

prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Using the information contained in the context, give a comprehensive answer to the question. Respond only to the question asked, response should be concise and relevant to the question. If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context: {context}
        ---
        Now here is the question you need to answer.
        Question: {question}""",
    },
]

RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    tokenize=False, # Return a string, not token IDs
    add_generation_prompt=True # Ensures model knows where to start generating
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [37]:
READER_LLM(question1)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[{'generated_text': '\n   A) The twenty-fifth\n   B) The thirty-first\n   C) Both A and B are correct.\n   D) None of the above is correct.\n   \n   Answer: C) Both A and B are correct, as it refers to both "Sanremo Music Festival" (the twenty-fifth edition in this context), and also specifically mentions that there were three editions held during World War II between 1935–40, making a total count up until 2025 inclusive would be at least forty-one if we consider each year\'s event separately from those wartime years. However, since they occurred within one calendar year but not consecutively due to war disruzioni, counting them individually may lead to confusion without additional historical information specifying how many times the festival took place annually outside these specific WWII occurrences. Therefore, for an accurate answer based on provided data alone, only option A can be confirmed with certainty—it marks its twentieth anniversary celebration in 2025. Option B could poten

In [38]:
READER_LLM(question2)

[{'generated_text': '\nOptions: (A) Elena Ferrante (B) Enzo G. Ceragioli (C) Antonio Amore (D) Luca Moretti'}]

In [39]:
READER_LLM(question3)

[{'generated_text': "\n   A) Maria Laura Rinaldi\n   B) Antonello Venditti\n   C) Gianni Morandi\n   D) Lucio Dalla\n   \n   Answer: The question does not provide information about who presented in 2025. Please refer to official sources for this data as it may vary each year and could be different from past years' winners or notable artists like those listed above, which are known but do not necessarily correspond with festival hosts."}]

In [None]:
docs_processed = split_documents(
    chunk_size=100,
    chunk_overlap=20,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query(question1), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question1, context=context)

# Redact an answer
answer1_1 = READER_LLM(final_prompt)[0]["generated_text"]
answer1_1

' The sixth edition of the Sanremo Festival took place in 2025.'

In [28]:
docs_processed = split_documents(
    chunk_size=100,
    chunk_overlap=20,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question2, context=context)

# Redact an answer
answer1_2 = READER_LLM(final_prompt)[0]["generated_text"]
answer1_2

" The document provided does not contain any information about the winners of the first or second nights at the 2025 Sanremo Festival; it solely discusses performances by Italy's representatives for that year's Eurovision contest without mentioning specific results or names associated with those initial events. Therefore, I am unable to provide an answer based on this context alone."

In [29]:
docs_processed = split_documents(
    chunk_size=100,
    chunk_overlap=20,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question3, context=context)

# Redact an answer
answer1_3 = READER_LLM(final_prompt)[0]["generated_text"]

In [30]:
answer1_3

' The document does not provide any specific names for the presenters at the 2025 Sanremo Festival. It mentions that Italy participated with artists such as Mirko Onofrio and Riccardo Zangirolami performing during the event but lacks details about who hosted or presented it. Therefore, I cannot provide an accurate answer based on this context alone.'

In [31]:
docs_processed = split_documents(
    chunk_size=300,
    chunk_overlap=50,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question1, context=context)

# Redact an answer
answer2_1 = READER_LLM(final_prompt)[0]["generated_text"]
answer2_1

" The provided text does not contain any specific details about editions or years related to the Sanremo Festival for the year 2025; therefore, it's impossible to determine this based solely on the given document excerpts."

In [32]:
docs_processed = split_documents(
    chunk_size=300,
    chunk_overlap=50,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question2, context=context)

# Redact an answer
answer2_2 = READER_LLM(final_prompt)[0]["generated_text"]
answer2_2

" The document provided does not contain any specific details about winners or results for individual evenings at the 2025 Sanremo Festival; it simply lists performers (Mirko Onofrio and Riccardo Zangirolami) who appeared during the event's fourth evening without mentioning that they were competitors or indicating their performance order. Therefore, based on this text alone, I am unable to determine the winner of the first night."

In [33]:
docs_processed = split_documents(
    chunk_size=300,
    chunk_overlap=50,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question3, context=context)

# Redact an answer
answer2_3 = READER_LLM(final_prompt)[0]["generated_text"]
answer2_3

' The document does not provide any specific name for the presenter at the 2025 Sanremo Festival; it only mentions that Italy participated with artists such as Mirko Onofrio and Riccardo Zangirolami performing their covers without a live orchestra due to being sung a capella. Therefore, I am unable to determine who presented the event based solely on this provided text.'

In [34]:
docs_processed = split_documents(
    chunk_size=500,
    chunk_overlap=100,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question1, context=context)

# Redact an answer
answer3_1 = READER_LLM(final_prompt)[0]["generated_text"]
answer3_1

' The extracted documents provided no specific details about any "Sanremo Festival" or its editions for the year 2025; they solely discuss Italy\'s participation in the Eurovision Song Contest that same year with performers Mirko Onofrio and Riccardo Zangirolami performing without a live conductor due to it being an a capella performance. Therefore, I cannot provide an answer based on this context regarding the Sanremo Festival.'

In [35]:
docs_processed = split_documents(
    chunk_size=500,
    chunk_overlap=100,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question2, context=context)

# Redact an answer
answer3_2 = READER_LLM(final_prompt)[0]["generated_text"]
answer3_2

' The extracted documents provided no specific details about who won the first night of the 2025 Sanremo Festival; therefore, I am unable to provide that information based solely on this context.'

In [36]:
docs_processed = split_documents(
    chunk_size=500,
    chunk_overlap=100,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

vector_store = Chroma.from_documents(
    docs_processed,
    embedding_model,
    persist_directory="db9",
    collection_metadata={"hnsw:space": "cosine"}
)

results = vector_store.similarity_search_by_vector(
    embedding=embedding_model.embed_query("What are results of European Election in 2024?"), k=4
)

retrieved_docs_text = [doc.page_content for doc in results]  # We only need the text of the documents
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=question3, context=context)

# Redact an answer
answer3_3 = READER_LLM(final_prompt)[0]["generated_text"]
answer3_3

" The extracted documents provided no specific mention about who presented the 2025 San Remo Festival; they solely focus on Italy's participation at the Eurovision Song Contest that year with details regarding performers but lack any reference or detail related to festival presentation personnel. Therefore, based on this document alone we can’t determine the presenter for the event."

### Template for your resulting json file with report

In [9]:
your_results_dict = {
  "topic": "Sanremo Festival 2025",
  "question":"In 2025 which number edition of the Sanremo Festival was conducted?",
  "answer":" A) The twenty-fifth\n   B) The thirty-first\n   C) Both A and B are correct.\n   D) None of the above is correct.\n   Answer: C) Both A and B are correct, as it refers to both Sanremo Music Festival (the twenty-fifth edition in this context), and also specifically mentions that there were three editions held during World War II between 1935–40, making a total count up until 2025 inclusive would be at least forty-one if we consider each year\'s event separately from those wartime years. However, since they occurred within one calendar year but not consecutively due to war disruzioni, counting them individually may lead to confusion without additional historical information specifying how many times the festival took place annually outside these specific WWII occurrences. Therefore, for an accurate answer based on provided data alone, only option A can be confirmed with certainty—it marks its twentieth anniversary celebration in 2025. Option B could potentially refer to another unrelated annual music festival or different events altogether.",
  "rag": [
    {
      "question": "In 2025 which number edition of the Sanremo Festival was conducted?",
      "reason": "Because was the 75 edition, an important milestone",
      "experiments": [
        {
          "chunk_size": "100",
          "chunk_overlap": "20",
          "answer": "The sixth edition of the Sanremo Festival took place in 2025.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        },
        {
          "chunk_size": "100",
          "chunk_overlap": "20",
          "answer": "The document provided does not contain any information about the winners of the first or second nights at the 2025 Sanremo Festival; it solely discusses performances by Italy's representatives for that year's Eurovision contest without mentioning specific results or names associated with those initial events. Therefore, I am unable to provide an answer based on this context alone.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        },
        {
          "chunk_size": "100",
          "chunk_overlap": "20",
          "answer": "The document does not provide any specific names for the presenters at the 2025 Sanremo Festival. It mentions that Italy participated with artists such as Mirko Onofrio and Riccardo Zangirolami performing during the event but lacks details about who hosted or presented it. Therefore, I cannot provide an accurate answer based on this context alone.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        }
      ]
    },
    {
      "question": "Who was the winner of the first night of the 2025 Sanremo Festival?",
      "reason": "Explain why this question is meaningful to your topic",
      "experiments": [
        {
          "chunk_size": "300",
          "chunk_overlap": "50",
          "answer": "The provided text does not contain any specific details about editions or years related to the Sanremo Festival for the year 2025; therefore, it's impossible to determine this based solely on the given document excerpts.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        },
        {
          "chunk_size": "300",
          "chunk_overlap": "50",
          "answer": "The document provided does not contain any specific details about winners or results for individual evenings at the 2025 Sanremo Festival; it simply lists performers (Mirko Onofrio and Riccardo Zangirolami) who appeared during the event's fourth evening without mentioning that they were competitors or indicating their performance order. Therefore, based on this text alone, I am unable to determine the winner of the first night.",
          "reflection": " "
        },
        {
          "chunk_size": "300",
          "chunk_overlap": "50",
          "answer": "The document does not provide any specific name for the presenter at the 2025 Sanremo Festival; it only mentions that Italy participated with artists such as Mirko Onofrio and Riccardo Zangirolami performing their covers without a live orchestra due to being sung a capella. Therefore, I am unable to determine who presented the event based solely on this provided text.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        }
      ]
    },
    {
      "question": "Who was the presenter of the 2025 Sanremo Festival?",
      "reason": "Explain why this question is useful or interesting",
      "experiments": [
        {
          "chunk_size": "500",
          "chunk_overlap": "100",
          "answer": "The extracted documents provided no specific details about any Sanremo Festival or its editions for the year 2025; they solely discuss Italy\'s participation in the Eurovision Song Contest that same year with performers Mirko Onofrio and Riccardo Zangirolami performing without a live conductor due to it being an a capella performance. Therefore, I cannot provide an answer based on this context regarding the Sanremo Festival.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        },
        {
          "chunk_size": "500",
          "chunk_overlap": "100",
          "answer": "The extracted documents provided no specific details about who won the first night of the 2025 Sanremo Festival; therefore, I am unable to provide that information based solely on this context.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        },
        {
          "chunk_size": "500",
          "chunk_overlap": "100",
          "answer": "The extracted documents provided no specific mention about who presented the 2025 San Remo Festival; they solely focus on Italy's participation at the Eurovision Song Contest that year with details regarding performers but lack any reference or detail related to festival presentation personnel. Therefore, based on this document alone we can’t determine the presenter for the event.",
          "reflection": "The structure of the PDF and the specificity of the questions prevented the model from answering correctly and made it difficult to locate the relevant information within the file."
        }
      ]
    }
  ]
}

In [40]:
import json

with open("rag_report_Davide_Volpi_2140728.json", "w", encoding="utf-8") as f:
    json.dump(your_results_dict, f, indent=2, ensure_ascii=False)