In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import weaviate
import weaviate.classes as wvc
from sentence_transformers import CrossEncoder
from wrappers import LocalHuggingFaceEmbeddings, LocalHuggingFaceChatModel  
import pandas as pd
import os
import json
import glob
from dotenv import load_dotenv
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ports of the local Weaviate instance.
WEAVIATE_HTTP_PORT = 8080
WEAVIATE_GRPC_PORT = 50051

# Names of the Weaviate collections queried in this notebook.
COLLECTION_NAME = "SimpleRAG"
COLLECTION_NAME_v2 = "SimpleRAG_v2"
COLLECTION_NAME_v3 = "SimpleRAG_v3"

# Hugging Face identifiers of the embedding, chat and re-ranking models.
EMBEDDING_MODEL = "google/embeddinggemma-300m"
LLM_MODEL = "google/gemma-3-1b-it"
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Retrieval depth (number of chunks fetched per question) for each run.
TOP_K = 5
TOP_K_V2 = 10
TOP_K_V3 = 5

In [3]:
# questions
# Load the evaluation set and keep only the question text of every record.
# The encoding is given explicitly so the read does not depend on the platform's
# locale default; the evaluation step later reads this same file as UTF-8.
with open('C:\\Users\\tomir\\Desktop\\EPAM\\epam_train\\Module 4\\test_results\\ai_response_expected.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

TEST_QUESTIONS = [item['question'] for item in data]

# Print a numbered list (starting at 1) so the questions can be checked by eye.
for i, question in enumerate(TEST_QUESTIONS, 1):
    print(f"{i}. {question}")

1. At what age should a child who does not walk be evaluated by a pediatrician?
2. What is a Munari mobile and at what distance should it be hung?
3. How does the height of an average fifteen-month-old boy compare to a girl of the same age?
4. What are the recommended characteristics of rhythmic language for young children?
5. What safety precautions are recommended for playgrounds to prevent injuries from falls?
6. How should an adult engage in 'self-expression' activities with a non-verbal child?
7. What is the purpose of the 'Box with Tray and Ball' activity for an eight-month-old?
8. By what age do most toddlers master at least fifty spoken words?
9. What physical changes occur in a toddler's face and limbs as they become more active?
10. What is the 'Gobbi mobile' and how is it constructed?
11. What vaccines are recommended for a toddler between twelve and fifteen months of age?
12. How does a three-year-old's ability to hold a crayon differ from younger toddlers?
13. What are the

In [4]:
# Instantiate the local models: embedder, chat LLM and cross-encoder re-ranker.
embeddings_model = LocalHuggingFaceEmbeddings(EMBEDDING_MODEL)
chat_model = LocalHuggingFaceChatModel(LLM_MODEL)
reranker = CrossEncoder(RERANK_MODEL_NAME)

# Connect to Weaviate on localhost using the configured HTTP and gRPC ports.
weaviate_client = weaviate.connect_to_local(
    host="localhost", port=WEAVIATE_HTTP_PORT, grpc_port=WEAVIATE_GRPC_PORT
)

# One handle per collection, fetched in the same order as the names are listed.
rag_collection, rag_collection_v2, rag_collection_v3 = (
    weaviate_client.collections.get(collection_name)
    for collection_name in (COLLECTION_NAME, COLLECTION_NAME_v2, COLLECTION_NAME_v3)
)


# Template that asks the LLM to rewrite a user query for vector search.
EXPANSION_TEMPLATE = (
    "You are an expert in information retrieval. "
    "Rephrase the following user query to be detailed and suitable for vector search. "
    "Return only the rephrased query.\n\n"
    "Original Query: '{query}'\n\n"
    "Rephrased Query:"
)
expansion_prompt = ChatPromptTemplate.from_template(EXPANSION_TEMPLATE)
query_expansion_chain = expansion_prompt | chat_model | StrOutputParser()

# Template that restricts the LLM to the retrieved context and defines the
# fixed sentence it must return when the context holds no answer.
GENERATION_TEMPLATE = (
    "You are a factual assistant. "
    "Answer the user's question only based on the provided context. "
    "Do not use external knowledge. "
    "Provide a concise answer in 3-5 sentences. "
    "If the answer is not in the context, say: "
    "'The provided context does not contain the answer to this question.'\n\n"
    "Context:\n{context}\n\n"
    "Question: {question}"
)
generation_prompt = ChatPromptTemplate.from_template(GENERATION_TEMPLATE)
answer_generation_chain = generation_prompt | chat_model | StrOutputParser()

üì• Loading local embedding model: google/embeddinggemma-300m...
‚úÖ Local embedding model loaded successfully.
üì• Loading local LLM: google/gemma-3-1b-it...


`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cpu


‚úÖ Local LLM loaded successfully.


In [5]:
# List every collection known to the connected Weaviate instance together with its configuration.
weaviate_client.collections.list_all()

{'SimpleRAG': _CollectionConfigSimple(name='SimpleRAG', description=None, generative_config=None, properties=[_Property(name='title', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={}), _Property(name='content', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={}), _Property(name='chunk_id', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer=None, vectorizer_configs={})], references=[], reranker_config=None, vectorizer_config=None, vect

In [6]:
def run_rag_test(
    rag_collection,
    TEST_QUESTIONS,
    query_expansion_chain,
    embeddings_model,
    answer_generation_chain,
    TOP_K,
    version,
):
    """Run the plain RAG pipeline over every question and save the answers to CSV.

    For each question the query is rewritten, the rewritten query is embedded,
    the nearest chunks are fetched from ``rag_collection`` and an answer is
    generated from the joined chunk texts.

    Args:
        rag_collection: Object exposing ``query.near_vector(...)`` whose result
            has an ``objects`` list; each object needs ``properties["content"]``.
        TEST_QUESTIONS: Iterable of question strings.
        query_expansion_chain: Object whose ``invoke({"query": ...})`` returns
            the rewritten query.
        embeddings_model: Object whose ``embed_query(text)`` returns the query
            vector.
        answer_generation_chain: Object whose
            ``invoke({"context": ..., "question": ...})`` returns the answer text.
        TOP_K: Maximum number of chunks to retrieve per question.
        version: Value inserted into the output file name.

    Returns:
        pandas.DataFrame with the columns ``question``, ``answer`` and
        ``answer_found`` (0 when the answer is the fixed "no answer" sentence,
        1 otherwise). The same frame is written to
        ``test_results/first_test_version{version}.csv``.
    """
    no_answer = "The provided context does not contain the answer to this question."
    # The generation prompt shows the sentence inside quotes, so the model may
    # echo them back; straight and curly quotes are stripped before comparing.
    quote_chars = "'\"\u2018\u2019\u201c\u201d"
    output_dir = "test_results"

    results = []

    for question in TEST_QUESTIONS:

        # 1. Query expansion
        expanded_query = query_expansion_chain.invoke({"query": question})

        # 2. Embedding
        query_embedding = embeddings_model.embed_query(expanded_query)

        # 3. Retrieval
        retrieved_objects = rag_collection.query.near_vector(
            near_vector=query_embedding,
            limit=TOP_K,
            return_metadata=wvc.query.MetadataQuery(distance=True)
        )

        retrieved_docs = [obj.properties["content"] for obj in retrieved_objects.objects]

        # Nothing retrieved: record the fixed sentence without calling the LLM
        # on an empty context (same handling as run_rag_w_rerank_test).
        if not retrieved_docs:
            results.append({
                "question": question,
                "answer": no_answer,
                "answer_found": 0
            })
            continue

        context_for_llm = "\n\n---\n\n".join(retrieved_docs)

        # 4. Generation
        answer = answer_generation_chain.invoke({
            "context": context_for_llm,
            "question": question
        })

        # 5. Answer flag (ignores surrounding whitespace/quotes and letter case)
        normalized = answer.strip().strip(quote_chars).strip()
        answer_found = 0 if normalized.casefold() == no_answer.casefold() else 1

        results.append({
            "question": question,
            "answer": answer,
            "answer_found": answer_found
        })

    # Explicit columns keep the schema stable even when there were no questions.
    df = pd.DataFrame(results, columns=["question", "answer", "answer_found"])

    # 6. Save to CSV (create the output directory on first use)
    os.makedirs(output_dir, exist_ok=True)
    file_name = f"{output_dir}/first_test_version{version}.csv"
    df.to_csv(file_name, index=False)
    print(f"Results saved to {file_name}")

    return df

In [7]:
def run_rag_w_rerank_test(
    rag_collection,
    TEST_QUESTIONS,
    query_expansion_chain,
    embeddings_model,
    answer_generation_chain,
    reranker,
    TOP_K,
    version=1,
    CANDIDATE_K=None,
):
    """Run the RAG pipeline with a re-ranking step and save the answers to CSV.

    For each question the query is rewritten and embedded, candidate chunks are
    fetched from ``rag_collection``, scored against the original question by
    ``reranker`` and the ``TOP_K`` best-scoring chunks become the LLM context.

    Args:
        rag_collection: Object exposing ``query.near_vector(...)`` whose result
            has an ``objects`` list; each object needs ``properties["content"]``.
        TEST_QUESTIONS: Iterable of question strings.
        query_expansion_chain: Object whose ``invoke({"query": ...})`` returns
            the rewritten query.
        embeddings_model: Object whose ``embed_query(text)`` returns the query
            vector.
        answer_generation_chain: Object whose
            ``invoke({"context": ..., "question": ...})`` returns the answer text.
        reranker: Object whose ``predict([[question, doc], ...])`` returns one
            score per pair; higher scores rank first.
        TOP_K: Number of chunks passed to the LLM after re-ranking.
        version: Value inserted into the output file name.
        CANDIDATE_K: Number of chunks retrieved before re-ranking. ``None``
            (default) retrieves ``TOP_K`` chunks, in which case re-ranking only
            changes their order. Pass a larger value to let the re-ranker
            actually select among more candidates.

    Returns:
        pandas.DataFrame with the columns ``question``, ``answer`` and
        ``answer_found`` (0 when the answer is the fixed "no answer" sentence,
        1 otherwise). The same frame is written to
        ``test_results/first_test_version{version}.csv``.
    """
    no_answer = "The provided context does not contain the answer to this question."
    # The generation prompt shows the sentence inside quotes, so the model may
    # echo them back; straight and curly quotes are stripped before comparing.
    quote_chars = "'\"\u2018\u2019\u201c\u201d"
    output_dir = "test_results"
    retrieval_limit = TOP_K if CANDIDATE_K is None else CANDIDATE_K

    results = []

    for question in TEST_QUESTIONS:

        # 1. Query expansion
        expanded_query = query_expansion_chain.invoke({"query": question})

        # 2. Embedding
        query_embedding = embeddings_model.embed_query(expanded_query)

        # 3. Initial retrieval
        retrieved_objects = rag_collection.query.near_vector(
            near_vector=query_embedding,
            limit=retrieval_limit,
            return_metadata=wvc.query.MetadataQuery(distance=True)
        )

        retrieved_docs = [
            obj.properties["content"] for obj in retrieved_objects.objects
        ]

        # Nothing retrieved: record the fixed sentence and skip rerank/LLM.
        if len(retrieved_docs) == 0:
            results.append({
                "question": question,
                "answer": no_answer,
                "answer_found": 0
            })
            continue

        # 4. Re-ranking: score each chunk against the original question
        rerank_inputs = [[question, doc] for doc in retrieved_docs]
        scores = reranker.predict(rerank_inputs)

        ranked_docs = [
            doc for doc, _ in sorted(
                zip(retrieved_docs, scores),
                key=lambda x: x[1],
                reverse=True
            )
        ]

        top_docs = ranked_docs[:TOP_K]
        context_for_llm = "\n\n---\n\n".join(top_docs)

        # 5. Generation
        answer = answer_generation_chain.invoke({
            "context": context_for_llm,
            "question": question
        })

        # 6. Answer flag (ignores surrounding whitespace/quotes and letter case)
        normalized = answer.strip().strip(quote_chars).strip()
        answer_found = 0 if normalized.casefold() == no_answer.casefold() else 1

        results.append({
            "question": question,
            "answer": answer,
            "answer_found": answer_found
        })

    # Explicit columns keep the schema stable even when there were no questions.
    df = pd.DataFrame(results, columns=["question", "answer", "answer_found"])

    # 7. Save results (create the output directory on first use)
    os.makedirs(output_dir, exist_ok=True)
    file_name = f"{output_dir}/first_test_version{version}.csv"
    df.to_csv(file_name, index=False)
    print(f"Results saved to {file_name}")

    return df


In [None]:
def run_rag_hyde_test(
    rag_collection,
    TEST_QUESTIONS,
    query_expansion_chain,
    embeddings_model,
    answer_generation_chain,
    chat_model, 
    TOP_K,
    version
):
    """Run the RAG pipeline with HyDE retrieval and save the answers to CSV.

    For each question the query is rewritten, ``chat_model`` writes a
    hypothetical document answering it, that document (not the query) is
    embedded and used for nearest-vector retrieval, and an answer is generated
    from the joined chunk texts.

    Args:
        rag_collection: Object exposing ``query.near_vector(...)`` whose result
            has an ``objects`` list; each object needs ``properties["content"]``.
        TEST_QUESTIONS: Iterable of question strings.
        query_expansion_chain: Object whose ``invoke({"query": ...})`` returns
            the rewritten query.
        embeddings_model: Object whose ``embed_query(text)`` returns a vector.
        answer_generation_chain: Object whose
            ``invoke({"context": ..., "question": ...})`` returns the answer text.
        chat_model: Object whose ``invoke(prompt)`` returns the hypothetical
            document, either directly or as an object with a ``content``
            attribute.
        TOP_K: Maximum number of chunks to retrieve per question.
        version: Value inserted into the output file name.

    Returns:
        pandas.DataFrame with the columns ``question``, ``answer``,
        ``answer_found`` (0 when the answer is the fixed "no answer" sentence,
        1 otherwise) and ``hyde_doc``. The same frame is written to
        ``test_results/first_test_version{version}.csv``.
    """
    no_answer = "The provided context does not contain the answer to this question."
    # The generation prompt shows the sentence inside quotes, so the model may
    # echo them back; straight and curly quotes are stripped before comparing.
    quote_chars = "'\"\u2018\u2019\u201c\u201d"
    output_dir = "test_results"

    results = []

    for question in TEST_QUESTIONS:
        # 1. Query expansion
        expanded_query = query_expansion_chain.invoke({"query": question})

        # 2. HyDE: generate hypothetical document
        hyde_prompt = f"""
        You are a knowledge retriever. Generate a hypothetical document that would
        contain the answer to the following question. Be factual and concise.

        Question: {expanded_query}

        Return ONLY the content of the hypothetical document.
        """
        hypothetical_doc = chat_model.invoke(hyde_prompt)

        # Unwrap message-like results so that plain text is embedded and stored.
        if hasattr(hypothetical_doc, "content"):
            hypothetical_doc = hypothetical_doc.content

        # 3. Embedding of the hypothetical document. Only this vector is used
        #    for retrieval, so the expanded query itself is not embedded.
        hyde_embedding = embeddings_model.embed_query(hypothetical_doc)

        # 4. Retrieval using HyDE embedding
        retrieved_objects = rag_collection.query.near_vector(
            near_vector=hyde_embedding,
            limit=TOP_K,
            return_metadata=wvc.query.MetadataQuery(distance=True)
        )

        retrieved_docs = [obj.properties["content"] for obj in retrieved_objects.objects]

        # Nothing retrieved: record the fixed sentence without calling the LLM
        # on an empty context (same handling as run_rag_w_rerank_test).
        if not retrieved_docs:
            results.append({
                "question": question,
                "answer": no_answer,
                "answer_found": 0,
                "hyde_doc": hypothetical_doc
            })
            continue

        context_for_llm = "\n\n---\n\n".join(retrieved_docs)

        # 5. Answer generation
        answer = answer_generation_chain.invoke({
            "context": context_for_llm,
            "question": question
        })

        # 6. Answer flag (ignores surrounding whitespace/quotes and letter case)
        normalized = answer.strip().strip(quote_chars).strip()
        answer_found = 0 if normalized.casefold() == no_answer.casefold() else 1

        results.append({
            "question": question,
            "answer": answer,
            "answer_found": answer_found,
            "hyde_doc": hypothetical_doc
        })

    # 7. Save to CSV (explicit columns keep the schema stable for empty input;
    #    the output directory is created on first use)
    df = pd.DataFrame(
        results, columns=["question", "answer", "answer_found", "hyde_doc"]
    )
    os.makedirs(output_dir, exist_ok=True)
    file_name = f"{output_dir}/first_test_version{version}.csv"
    df.to_csv(file_name, index=False)
    print(f"Results saved to {file_name}")

    return df

In [13]:
# Run 1: plain pipeline against the `SimpleRAG` collection.
# The retrieval depth comes from the TOP_K config constant (instead of a
# repeated literal) so it is tuned in one place, like the other runs.
results_v1 = run_rag_test(
    rag_collection,
    TEST_QUESTIONS,
    query_expansion_chain,
    embeddings_model,
    answer_generation_chain,
    TOP_K=TOP_K,
    version=1,
)

Results saved to test_results/first_test_version1.csv


In [14]:
# Display the question / answer / answer_found table produced by run 1.
results_v1

Unnamed: 0,question,answer,answer_found
0,At what age should a child who does not walk b...,At eighteen and twenty-four months.,1
1,What is a Munari mobile and at what distance s...,The provided context does not contain the answ...,0
2,How does the height of an average fifteen-mont...,The provided context does not contain the answ...,0
3,What are the recommended characteristics of rh...,The provided context does not contain the answ...,0
4,What safety precautions are recommended for pl...,"According to the text, playground equipment sh...",1
5,How should an adult engage in 'self-expression...,The provided context doesn‚Äôt explicitly detail...,1
6,What is the purpose of the 'Box with Tray and ...,The activity is designed to practice removing ...,1
7,By what age do most toddlers master at least f...,Most toddlers master at least fifty spoken wor...,1
8,What physical changes occur in a toddler's fac...,"As the toddler becomes more active, his arms a...",1
9,What is the 'Gobbi mobile' and how is it const...,The provided context does not contain the answ...,0


In [None]:
# Run 2: plain pipeline against the second collection with its own retrieval depth.
results_v2 = run_rag_test(
    rag_collection=rag_collection_v2,
    TEST_QUESTIONS=TEST_QUESTIONS,
    query_expansion_chain=query_expansion_chain,
    embeddings_model=embeddings_model,
    answer_generation_chain=answer_generation_chain,
    TOP_K=TOP_K_V2,
    version=2,
)

Results saved to test_results/first_test_version2.csv


In [16]:
# Display the question / answer / answer_found table produced by run 2.
results_v2

Unnamed: 0,question,answer,answer_found
0,At what age should a child who does not walk b...,The provided context does not contain the answ...,0
1,What is a Munari mobile and at what distance s...,The provided text doesn‚Äôt contain the answer t...,1
2,How does the height of an average fifteen-mont...,The provided context does not contain the answ...,0
3,What are the recommended characteristics of rh...,The provided context does not contain the answ...,0
4,What safety precautions are recommended for pl...,The provided context does not contain the answ...,0
5,How should an adult engage in 'self-expression...,The provided context does not contain the answ...,0
6,What is the purpose of the 'Box with Tray and ...,The activity is designed to help an eight-mont...,1
7,By what age do most toddlers master at least f...,The provided context does not contain the answ...,0
8,What physical changes occur in a toddler's fac...,The provided context does not contain the answ...,0
9,What is the 'Gobbi mobile' and how is it const...,The ‚ÄúGobbi mobile‚Äù is a colour gradation of 5 ...,1


In [9]:
# Run 3: plain pipeline against the third collection with its own retrieval depth.
results_v3 = run_rag_test(
    rag_collection=rag_collection_v3,
    TEST_QUESTIONS=TEST_QUESTIONS,
    query_expansion_chain=query_expansion_chain,
    embeddings_model=embeddings_model,
    answer_generation_chain=answer_generation_chain,
    TOP_K=TOP_K_V3,
    version=3,
)

Results saved to test_results/first_test_version3.csv


In [10]:
# Display the question / answer / answer_found table produced by run 3.
results_v3

Unnamed: 0,question,answer,answer_found
0,At what age should a child who does not walk b...,The provided context does not contain the answ...,0
1,What is a Munari mobile and at what distance s...,The provided context does not contain the answ...,0
2,How does the height of an average fifteen-mont...,The provided context does not contain the answ...,0
3,What are the recommended characteristics of rh...,"All rhythmic language should be simple, and it...",1
4,What safety precautions are recommended for pl...,The provided context does not contain the answ...,0
5,How should an adult engage in 'self-expression...,The provided context doesn‚Äôt offer specific in...,1
6,What is the purpose of the 'Box with Tray and ...,The purpose of the ‚ÄòBox with Tray and Ball‚Äô ac...,1
7,By what age do most toddlers master at least f...,Most toddlers master at least fifty spoken wor...,1
8,What physical changes occur in a toddler's fac...,"As a toddler becomes more active, their face w...",1
9,What is the 'Gobbi mobile' and how is it const...,The Gobbi mobile is a colour gradation of 5 or...,1


In [21]:
# Run 4: third collection again, this time with cross-encoder re-ranking.
results_v4 = run_rag_w_rerank_test(
    rag_collection_v3,
    TEST_QUESTIONS,
    query_expansion_chain,
    embeddings_model,
    answer_generation_chain,
    reranker,
    TOP_K=TOP_K_V3,
    version=4,
)

Results saved to test_results/first_test_version4.csv


In [22]:
# Display the question / answer / answer_found table produced by run 4.
results_v4

Unnamed: 0,question,answer,answer_found
0,At what age should a child who does not walk b...,The provided context does not contain the answ...,0
1,What is a Munari mobile and at what distance s...,The provided context does not contain the answ...,0
2,How does the height of an average fifteen-mont...,"By the end of his first year, the average girl...",1
3,What are the recommended characteristics of rh...,"All ages Rhythmic Language ¬• Poetry, songs, rh...",1
4,What safety precautions are recommended for pl...,"During your child‚Äôs preschool years, you and y...",1
5,How should an adult engage in 'self-expression...,The provided context does not contain the answ...,0
6,What is the purpose of the 'Box with Tray and ...,The purpose of the ‚ÄòBox with Tray and Ball‚Äô ac...,1
7,By what age do most toddlers master at least f...,Most toddlers master at least fifty spoken wor...,1
8,What physical changes occur in a toddler's fac...,"As your toddler becomes more active, his arms ...",1
9,What is the 'Gobbi mobile' and how is it const...,The ‚ÄòGobbi mobile‚Äô is a colour gradation of 5 ...,1


In [None]:
# Run 5: third collection with HyDE (retrieval by hypothetical-document embedding).
result_v5 = run_rag_hyde_test(
    rag_collection=rag_collection_v3,
    TEST_QUESTIONS=TEST_QUESTIONS,
    query_expansion_chain=query_expansion_chain,
    embeddings_model=embeddings_model,
    answer_generation_chain=answer_generation_chain,
    chat_model=chat_model,
    TOP_K=TOP_K_V3,
    version=5,
)

Results saved to test_results/first_test_version4.csv


In [17]:
# Display the run-5 table, which also carries the generated hypothetical document per question.
result_v5

Unnamed: 0,question,answer,answer_found,hyde_doc
0,At what age should a child who does not walk b...,The provided context does not contain the answ...,0,## Child Development Assessment ‚Äì Pediatric Ev...
1,What is a Munari mobile and at what distance s...,The provided context does not contain the answ...,0,## Munari Mobile Specifications & Technologies...
2,How does the height of an average fifteen-mont...,The provided context does not contain the answ...,0,## Child Age (15 Months) - Height Comparison\n...
3,What are the recommended characteristics of rh...,The provided context states that rhythmic lang...,1,## Child Development: A Holistic Approach to L...
4,What safety precautions are recommended for pl...,"During your child‚Äôs preschool years, you and y...",1,## Playground Safety Guidelines ‚Äì Fall Prevent...
5,How should an adult engage in 'self-expression...,The provided context doesn‚Äôt offer specific gu...,1,## Exploring Self-Expression with a Non-Verbal...
6,What is the purpose of the 'Box with Tray and ...,The ‚ÄòBox with Tray and Ball‚Äô activity is desig...,1,## The ‚ÄúBox with Tray and Ball‚Äù ‚Äì Eight-Month-...
7,By what age do most toddlers master at least f...,Most toddlers master at least fifty spoken wor...,1,## Longitudinal Toddler Language Acquisition: ...
8,What physical changes occur in a toddler's fac...,"As a toddler becomes more active, their arms a...",1,## Toddler Physical Activity & Morphological C...
9,What is the 'Gobbi mobile' and how is it const...,The ‚ÄòGobbi mobile‚Äô is a colour gradation of 5 ...,1,## Gobbi Mobile: A Comprehensive Overview\n\n*...


In [18]:
# LLM Judge
# Pull variables from a .env file into the process environment, then build the
# OpenAI client used for scoring with the key found under OPENAI_API_KEY.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

In [19]:
def llm_score(question, expected, predicted):
    """Ask the judge model to grade one predicted answer against the expected one.

    Args:
        question: The question that was asked.
        expected: The reference answer.
        predicted: The answer produced by the RAG pipeline.

    Returns:
        The judge's reply parsed from JSON. The prompt requests an object of
        the form ``{"score": <0-100>, "justification": "<short explanation>"}``.

    Raises:
        json.JSONDecodeError: If the reply is still not valid JSON after a
            surrounding Markdown code fence has been removed.
    """
    prompt = f"""You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system. 
    Question:
    {question}  
    Expected Answer:
    {expected}
    Model Answer:
    {predicted}

    Score the model answer from 0 to 100 based on:
    1. Factual correctness
    2. Completeness
    3. Faithfulness to the source

    Return ONLY a JSON object:
    {{"score": <0-100>, "justification": "<short explanation>"}}
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        # JSON mode: ask the API for a syntactically valid JSON object instead
        # of relying on the prompt wording alone.
        response_format={"type": "json_object"},
    )

    raw = (response.choices[0].message.content or "").strip()

    # Defensive: unwrap a ```json ... ``` fence if the reply carries one.
    if raw.startswith("```"):
        raw = raw.strip("`").strip()
        if raw[:4].lower() == "json":
            raw = raw[4:]

    return json.loads(raw)

# Evaluation function
def test2_rag(dataset_path, version=2, TOP_K=10):
    with open(dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)

    # 2. –ó–∞–≥—Ä—É–∂–∞–µ–º —Å–æ—Ö—Ä–∞–Ω—ë–Ω–Ω—ã–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã RAG
    csv_path = f"C:\\Users\\tomir\\Desktop\\EPAM\\epam_train\\Module 4\\test_results\\first_test_version{version}.csv"
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"RAG results not found: {csv_path}")

    df = pd.read_csv(csv_path)

    # sanity check
    assert len(dataset) == len(df), "Dataset and RAG results size mismatch"

    # LLM score
    results = []
    for item, row in zip(dataset, df.to_dict(orient="records")):
        judge = llm_score(item["question"], item["expected_answer"], row["answer"])
        results.append({
            "id": item["id"],
            "question": item["question"],
            "predicted_answer": row["answer"],
            "expected_answer": item["expected_answer"],
            "score": judge["score"],
            "justification": judge["justification"]
        })

    # Save json
    output_json = f"C:\\Users\\tomir\\Desktop\\EPAM\\epam_train\\Module 4\\test_results\\rag_evaluation_v{version}.json"
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"Results saved: {output_json}")

    return results

In [23]:
# Pipeline versions to evaluate and the (glob) path of the expected-answers file.
VERSIONS = [1, 2, 3, 4, 5]
DATASETS_PATH = "C:\\Users\\tomir\\Desktop\\EPAM\\epam_train\\Module 4\\test_results\\ai_response_expected.json"

all_results = []
average_scores = {}

dataset_paths = glob.glob(DATASETS_PATH)
if not dataset_paths:
    # Without this notice an unmatched path would just print an empty summary.
    print(f"No dataset found for: {DATASETS_PATH}")

for dataset_path in dataset_paths:
    print(f"\U0001F4C4 Evaluating dataset: {dataset_path}")

    for version in VERSIONS:
        print(f"Version {version}")
        results = test2_rag(
            dataset_path=dataset_path,
            version=version
        )
        # Calculate average score for this version (0 when nothing was scored).
        # NOTE: keyed by version only, so a later dataset would overwrite the
        # value stored for an earlier one.
        avg_score = sum(r["score"] for r in results) / len(results) if results else 0
        average_scores[version] = avg_score

        all_results.extend(results)


# average scores per version
print("Average Scores per Version:")
for version, avg in average_scores.items():
    print(f"Version {version}: {avg:.2f}")

üìÑ Evaluating dataset: C:\Users\tomir\Desktop\EPAM\epam_train\Module 4\test_results\ai_response_expected.json
Version 1
Results saved: C:\Users\tomir\Desktop\EPAM\epam_train\Module 4\test_results\rag_evaluation_v1.json
Version 2
Results saved: C:\Users\tomir\Desktop\EPAM\epam_train\Module 4\test_results\rag_evaluation_v2.json
Version 3
Results saved: C:\Users\tomir\Desktop\EPAM\epam_train\Module 4\test_results\rag_evaluation_v3.json
Version 4
Results saved: C:\Users\tomir\Desktop\EPAM\epam_train\Module 4\test_results\rag_evaluation_v4.json
Version 5
Results saved: C:\Users\tomir\Desktop\EPAM\epam_train\Module 4\test_results\rag_evaluation_v5.json
Average Scores per Version:
Version 1: 40.00
Version 2: 32.25
Version 3: 42.75
Version 4: 54.00
Version 5: 49.50
