In [2]:
#!pip install sentence-transformers faiss-cpu ollama pandas numpy


In [3]:
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder
import ollama


In [4]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")





Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [5]:
documents = [
    "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
    "Supervised learning uses labeled data to train models.",
    "Unsupervised learning works with unlabeled data.",
    "Reinforcement learning learns through rewards and penalties.",
    "Applications of machine learning include recommendation systems, fraud detection, natural language processing, computer vision, and predictive analytics."
]


In [6]:
embeddings = embedding_model.encode(documents)
embeddings = np.array(embeddings).astype("float32")
faiss.normalize_L2(embeddings)


In [7]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)


In [8]:
def retrieve(query, k=5):
    query_embedding = embedding_model.encode([query])
    query_embedding = np.array(query_embedding).astype("float32")
    faiss.normalize_L2(query_embedding)

    distances, indices = index.search(query_embedding, k)

    results = []
    for i, idx in enumerate(indices[0]):
        results.append({
            "text": documents[idx],
            "score": distances[0][i]
        })

    return results


In [10]:
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, retrieved_chunks):
    pairs = [(query, chunk["text"]) for chunk in retrieved_chunks]
    scores = reranker.predict(pairs)

    for i, score in enumerate(scores):
        retrieved_chunks[i]["rerank_score"] = score

    return sorted(retrieved_chunks, key=lambda x: x["rerank_score"], reverse=True)


Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [11]:
def generate_answer(query, context):
    prompt = f"""
Answer the question strictly using the provided context.
If the answer is not in the context, say "Not found in context."

Context:
{context}

Question:
{query}

Answer:
"""

    response = ollama.chat(
        model="llama3",
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": 0}
    )

    return response["message"]["content"]


In [12]:
def evaluate_faithfulness(query, context, answer):
    eval_prompt = f"""
You are evaluating a RAG system.

Context:
{context}

Question:
{query}

Answer:
{answer}

Does the answer contain any information NOT supported by the context?
Reply with:
Score: 1 (Not faithful) to 5 (Fully faithful)
Explanation:
"""

    response = ollama.chat(
        model="llama3",
        messages=[{"role": "user", "content": eval_prompt}],
        options={"temperature": 0}
    )

    return response["message"]["content"]


In [20]:
def evaluate_relevance(query, answer):
    eval_prompt = f"""
Evaluate how well the answer addresses the question.

Question:
{query}

Answer:
{answer}

Reply strictly in this format:
Score: <number between 1 and 5>
Explanation: <brief reason>
"""

    response = ollama.chat(
        model="llama3",
        messages=[{"role": "user", "content": eval_prompt}],
        options={
            "temperature": 0,
            "num_gpu": 0  # Force CPU to avoid CUDA crash
        }
    )

    return response["message"]["content"]


In [21]:
test_questions = [
    "What are applications of machine learning?",
    "Define supervised learning.",
    "Explain reinforcement learning."
]


In [22]:
results = []

for question in test_questions:
    print("\n==============================")
    print("Question:", question)

    retrieved = retrieve(question, k=5)
    reranked = rerank(question, retrieved)
    top_chunks = reranked[:3]

    context = "\n\n".join([chunk["text"] for chunk in top_chunks])

    answer = generate_answer(question, context)

    faithfulness = evaluate_faithfulness(question, context, answer)
    relevance = evaluate_relevance(question, answer)

    results.append({
        "Question": question,
        "Answer": answer,
        "Faithfulness": faithfulness,
        "Relevance": relevance
    })

    print("Answer:", answer)
    print("Faithfulness:", faithfulness)
    print("Relevance:", relevance)



Question: What are applications of machine learning?
Answer: According to the provided context, the answer is:

Recommendation systems, fraud detection, natural language processing, computer vision, and predictive analytics.
Faithfulness: Score: 5 (Fully faithful)

The answer provided contains all the applications of machine learning mentioned in the given context, and does not include any additional or unrelated information. The answer is a direct summary of the provided context, making it fully faithful.
Relevance: Score: 4
Explanation: The answer provides a list of specific applications of machine learning, which directly addresses the question. However, it would be more comprehensive if it included a brief explanation or examples for each application to further illustrate their relevance and importance.

Question: Define supervised learning.
Answer: According to the provided context, the answer is:

"Uses labeled data to train models."
Faithfulness: I'd rate this answer a 5 (Fully

In [23]:
df = pd.DataFrame(results)
df.to_csv("evaluation_report.csv", index=False)
df


Unnamed: 0,Question,Answer,Faithfulness,Relevance
0,What are applications of machine learning?,"According to the provided context, the answer ...",Score: 5 (Fully faithful)\n\nThe answer provid...,Score: 4\nExplanation: The answer provides a l...
1,Define supervised learning.,"According to the provided context, the answer ...",I'd rate this answer a 5 (Fully faithful).\n\n...,Score: 4\nExplanation: The answer provides a c...
2,Explain reinforcement learning.,Reinforcement learning learns through rewards ...,Score: 5 (Fully faithful)\n\nThe answer direct...,Score: 2\nExplanation: The answer only provide...
