In [2]:
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [3]:
from langchain_ollama import OllamaLLM

# Local judge model served through Ollama.
# temperature=0 asks the model to decode greedily instead of sampling, so the
# same question/context/answer triple should receive the same verdict when the
# notebook is re-run (as far as the backend allows).
judge = OllamaLLM(model="gemma2:2b", temperature=0)

In [4]:
# Grading prompt sent to the judge model.
# The {question}, {context} and {answer} placeholders are filled in with
# str.format() before the prompt is sent. The text is assembled from adjacent
# string literals, one per line of the final prompt (including the leading
# and trailing newline).
prompt_template = (
    "\n"
    "You are an evaluator that checks if an answer is grounded in the reference context.\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Reference context:\n"
    "{context}\n"
    "\n"
    "Answer:\n"
    "{answer}\n"
    "\n"
    "Evaluate if the answer is faithful to the context.\n"
    "Reply only with one of the following:\n"
    '- "Faithful" (if it is accurate and grounded)\n'
    '- "Not faithful" (if it invents or contradicts information)\n'
)

In [5]:
# Example 1: an answer that paraphrases the reference context.
question = "What is LangChain?"
context = "LangChain is a Python framework for building applications using LLMs."
answer = "LangChain is a Python library that helps in developing LLM-based applications."

# Ask the local judge model whether the answer is grounded in the context.
# The raw reply can carry trailing spaces/newlines around the label, so it is
# stripped to leave just the verdict text for printing or later comparison.
grade = judge.invoke(
    prompt_template.format(question=question, context=context, answer=answer)
).strip()
print("Evaluation Result:", grade)

Evaluation Result: Faithful 



In [6]:
# Example 2: an answer that does not actually address the question.
question = "What is LangChain?"
context = "LangChain is a Python framework for building applications using LLMs."
answer = "LangChain components"

# Ask the local judge model whether the answer is grounded in the context.
# The raw reply can carry trailing spaces/newlines around the label, so it is
# stripped to leave just the verdict text for printing or later comparison.
grade = judge.invoke(
    prompt_template.format(question=question, context=context, answer=answer)
).strip()
print("Evaluation Result:", grade)

Evaluation Result: Not faithful 



In [7]:
from langchain_ollama import OllamaLLM

# The question to ask and the reference answer the model output is compared with.
question = "What is LangChain?"
reference_answer = (
    "LangChain is a Python framework for building applications "
    "that use large language models."
)

# Local generator model (other Ollama models such as llama3 or mistral can be swapped in).
llm = OllamaLLM(model="gemma2:2b")

# Build the prompt and let the model produce its answer.
prompt = "Answer this question:\n" + question
generated_answer = llm.invoke(prompt)

# Show both texts, one after the other.
print("Generated Answer:", generated_answer)
print("Reference Answer:", reference_answer)


Generated Answer: LangChain is a powerful framework for building applications powered by large language models (LLMs).  Think of it as **a toolbox** specifically designed to make working with these complex AI systems easier and more efficient.

Here's a breakdown: 

**What it does:**

* **Connects LLMs to other data sources:** LangChain allows you to easily combine the vast knowledge of LLMs (like GPT-3) with real-world information, like databases or documents. This means your applications can access relevant and updated details beyond just the text generated by the LLM. 
* **Facilitates complex workflows:** It's not just about simple questions and answers. LangChain helps you build chains of actions – a sequence where one action triggers another, resulting in a more complex process (like summarizing a document before asking follow-up questions).
* **Improves user experience:** LangChain adds features like chat interfaces, document management systems, and question answering engines, ma

In [8]:
import re

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def _bleu_tokens(text):
    """Return the lower-cased word tokens of ``text``.

    Punctuation and markdown markers (``**``, ``*``, parentheses, ...) are
    dropped, so "models." and "Models" both become "models". Apostrophes inside
    a word are kept ("it's" stays one token).
    """
    return re.findall(r"\w+(?:'\w+)*", text.lower())


# BLEU scores word-level n-gram overlap. A plain whitespace split leaves case
# and punctuation glued to the words, so identical words fail to match;
# normalise both texts the same way before scoring.
reference = [_bleu_tokens(reference_answer)]  # sentence_bleu expects a list of reference token lists
candidate = _bleu_tokens(generated_answer)

# Smoothing keeps the score from collapsing to zero when some n-gram order has
# no overlap at all between candidate and reference.
smooth_fn = SmoothingFunction().method1
bleu = sentence_bleu(reference, candidate, smoothing_function=smooth_fn)

print(f"\nBLEU Score: {bleu:.2f}")


BLEU Score: 0.01
