### Imports

In [1]:
import os
import json
import time
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# RAG & LangChain Imports
import chromadb
import langchain
import langchainhub
from langchain_community.document_loaders import CSVLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.chat_models import init_chat_model
from langchain.agents import AgentState, create_agent
from langchain.agents.middleware import dynamic_prompt, ModelRequest
from langchain_core.messages import SystemMessage, HumanMessage

# Evaluation Metrics Imports
import krippendorff
from bert_score import score as bert_score
from ragas.metrics import answer_relevancy
from ragas import evaluate
from datasets import Dataset

<frozen abc>:106: LangGraphDeprecatedSinceV10: AgentStatePydantic has been moved to `langchain.agents`. Please update your import to `from langchain.agents import AgentStatePydantic`. Deprecated in LangGraph V1.0 to be removed in V2.0.


### RAG Framework

In [2]:
# Load environment variables from a local .env file; override=True makes values
# from .env replace variables that are already set in the process environment.
load_dotenv(override=True)
openAI_embed = OpenAIEmbeddings(model="text-embedding-3-large")

# CleanCorpus.csv is derived from Corpus.csv. Build it here when it is missing so
# that this cell also works on a fresh checkout (Restart Kernel -> Run All) instead
# of depending on a file that has to exist from an earlier session.
if not os.path.exists("CleanCorpus.csv"):
    # index=False keeps the pandas row index out of the file, so it is not
    # embedded as an extra unnamed field of every document.
    pd.read_csv("Corpus.csv").fillna("NaN").to_csv("CleanCorpus.csv", index=False)

# CSVLoader yields one Document per CSV row.
loader = CSVLoader(
    file_path="CleanCorpus.csv",
    encoding="utf-8"
)
documents = loader.load()

vector_store = Chroma.from_documents(
    documents=documents,
    embedding=openAI_embed,
    # Stable per-row ids make ingestion idempotent: adding a row whose id already
    # exists in the collection replaces it rather than storing a second copy, so
    # re-running this cell does not fill the retriever with duplicates.
    ids=[f"plant-row-{row}" for row in range(len(documents))],
    collection_name="plantCorpus_langchain"
)

# System prompt for the RAG agent. `{corpus_content}` is the only placeholder; it is
# filled via str.format() with the retrieved corpus rows before each model call.
# NOTE(review): rule 3 first requires the model to say the information is missing and
# then allows it to "invent an answer" from general knowledge, which conflicts with the
# "single source of truth" statement above it -- confirm this wording is intentional.
SYSTEM_PROMPT_TEMPLATE = """
## Role and Goal
You are an expert landscaping and agricultural assistant with deep, practical knowledge of the Indo-Gangetic plains. Your goal is to provide accurate, specific answers to user queries about plants.

## Source of Truth: Plant Corpus
You will be given context from a specialized `Plant Corpus`. This corpus is your **single source of truth** for all plant-specific data.

### Retrieved Context:
{corpus_content}

## Core Instructions and Rules

1.  **Prioritize Context:** You MUST base all plant facts, recommendations, and data **directly** on the provided `Plant Corpus` context. Do not use your general knowledge if it conflicts with the context.

2.  **Synthesize, Don't Just Repeat:** When answering, intelligently synthesize information from the context's fields. For example, to answer "what plant should I grow," you MUST combine `Climate Requirements`, `Soil Type`, and `Water Needs` to explain *why* a plant is suitable.

3.  **Handle Missing Information (Critical):** If the user asks about a plant, region, or topic that is **not** in the provided context, you MUST clearly state that the information is not available in your specialized corpus. However, you can use your general knowledge to invent an answer for a plant not in list.

4.  **Use Your Persona:** After you have provided the core facts *from the corpus*, you may add a brief, practical tip based on your persona (e.g., "In my experience on the plains, this plant also helps with soil erosion," or "Be sure to protect it from...").

5.  **Corpus Structure (For Your Reference):**
    `Plant ID`, `Common Name`, `Scientific Name`, `Local Name (If Applicable)`, `Region`, `Climate Requirements`, `Soil Type`, `Sun Light Needs`, `Water Needs`, `Growth Rate`, `Ecological Role`, `Traditional Uses`
"""

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Build the system prompt for a model call with retrieved corpus rows embedded.

    Runs a similarity search on `vector_store` using the text of the last message
    in the agent state, joins the page contents of the hits with newlines, and
    substitutes the result into `SYSTEM_PROMPT_TEMPLATE`.

    Args:
        request: The model request; only `request.state["messages"]` is read.

    Returns:
        The formatted system prompt string.
    """
    latest_message = request.state["messages"][-1]
    retrieved_docs = vector_store.similarity_search(query=latest_message.text)
    passages = [doc.page_content for doc in retrieved_docs]
    return SYSTEM_PROMPT_TEMPLATE.format(corpus_content="\n".join(passages))

# Chat model shared by both agents below.
model = init_chat_model(model="gpt-4.1")

# RAG agent: the dynamic-prompt middleware injects retrieved corpus rows into the
# system prompt on every model call.
agent = create_agent(model=model, tools=[], middleware=[prompt_with_context])

# Baseline agent: same model, no tools and no middleware (no retrieved context).
noRAG_agent = create_agent(model=model, tools=[], middleware=[])

In [3]:
# Clean the raw corpus: missing cells become the literal string "NaN" so that every
# field of every row carries a value in the exported CSV.
df_corpus = pd.read_csv("Corpus.csv")
df_corpus = df_corpus.fillna("NaN")
# index=False keeps the pandas row index out of the file; otherwise it is written as
# an unnamed first column and ends up as a meaningless field in every loaded document.
df_corpus.to_csv("CleanCorpus.csv", index=False, encoding="utf-8")

openAI_embed = OpenAIEmbeddings(model="text-embedding-3-large")

# CSVLoader yields one Document per CSV row.
loader = CSVLoader(
    file_path="CleanCorpus.csv",
    encoding="utf-8"
)
documents = loader.load()

vector_store = Chroma.from_documents(
    documents=documents,
    embedding=openAI_embed,
    # Stable per-row ids make ingestion idempotent: adding a row whose id already
    # exists in the collection replaces it rather than storing a second copy, so
    # re-running this cell does not fill the retriever with duplicates.
    ids=[f"plant-row-{row}" for row in range(len(documents))],
    collection_name="plantCorpus_langchain"
)

print(f"Ingested {len(documents)} documents into ChromaDB.")

Ingested 25 documents into ChromaDB.


In [None]:
# Judge personas: evaluator name -> persona text placed at the top of the judge's
# system prompt. Each persona is written as fragments joined with single spaces.
evaluator_personas = {
    evaluator: " ".join(fragments)
    for evaluator, fragments in (
        ("Evaluator_1_Landscape_Architect", (
            "You are a Senior Landscape Architect for US Embassies.",
            "You prioritize aesthetics, formal structure, and low maintenance.",
            "You are strict about visual appeal and infrastructure safety.",
        )),
        ("Evaluator_2_Local_Botanist", (
            "You are a PhD Botanist specializing in the Indo-Gangetic Plain.",
            "You care deeply about scientific accuracy, correct Latin names,",
            "and specific soil/climate requirements. You dislike vague answers.",
        )),
        ("Evaluator_3_Cultural_Historian", (
            "You are an expert in Indian Ethnobotany and Folklore.",
            "You focus on cultural relevance, traditional uses (Ayurveda),",
            "and local naming conventions. You want to see cultural depth.",
        )),
        ("Evaluator_4_Sustainability_Officer", (
            "You are a Sustainability Officer focused on water conservation.",
            "You heavily penalize plants that require too much water or fertilizer.",
            "You prioritize ecological suitability and native species.",
        )),
        ("Evaluator_5_General_Resident", (
            "You are a homeowner in the region with average gardening skills.",
            "You care about 'Overall Helpfulness' and simple, clear advice.",
            "You find overly technical jargon unhelpful.",
        )),
    )
}

# Metric keys the judge must return, each as an integer on the 1-5 Likert scale.
_LIKERT_METRICS = (
    "Factual_Accuracy",
    "Ecological_Suitability",
    "Cultural_Relevance",
    "Overall_Helpfulness",
)


def _response_text(content):
    """Flatten message content (a string or a list of content blocks) to plain text."""
    if isinstance(content, str):
        return content
    parts = []
    for block in content or []:
        if isinstance(block, str):
            parts.append(block)
        elif isinstance(block, dict) and isinstance(block.get("text"), str):
            parts.append(block["text"])
    return "".join(parts)


def _coerce_likert(value):
    """Return `value` as an int in 1..5, or None when it is not a valid Likert score."""
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        value = value.strip()
        if not value.isdigit():
            return None
        value = int(value)
    elif isinstance(value, float):
        if not value.is_integer():
            return None
        value = int(value)
    if isinstance(value, int) and 1 <= value <= 5:
        return value
    return None


def _parse_grade(text):
    """Extract the judge's JSON object from `text` and validate its scores.

    Raises ValueError (json.JSONDecodeError is a subclass) when no valid grade
    can be recovered.
    """
    # Slice from the first "{" to the last "}" so markdown fences or prose that the
    # judge adds around the JSON do not break parsing.
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end <= start:
        raise ValueError("no JSON object found in judge response")
    grade = json.loads(text[start:end + 1])
    if not isinstance(grade, dict):
        raise ValueError("judge response is not a JSON object")
    for metric in _LIKERT_METRICS:
        score = _coerce_likert(grade.get(metric))
        if score is None:
            raise ValueError(
                f"{metric} is missing or not an integer from 1 to 5: {grade.get(metric)!r}"
            )
        grade[metric] = score
    return grade


def get_evaluation(judge_persona, user_query, model_response):
    """Ask the judge model to grade one response from the viewpoint of one persona.

    Args:
        judge_persona: Persona text placed at the top of the judge's system prompt.
        user_query: The user question that was answered.
        model_response: The answer being graded.

    Returns:
        A dict with integer scores (1-5) under "Factual_Accuracy",
        "Ecological_Suitability", "Cultural_Relevance" and "Overall_Helpfulness",
        plus whatever else the judge returned (e.g. "Reasoning"); or None when the
        model call fails or the reply does not contain a valid grade. Failures are
        printed, not raised, so a long grading run can continue.
    """
    system_prompt = f"""
    {judge_persona}
    
    You are evaluating an AI assistant's response to a user query about plants in the Indo-Gangetic Plains.
    
    Please rate the response on these 4 metrics using a 1-5 Likert Scale:
    1. Factual Accuracy (1=Hallucinated/Wrong, 5=Highly Accurate/Cited)
    2. Ecological Suitability (1=Invasive/Deadly, 5=Perfect for Region)
    3. Cultural Relevance (1=Generic/Ignorant, 5=Culturally Insightful/Local Context)
    4. Overall Helpfulness (1=Useless, 5=Very Helpful)
    
    RETURN JSON ONLY in this format:
    {{
        "Factual_Accuracy": int,
        "Ecological_Suitability": int,
        "Cultural_Relevance": int,
        "Overall_Helpfulness": int,
        "Reasoning": "Short explanation (max 1 sentence)"
    }}
    """
    user_content = f"""USER QUERY = {user_query}
                    AI RESPONSE = {model_response}"""

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=user_content)
    ]

    # Report API/transport failures separately from malformed replies so the log
    # shows which of the two actually went wrong.
    try:
        response = model.invoke(messages)
    except Exception as e:
        print(f"Error calling judge model: {e}")
        return None

    try:
        return _parse_grade(_response_text(response.content))
    except ValueError as e:
        print(f"Error parsing JSON: {e}")
        return None

In [4]:
# Load the previously generated answers of both systems. Each file is expected to
# hold a list of records with 'query' and 'response' keys (those are the keys read
# below).
try:
    with open("rag.json", 'r', encoding="utf-8") as rag_file:
        rag_data = json.load(rag_file)
    with open("norag.json", 'r', encoding="utf-8") as norag_file:
        norag_data = json.load(norag_file)
    print(f"Loaded {len(rag_data)} RAG responses and {len(norag_data)} NoRAG responses.")
except FileNotFoundError:
    print("rag.json or norag.json not found")
    rag_data, norag_data = [], []

results = []
if rag_data and norag_data:
    # zip() stops at the shorter list; say so instead of dropping the tail silently.
    n_pairs = min(len(rag_data), len(norag_data))
    if len(rag_data) != len(norag_data):
        print(
            f"Warning: {len(rag_data)} RAG vs {len(norag_data)} NoRAG responses; "
            f"only the first {n_pairs} pairs will be evaluated."
        )

    for i, (rag_item, norag_item) in enumerate(zip(rag_data, norag_data)):
        query = rag_item['query']

        # Both answers must belong to the same question to be comparable.
        if query != norag_item['query']:
            print(f"Warning: Mismatch in queries at index {i}. Skipping.")
            continue

        print(f"Evaluating Query {i+1}/{n_pairs}")

        # Every persona grades the RAG answer first, then the No-RAG answer.
        responses_by_system = (
            ("RAG", rag_item['response']),
            ("No-RAG", norag_item['response']),
        )
        for evaluator_name, persona_prompt in evaluator_personas.items():
            for system_name, system_response in responses_by_system:
                grade = get_evaluation(persona_prompt, query, system_response)
                # get_evaluation returns None on failure; such grades are left out.
                if grade:
                    results.append({
                        "Query_ID": i+1,
                        "System": system_name,
                        "Evaluator": evaluator_name,
                        **grade
                    })
        # Brief pause between queries (presumably to stay under API rate limits).
        time.sleep(1)

    df_results = pd.DataFrame(results)
    df_results.to_csv("evaluation_results.csv", index=False)
    print("Evaluation Complete. Saved to evaluation_results.csv")




In [None]:
# Reload the judge grades and the raw answers from disk so this cell can run without
# repeating the (slow, paid) grading step.
# NOTE(review): this cell reads ragList.json / nonragList.json, whereas the grading
# cell reads rag.json / norag.json -- confirm both pairs hold the same responses.
df_results = pd.read_csv("evaluation_results.csv")
with open("ragList.json", 'r', encoding="utf-8") as rag_file:
    rag_data = json.load(rag_file)
with open("nonragList.json", 'r', encoding="utf-8") as norag_file:
    norag_data = json.load(norag_file)

# Krippendorff's Alpha
if not df_results.empty:
    print("\n=== INTER-RATER RELIABILITY (Krippendorff's Alpha) ===")
    metrics = ["Factual_Accuracy", "Ecological_Suitability", "Cultural_Relevance", "Overall_Helpfulness"]
    # One rated unit = one (query, system) pair.
    df_results['Response_ID'] = df_results['Query_ID'].astype(str) + "_" + df_results['System']

    for metric in metrics:
        try:
            pivot_table = df_results.pivot(index='Response_ID', columns='Evaluator', values=metric)
            # Transpose so that rows are evaluators and columns are rated units.
            reliability_data = pivot_table.transpose().to_numpy()
            alpha = krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='ordinal')
            print(f"{metric:<25}: {alpha:.4f}")
        except Exception as e:
            print(f"Could not calculate alpha for {metric}: {e}")

# BERTScore
df_rag = pd.DataFrame(rag_data)
df_norag = pd.DataFrame(norag_data)
print("\n=== BERTScore (Semantic Divergence) ===")
if rag_data and norag_data:
    # bert_score picks its default model from a two-letter language code. "en" selects
    # the English default; an unrecognised code such as "eng" silently falls back to
    # the multilingual model and yields scores from the wrong model.
    P, R, F1 = bert_score(
        df_rag['response'].to_list(),
        df_norag['response'].to_list(),
        lang="en",
        verbose=False
    )
    df_rag['BERTScore_Similarity'] = F1.numpy()
    print(f"Avg Similarity to Base Model: {df_rag['BERTScore_Similarity'].mean():.3f}")

# RAGAS
print("\n=== RAGAS (Answer Relevancy) ===")
if rag_data:
    gpt4_llm = ChatOpenAI(model="gpt-4o", temperature=0)
    ragas_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    ragas_dataset = Dataset.from_dict({
        'question': df_rag['query'].tolist(),
        'answer': df_rag['response'].tolist(),
        # Retrieved contexts were not stored with the answers, so an empty
        # placeholder context is supplied for every row.
        'contexts': [[''] for _ in range(len(df_rag))]
    })

    ragas_results = evaluate(
        dataset=ragas_dataset,
        metrics=[answer_relevancy],
        llm=gpt4_llm,
        embeddings=ragas_embeddings
    )

    df_rag['Ragas_Relevancy'] = ragas_results['answer_relevancy']
    print(f"Avg Relevancy Score: {df_rag['Ragas_Relevancy'].mean():.3f}")


=== INTER-RATER RELIABILITY (Krippendorff's Alpha) ===
Factual_Accuracy         : 0.7535
Ecological_Suitability   : 0.8050
Cultural_Relevance       : 0.8399
Overall_Helpfulness      : 0.8260

=== BERTScore (Semantic Divergence) ===


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]