In [1]:
import os
import json
import time
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# RAG & LangChain Imports
import chromadb
import langchain
import langchainhub
from langchain_community.document_loaders import CSVLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.chat_models import init_chat_model
from langchain.agents import AgentState, create_agent
from langchain.agents.middleware import dynamic_prompt, ModelRequest
from langchain_core.messages import SystemMessage, HumanMessage

# Evaluation Metrics Imports
import krippendorff
from bert_score import score as bert_score
from ragas.metrics import answer_relevancy
from ragas import evaluate
from datasets import Dataset

### RAG Framework

In [3]:
# Embedding model used to index the corpus and to embed queries at search time.
openAI_embed = OpenAIEmbeddings(model="text-embedding-3-large")

# Load the plant corpus from CSV into LangChain Document objects.
loader = CSVLoader(
    file_path="CleanCorpus.csv",
    encoding="utf-8"
)
documents = loader.load()

# Ingest into Chroma (using LangChain wrapper).
# Stable, position-based ids keep this cell idempotent: without explicit ids,
# re-running it in the same kernel assigns fresh random ids and appends another
# copy of the corpus to the same collection, so similarity_search starts
# returning duplicate rows. With fixed ids a re-run overwrites the same entries.
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=openAI_embed,
    ids=[f"plantCorpus-row-{row_idx}" for row_idx in range(len(documents))],
    collection_name="plantCorpus_langchain"
)

# B. Define Dynamic Prompt Middleware
# System prompt for the RAG agent. The single `{corpus_content}` placeholder is
# filled via str.format with the retrieved corpus text, so the template must not
# contain any other literal curly braces.
# Rule 3 requires any fallback to general knowledge to be explicitly labelled,
# rather than letting the model "invent" an answer for plants outside the corpus.
SYSTEM_PROMPT_TEMPLATE = """
## Role and Goal
You are an expert landscaping and agricultural assistant with deep, practical knowledge of the Indo-Gangetic plains. Your goal is to provide accurate, specific answers to user queries about plants.

## Source of Truth: Plant Corpus
You will be given context from a specialized `Plant Corpus`. This corpus is your **single source of truth** for all plant-specific data.

### Retrieved Context:
{corpus_content}

## Core Instructions and Rules

1.  **Prioritize Context:** You MUST base all plant facts, recommendations, and data **directly** on the provided `Plant Corpus` context. Do not use your general knowledge if it conflicts with the context.

2.  **Synthesize, Don't Just Repeat:** When answering, intelligently synthesize information from the context's fields. For example, to answer "what plant should I grow," you MUST combine `Climate Requirements`, `Soil Type`, and `Water Needs` to explain *why* a plant is suitable.

3.  **Handle Missing Information (Critical):** If the user asks about a plant, region, or topic that is **not** in the provided context, you MUST clearly state that the information is not available in your specialized corpus. You may then offer general guidance from your own knowledge, but you MUST explicitly label it as general knowledge that does not come from the corpus. Never invent or guess corpus data.

4.  **Use Your Persona:** After you have provided the core facts *from the corpus*, you may add a brief, practical tip based on your persona (e.g., "In my experience on the plains, this plant also helps with soil erosion," or "Be sure to protect it from...").

5.  **Corpus Structure (For Your Reference):**
    `Plant ID`, `Common Name`, `Scientific Name`, `Local Name (If Applicable)`, `Region`, `Climate Requirements`, `Soil Type`, `Sun Light Needs`, `Water Needs`, `Growth Rate`, `Ecological Role`, `Traditional Uses`
"""

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Build the system prompt for a model call from retrieved corpus entries.

    The text of the most recent message in the agent state is used as the
    retrieval query against `vector_store`; the retrieved documents are rendered
    into the `{corpus_content}` slot of `SYSTEM_PROMPT_TEMPLATE`.

    Args:
        request: Model request passed in by the agent middleware; only
            `request.state["messages"]` is read.

    Returns:
        The fully formatted system prompt string.
    """
    user_query = request.state["messages"][-1].text
    corpus_context = vector_store.similarity_search(query=user_query)

    # Separate records with a blank line: a CSV-derived document's page_content
    # spans several "column: value" lines, so a single "\n" would run
    # neighbouring plant records together with no visible boundary.
    corpus_content = "\n\n".join(doc.page_content for doc in corpus_context)
    if not corpus_content.strip():
        # Give the model an explicit signal instead of an empty context section,
        # so the prompt's "Handle Missing Information" rule can apply.
        corpus_content = "(No matching entries were retrieved from the Plant Corpus.)"

    system_message = SYSTEM_PROMPT_TEMPLATE.format(corpus_content=corpus_content)
    return system_message

# C. Initialize Models & Agents
# A single chat model instance is shared by both agents below, so the middleware
# list is the only configuration that differs between them.
# Note: Ensure this model name is valid in your API, otherwise use "gpt-4o" or "gpt-4-turbo"
model = init_chat_model("gpt-4.1")

# The RAG Agent (uses the prompt_with_context middleware)
agent = create_agent(
    model=model,
    tools=[],
    middleware=[prompt_with_context],
)

# The Baseline Agent (No RAG: same model, empty middleware list)
noRAG_agent = create_agent(
    model=model,
    tools=[],
    middleware=[],
)

print("RAG System Initialized.")

API Key loaded successfully.
