In [1]:
# Base Imports
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

# RAG Imports
import chromadb
import langchain
import langchainhub

# API ENV Imports
import os
from dotenv import load_dotenv
load_dotenv() 

True

In [2]:
# Embedding Function Import
import chromadb.utils.embedding_functions as embedding_functions
load_dotenv()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key_env_var="OPEN_AI_API_KEY",
                model_name="text-embedding-3-small"
            )
df_corpus = pd.read_csv("Data520Corpus.csv")
chroma_client = chromadb.PersistentClient("./localCorpus")

In [3]:
df_corpus = df_corpus.fillna("NaN")
df_corpus.to_csv("CleanCorpus.csv")

In [4]:
# Chroma Collection
collection1 = chroma_client.get_or_create_collection(name = "plantCorpus", embedding_function=openai_ef)

In [5]:
# CSV Querying (We will query each row in our Corpus as a seperate document)
from langchain_community.document_loaders import CSVLoader
loader = CSVLoader(file_path="CleanCorpus.csv",
                   encoding="utf-8")
documents = loader.load()

documents_to_add = documents[1:]  # Skip row 0 (the headers in original csv)

# Prepare data
all_contents = [doc.page_content for doc in documents_to_add]
all_metadatas = [doc.metadata for doc in documents_to_add]
all_ids = [f"plant_{i}" for i in range(len(documents_to_add))]

# Add to ChromaDB
collection1.add(
    documents=all_contents,
    metadatas=all_metadatas,
    ids=all_ids
)

In [6]:
collection1.peek()

{'ids': ['plant_0',
  'plant_1',
  'plant_2',
  'plant_3',
  'plant_4',
  'plant_5',
  'plant_6',
  'plant_7',
  'plant_8',
  'plant_9'],
 'embeddings': array([[ 0.03072607,  0.00280956,  0.06021716, ...,  0.00032399,
         -0.00184628,  0.01327593],
        [-0.000316  ,  0.01039302,  0.05567913, ...,  0.00421339,
          0.02272108,  0.01747775],
        [ 0.00710597, -0.0019561 ,  0.02388378, ...,  0.01023936,
          0.04368631,  0.01343916],
        ...,
        [-0.01771339, -0.01512244,  0.06028454, ..., -0.03773556,
          0.00674674,  0.04063435],
        [ 0.03344809, -0.0093409 ,  0.02935345, ...,  0.00417461,
          0.00749191,  0.02259729],
        [ 0.01522771,  0.03029722,  0.06291485, ..., -0.00327956,
          0.02504992,  0.02040909]], shape=(10, 1536)),
 'documents': [': 1\nPlant ID: 2\nCommon Name: Golden Leatherfern / Mangrove Fern\nScientific Name: Acrostichum aureum\nLocal Name (If Applicable): NaN\nRegion: NaN\nClimate Requirements: Prefers warm tr

In [23]:
collection1.count()

14

In [24]:
# Testing collection querying

collection1.query(
    query_texts=["what needs does tiny perwinkle have if i grow it?"],
    n_results= 2
)

{'ids': [['plant_3', 'plant_9']],
 'embeddings': None,
 'documents': [[': 4\nPlant ID: 5\nCommon Name: Tiny Periwinkle\nScientific Name: Catharanthus roseus\nLocal Name (If Applicable): NaN\nRegion: NaN\nClimate Requirements: Warm tropical/subtropical; tolerant and hardy with low nutrient/water demands; occurs from sea level–900 m; found in open woods, shrublands, grasslands, disturbed sites, roadsides, beaches, and limestone rocks\nSoil Type: Well-drained soils; grows on varied substrates including limestone\nSun Light Needs: Full sun to partial shade\nWater Needs: Average overall; water moderately in growing season; sparingly in winter\nGrowth Rate: up to ~3 ft\nEcological Role: Insect-pollinated; adaptable pioneer in disturbed areas; supports urban/roadside greening\nTraditional Uses: Medicinal use in Ayurveda/Folk/Modern contexts; used for organ-specific disorders incl. cancer, diabetes, hypertension; also cultivated as an ornamental',
   ': 10\nPlant ID: 11\nCommon Name: Small wat

In [25]:
test_query = "Plants in india?"

results = collection1.query(
    query_texts=[test_query],
    n_results=3
)

In [26]:
retrieved_documents = results['documents'][0]

print("Query Results")
for doc in retrieved_documents:
    print(doc)
    print("\n") # Add a separator

Query Results
: 6
Plant ID: 7
Common Name: Hiptage, Helicopter flower
Scientific Name: Hiptage benghalensis
Local Name (If Applicable): NaN
Region: NaN
Climate Requirements: Grows in damp places; needs presence of other trees in vicinity
Soil Type: NaN
Sun Light Needs: NaN
Water Needs: Average
Growth Rate: 6–10 ft (vine)
Ecological Role: Forms woody creepers; spreads rapidly forming thickets and smothering vegetation; considered a weed in many regions
Traditional Uses: Medicinal: used for rheumatism, scabies, asthma, skin complaints & ulcers, inflammation, cough; systems: Ayurveda, Folk Medicine; parts used: root, bark, flower.


: 9
Plant ID: 10
Common Name: Mahua
Scientific Name: Madhuca longifolia
Local Name (If Applicable): NaN
Region: NaN
Climate Requirements: Tropical
Soil Type: NaN
Sun Light Needs: NaN
Water Needs: NaN
Growth Rate: Fast-growing; ~20 m tall (more than 10 ft)
Ecological Role: Spreading root system used to prevent soil erosion
Traditional Uses: Human consumption (e

# RAG Framework

### Retrieval Step - Intialize a vector_store from LangChain using Chroma

In [3]:
from langchain_community.document_loaders import CSVLoader
from langchain_openai import OpenAIEmbeddings 
from langchain_chroma import Chroma
import os
from dotenv import load_dotenv
load_dotenv() 
openAI_embed = OpenAIEmbeddings(model = "text-embedding-3-large", api_key = os.getenv("OPEN_AI_API_KEY"))
loader = CSVLoader(
    file_path="CleanCorpus.csv",
    encoding = "utf-8"
)
documents = loader.load()

vector_store = Chroma.from_documents(
    documents=documents, 
    embedding=openAI_embed,
    collection_name="plantCorpus_langchain"
)

### Dynamic prompt combining Retrieval 


In [4]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    user_query = request.state["messages"][-1].text
    corpus_context = vector_store.similarity_search(query=user_query, k = 4)

    corpus_content = "\n".join(doc.page_content for doc in corpus_context)
    prompt = """
## Role and Goal
You are an expert landscaping and agricultural assistant with deep, practical knowledge of the Indo-Gangetic plains. Your goal is to provide accurate, specific answers to user queries about plants.

## Source of Truth: Plant Corpus
You will be given context from a specialized `Plant Corpus`. This corpus is your **most valuable source of info and truth** for alot of plant-specific data that you should aim to heavilky use and complement with your research or output

## Core Instructions and Rules

1.  **Prioritize Context:** You are HIGHLY INCENTIVIZED AND ENCOURAGED to base a significant portion of plant facts, recommendations, and data **directly** on the provided `Plant Corpus` context. Do not use your general knowledge if it conflicts with the context. That being said, your general knowledge is still valuable, but make sure to not conflict corpus.


2.  **Synthesize, Don't Just Repeat:** When answering, intelligently synthesize information from the context's fields. For example, to answer "what plant should I grow," you MUST combine `Climate Requirements`, `Soil Type`, and `Water Needs` to explain *why* a plant is suitable. That being said make sure your response is robust and detailed, not sacrificing depth.


3.  **Handle Missing Information (Critical):** If the user asks about a plant, region, or topic that is **not** in the provided context, you MUST clearly state that the information is not available in your specialized corpus. **DO NOT** use your general knowledge to invent an answer for a plant not in list.


4.  **Use Your Persona:** After you have provided the core facts *from the corpus*, you may add a practical tips based on your persona (e.g., "In my experience on the plains, this plant also helps with soil erosion," or "Be sure to protect it from...").


5.  **Corpus Structure (For Your Reference):**
    `Plant ID`, `Common Name`, `Scientific Name`, `Local Name (If Applicable)`, `Region`, `Climate Requirements`, `Soil Type`, `Sun Light Needs`, `Water Needs`, `Growth Rate`, `Ecological Role`, `Traditional Uses`

6. **Retrieved Context**:
"""
    system_message = (prompt + corpus_content)
    return system_message

<frozen abc>:106: LangGraphDeprecatedSinceV10: AgentStatePydantic has been moved to `langchain.agents`. Please update your import to `from langchain.agents import AgentStatePydantic`. Deprecated in LangGraph V1.0 to be removed in V2.0.


In [5]:
import os
from langchain.chat_models import init_chat_model
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPEN_AI_API_KEY");

model = init_chat_model("gpt-4.1")

#### Create Agents

In [15]:
from langchain.agents import AgentState, create_agent
from langchain_core.messages import HumanMessage

agent = create_agent(model, tools=[], middleware = [prompt_with_context]) 
noRAG_agent = create_agent(model, tools=[], middleware = []) 

### Compare Model + Rag with Model with no Rag

In [7]:
query = "what plants would be reccomended to grow in my region (Indo-Gangetic Plain) where my conditions for soil are usually bad nutrients and not alot of water?"
for step in agent.stream(
    {"messages": [{"role": "user", "content": query}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


what plants would be reccomended to grow in my region (Indo-Gangetic Plain) where my conditions for soil are usually bad nutrients and not alot of water?

Based on the provided Plant Corpus and your conditions on the Indo-Gangetic Plain—specifically, **poor/nutrient-deficient soil** and **low water availability**—the best recommendation is **Neem (Azadirachta indica)**.

### Why Neem is Suitable for Your Conditions (Corpus-Based Synthesis):

- **Climate Requirements:** Neem thrives in tropical to subtropical climates, which matches the Indo-Gangetic plain.
- **Soil Type:** Prefers well-drained sandy to loamy soils, but is notably tolerant of poor and dry soils. This means low soil fertility is not a major barrier.
- **Water Needs:** Neem is highly drought-resistant and does not require much water; it’s one of the hardiest trees for dry regions.
- **Ecological Role:** Improves soil fertility over time, can provide shade and wind protection, and acts as a natural pest repellent—especial

In [19]:
for step in noRAG_agent.stream(
    {"messages": [{"role": "user", "content": query}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


what plant would be reccomended to grow in my region (Indo-Gangetic Plain) where my conditions for soil are usually bad nutrients and not alot of water?

In the Indo-Gangetic Plain, you generally have fertile alluvial soils, but if you’re facing poor soil nutrients and water scarcity, it’s wise to select hardy, drought-tolerant, and low input crops or plants.

**Recommended Plants/Crops:**

### 1. **Millets**
- **Pearl millet (Bajra)** and **finger millet (Ragi)** are **excellent choices**. They thrive in nutrient-poor soils and require less water compared to rice or wheat.
- **Foxtail millet** is another option.

### 2. **Pulses**
- **Mung bean (Moong dal)** and **black gram (Urad dal)** are drought-resistant and can fix nitrogen, which improves soil fertility.
- **Chickpea (Gram/Chana)** is also well-suited for dry conditions.

### 3. **Oilseeds**
- **Sesame (Til)** is resilient to both drought and poor nutrient soils.
- **Mustard** grows with less water than many other oilseeds.

#

### Prompt Comparisons

In [17]:
## Assume we have promtps stored in some list, called promptList

user_queries = [
    "Our village pond dries halfway in winter but floods in the monsoon. What native plants can survive this cycle and help keep the water clean?",
    "We need plants for a roadside planting project that can tolerate heat, dust, and very little watering. What hardy native species would work?",
    "My kitchen garden is in partial shade. Which local medicinal plants can I grow for everyday use?",
    "After the monsoon, our community center’s soil erodes badly along the edges. Which deep-rooted native trees or shrubs can stop this erosion?",
    "We want to create a small shaded seating area outside the panchayat office. Can you suggest plants that are culturally respected and provide good shade?",
    "Our farmland has begun to show salinity from over-irrigation. Are there native plants that can tolerate mild salt and still grow well?",
    "We want to teach our children about traditional plants. Which species have strong cultural or folk stories connected to this region?",
    "Which fast-growing plants can form a boundary hedge around our school that are safe for children and do not spread aggressively?",
    "Our hand-pump area gets muddy and slippery in the rainy season. Are there native groundcovers that can handle moisture but won’t attract pests?",
    "We want to plant something near our temple that has religious significance and supports birds or pollinators. What would you recommend?"
]

input_queries = [
    {"messages": [HumanMessage(content=prompt)]} 
    for prompt in user_queries
]

rag_response_list = agent.batch(
    inputs=input_queries,
    config={
        "max_concurrency": 2
    }
)
nonrag_response_list = noRAG_agent.batch(
    input_queries,
    config={
        "max_concurrency": 2
    }
)
# We gather all prompts responses in each respective response_list

In [24]:
def export_responses_to_json(queries, response_list, filename):
    """
    Processes a list of agent state dictionaries (the batch output) and 
    exports them as a list of {query: ..., response: ...} objects to a JSON file.
    """
    data_to_export = []
    
    # Use the number of responses to control the loop, as this is the actual data we have
    length = len(response_list)

    for i in range(length):
        response_state = response_list[i]
        
        try:
            # Extract the original user query from the FIRST message in the history
            original_query = response_state["messages"][0].content
            
            # Extract the final AI response from the LAST message in the history
            ai_response_text = response_state["messages"][-1].content
            
        except Exception as e:
            original_query = queries[i] if i < len(queries) else f"Unknown Query at index {i}"
            ai_response_text = f"Error extracting response for index {i}: {e}"
        
        data_to_export.append({
            "query": original_query,
            "response": ai_response_text
        })
        
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data_to_export, f, ensure_ascii=False, indent=4)



In [25]:
import json

export_responses_to_json(user_queries, rag_response_list, filename="ragListJson")
export_responses_to_json(user_queries, nonrag_response_list, filename="nonragListJson")