In [74]:
# Base Imports
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

# RAG Imports
import chromadb
import langchain
import langchainhub

In [75]:
# Embedding Function Import
import chromadb.utils.embedding_functions as embedding_functions
load_dotenv()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key_env_var="OPEN_AI_API_KEY",
                model_name="text-embedding-3-small"
            )
df_corpus = pd.read_csv("Data520Corpus.csv")
chroma_client = chromadb.PersistentClient("./localCorpus")

In [17]:
df_corpus = df_corpus.fillna("NaN")
df_corpus.to_csv("CleanCorpus.csv")

In [76]:
# Chroma Collection
collection1 = chroma_client.get_or_create_collection(name = "plantCorpus", embedding_function=openai_ef)

In [77]:
# CSV Querying (We will query each row in our Corpus as a seperate document)
from langchain_community.document_loaders import CSVLoader
loader = CSVLoader(file_path="CleanCorpus.csv",
                   encoding="utf-8")
documents = loader.load()

documents_to_add = documents[1:]  # Skip row 0 (the headers in original csv)

# Prepare data
all_contents = [doc.page_content for doc in documents_to_add]
all_metadatas = [doc.metadata for doc in documents_to_add]
all_ids = [f"plant_{i}" for i in range(len(documents_to_add))]

# Add to ChromaDB
collection1.add(
    documents=all_contents,
    metadatas=all_metadatas,
    ids=all_ids
)

In [20]:
collection1.peek()

{'ids': ['plant_0',
  'plant_1',
  'plant_2',
  'plant_3',
  'plant_4',
  'plant_5',
  'plant_6',
  'plant_7',
  'plant_8',
  'plant_9'],
 'embeddings': array([[ 0.03072607,  0.00280956,  0.06021716, ...,  0.00032399,
         -0.00184628,  0.01327593],
        [-0.000316  ,  0.01039302,  0.05567913, ...,  0.00421339,
          0.02272108,  0.01747775],
        [ 0.00710597, -0.0019561 ,  0.02388378, ...,  0.01023936,
          0.04368631,  0.01343916],
        ...,
        [-0.01771339, -0.01512244,  0.06028454, ..., -0.03773556,
          0.00674674,  0.04063435],
        [ 0.03344809, -0.0093409 ,  0.02935345, ...,  0.00417461,
          0.00749191,  0.02259729],
        [ 0.01522771,  0.03029722,  0.06291485, ..., -0.00327956,
          0.02504992,  0.02040909]], shape=(10, 1536)),
 'documents': [': 1\nPlant ID: 2\nCommon Name: Golden Leatherfern / Mangrove Fern\nScientific Name: Acrostichum aureum\nLocal Name (If Applicable): NaN\nRegion: NaN\nClimate Requirements: Prefers warm tr

In [21]:
collection1.count()

14

In [78]:
# Testing collection querying

collection1.query(
    query_texts=["what needs does tiny perwinkle have if i grow it?"],
    n_results= 2
)

{'ids': [['plant_3', 'plant_9']],
 'embeddings': None,
 'documents': [[': 4\nPlant ID: 5\nCommon Name: Tiny Periwinkle\nScientific Name: Catharanthus roseus\nLocal Name (If Applicable): NaN\nRegion: NaN\nClimate Requirements: Warm tropical/subtropical; tolerant and hardy with low nutrient/water demands; occurs from sea level–900 m; found in open woods, shrublands, grasslands, disturbed sites, roadsides, beaches, and limestone rocks\nSoil Type: Well-drained soils; grows on varied substrates including limestone\nSun Light Needs: Full sun to partial shade\nWater Needs: Average overall; water moderately in growing season; sparingly in winter\nGrowth Rate: up to ~3 ft\nEcological Role: Insect-pollinated; adaptable pioneer in disturbed areas; supports urban/roadside greening\nTraditional Uses: Medicinal use in Ayurveda/Folk/Modern contexts; used for organ-specific disorders incl. cancer, diabetes, hypertension; also cultivated as an ornamental',
   ': 10\nPlant ID: 11\nCommon Name: Small wat

In [23]:
test_query = "Plants in india?"

results = collection1.query(
    query_texts=[test_query],
    n_results=3
)

In [24]:
retrieved_documents = results['documents'][0]

print("Query Results")
for doc in retrieved_documents:
    print(doc)
    print("\n") # Add a separator

Query Results
: 6
Plant ID: 7
Common Name: Hiptage, Helicopter flower
Scientific Name: Hiptage benghalensis
Local Name (If Applicable): NaN
Region: NaN
Climate Requirements: Grows in damp places; needs presence of other trees in vicinity
Soil Type: NaN
Sun Light Needs: NaN
Water Needs: Average
Growth Rate: 6–10 ft (vine)
Ecological Role: Forms woody creepers; spreads rapidly forming thickets and smothering vegetation; considered a weed in many regions
Traditional Uses: Medicinal: used for rheumatism, scabies, asthma, skin complaints & ulcers, inflammation, cough; systems: Ayurveda, Folk Medicine; parts used: root, bark, flower.


: 9
Plant ID: 10
Common Name: Mahua
Scientific Name: Madhuca longifolia
Local Name (If Applicable): NaN
Region: NaN
Climate Requirements: Tropical
Soil Type: NaN
Sun Light Needs: NaN
Water Needs: NaN
Growth Rate: Fast-growing; ~20 m tall (more than 10 ft)
Ecological Role: Spreading root system used to prevent soil erosion
Traditional Uses: Human consumption (e

# RAG Framework

### Retrieval Step - Intialize a vector_store from LangChain using Chroma

In [95]:
from langchain_community.document_loaders import CSVLoader
from langchain_openai import OpenAIEmbeddings 
from langchain_chroma import Chroma
import os
from dotenv import load_dotenv

load_dotenv()

openAI_embed = OpenAIEmbeddings(model = "text-embedding-3-large")
loader = CSVLoader(
    file_path="CleanCorpus.csv",
    encoding = "utf-8"
)
documents = loader.load()

vector_store = Chroma.from_documents(
    documents=documents, 
    embedding=embeddings,
    collection_name="plantCorpus_langchain"
)

### Dynamic prompt combining Retrieval 


In [104]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    user_query = request.state["messages"][-1].text
    corpus_context = vector_store.similarity_search(query=user_query)

    corpus_content = "\n".join(doc.page_content for doc in corpus_context)

    system_message = (
        "You are a landscaping and agricultural assistant. Use the following content structured from a Plant Corpus to supplement your own response for a user's query. The context is structured as follows for your understanding to pull data: Plant ID, Common Name, Scientific Name, Local Name (If Applicable), Region, Climate Requirements, Soil Type, Sun Light Needs, Water Needs, Growth Rate, Ecological Role and Traditional Uses."
        f"\n\n{corpus_content}"
        )
    return system_message

In [105]:
import os
from langchain.chat_models import init_chat_model
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPEN_AI_API_KEY");

model = init_chat_model("gpt-4.1")

#### Create Agents

In [106]:
from langchain.agents import AgentState, create_agent

agent = create_agent(model, tools=[], middleware = [prompt_with_context]) 
noRAG_agent = create_agent(model, tools=[], middleware = []) 

### Compare Model + Rag with Model with no Rag


In [108]:
query = "what plant would be reccomended to grow in my region (Indo-Gangetic Plain) where my conditions for soil are usually bad nutrients and not alot of water?"
for step in agent.stream(
    {"messages": [{"role": "user", "content": query}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


what plant would be reccomended to grow in my region (Indo-Gangetic Plain) where my conditions for soil are usually bad nutrients and not alot of water?

For the Indo-Gangetic Plain with **poor (low-nutrient) soil** and **limited water availability**, **Neem (Azadirachta indica)** is highly recommended.

### Why Neem?
- **Climate Adaptation:** Neem thrives in tropical to subtropical climates, which matches the Indo-Gangetic plain.
- **Drought Tolerance:** It is *highly drought-resistant* and does well with *low water availability*.
- **Soil Requirements:** Prefers *well-drained sandy to loamy soils* but *tolerates poor and dry soils* exceptionally well.
- **Sunlight:** Requires *full sunlight*, which is abundant in most parts of the Indo-Gangetic plain.
- **Growth & Benefits:** Grows *moderately fast*, can provide *shade and wind protection*, and helps *improve soil fertility* over time.
- **Ecological Role:** Supports biodiversity and naturally repels pests, making it great both for 

In [109]:
for step in noRAG_agent.stream(
    {"messages": [{"role": "user", "content": query}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


what plant would be reccomended to grow in my region (Indo-Gangetic Plain) where my conditions for soil are usually bad nutrients and not alot of water?

The Indo-Gangetic Plain is an agriculturally important region, but if you're facing *poor soil nutrients* and *limited water*, you will want drought-tolerant and low-maintenance plants that can adapt to marginal soils.

### **Recommended Plants for Your Conditions:**

#### **1. Pearl Millet (Bajra; Pennisetum glaucum)**
- **Why:** Very drought-tolerant, grows well in poor, sandy soils.
- **Usage:** Main food crop, fodder, good for improving soil organic matter.

#### **2. Sorghum (Jowar; Sorghum bicolor)**
- **Why:** Handles drought very well, requires little fertilizer.
- **Usage:** Good for grain, fodder, and soil conservation.

#### **3. Cluster Bean (Guar; Cyamopsis tetragonoloba)**
- **Why:** Leguminous, improves soil fertility, very tolerant to drought.
- **Usage:** Vegetable, source of guar gum, green manure.

#### **4. Moth B