In [41]:
# Base Imports
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

# RAG Imports
import chromadb
import langchain
import langchainhub

In [42]:
# Embedding Function Import
import chromadb.utils.embedding_functions as embedding_functions
load_dotenv()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key_env_var="OPEN_AI_API_KEY",
                model_name="text-embedding-3-small"
            )
df_corpus = pd.read_csv("Data520Corpus.csv")
chroma_client = chromadb.PersistentClient("./localCorpus")

In [43]:
df_corpus = df_corpus.fillna("NaN")
df_corpus.to_csv("CleanCorpus.csv")

In [58]:
# Chroma Collection
collection1 = chroma_client.get_or_create_collection(name = "plantCorpus", embedding_function=openai_ef)

In [60]:
# CSV Querying (We will query each row in our Corpus as a seperate document)
from langchain_community.document_loaders import CSVLoader
loader = CSVLoader(file_path="CleanCorpus.csv",
                   encoding="utf-8")
documents = loader.load()

documents_to_add = documents[1:]  # Skip row 0 (the headers in original csv)

# Prepare data
all_contents = [doc.page_content for doc in documents_to_add]
all_metadatas = [doc.metadata for doc in documents_to_add]
all_ids = [f"plant_{i}" for i in range(len(documents_to_add))]

# Add to ChromaDB
collection1.add(
    documents=all_contents,
    metadatas=all_metadatas,
    ids=all_ids
)

In [61]:
collection1.peek()

{'ids': ['plant_0',
  'plant_1',
  'plant_2',
  'plant_3',
  'plant_4',
  'plant_5',
  'plant_6',
  'plant_7',
  'plant_8',
  'plant_9'],
 'embeddings': array([[ 0.03072607,  0.00280956,  0.06021716, ...,  0.00032399,
         -0.00184628,  0.01327593],
        [-0.000316  ,  0.01039302,  0.05567913, ...,  0.00421339,
          0.02272108,  0.01747775],
        [ 0.00710597, -0.0019561 ,  0.02388378, ...,  0.01023936,
          0.04368631,  0.01343916],
        ...,
        [-0.01771339, -0.01512244,  0.06028454, ..., -0.03773556,
          0.00674674,  0.04063435],
        [ 0.03344809, -0.0093409 ,  0.02935345, ...,  0.00417461,
          0.00749191,  0.02259729],
        [ 0.01522771,  0.03029722,  0.06291485, ..., -0.00327956,
          0.02504992,  0.02040909]], shape=(10, 1536)),
 'documents': [': 1\nPlant ID: 2\nCommon Name: Golden Leatherfern / Mangrove Fern\nScientific Name: Acrostichum aureum\nLocal Name (If Applicable): NaN\nRegion: NaN\nClimate Requirements: Prefers warm tr

In [38]:
collection1.count()

15

In [20]:
# Testing collection querying

collection1.query(
    query_texts=["what needs does tiny perwinkle have if i grow it?"],
    n_results= 2
)

{'ids': [['plant_4', 'plant_10']],
 'embeddings': None,
 'documents': [['Plant ID: 4\nCommon Name: 5\nScientific Name: Tiny Periwinkle\nLocal Name: Catharanthus roseus\nRegion: NaN\nClimate Requirements: NaN\nSoil Type: Warm tropical/subtropical; tolerant and hardy with low nutrient/water demands; occurs from sea level–900 m; found in open woods, shrublands, grasslands, disturbed sites, roadsides, beaches, and limestone rocks\nSun Light Needs: Well-drained soils; grows on varied substrates including limestone\nWater Needs: Full sun to partial shade\nGrowth Rate: Average overall; water moderately in growing season; sparingly in winter\nEcological Role: up to ~3 ft\nTraditional Uses: Insect-pollinated; adaptable pioneer in disturbed areas; supports urban/roadside greening\nNone: Medicinal use in Ayurveda/Folk/Modern contexts; used for organ-specific disorders incl. cancer, diabetes, hypertension; also cultivated as an ornamental',
   'Plant ID: 10\nCommon Name: 11\nScientific Name: Small

In [62]:
test_query = "Plants in india?"

results = collection1.query(
    query_texts=[test_query],
    n_results=3
)

In [63]:
retrieved_documents = results['documents'][0]

print("Query Results")
for doc in retrieved_documents:
    print(doc)
    print("\n") # Add a separator

Query Results
: 6
Plant ID: 7
Common Name: Hiptage, Helicopter flower
Scientific Name: Hiptage benghalensis
Local Name (If Applicable): NaN
Region: NaN
Climate Requirements: Grows in damp places; needs presence of other trees in vicinity
Soil Type: NaN
Sun Light Needs: NaN
Water Needs: Average
Growth Rate: 6–10 ft (vine)
Ecological Role: Forms woody creepers; spreads rapidly forming thickets and smothering vegetation; considered a weed in many regions
Traditional Uses: Medicinal: used for rheumatism, scabies, asthma, skin complaints & ulcers, inflammation, cough; systems: Ayurveda, Folk Medicine; parts used: root, bark, flower.


: 9
Plant ID: 10
Common Name: Mahua
Scientific Name: Madhuca longifolia
Local Name (If Applicable): NaN
Region: NaN
Climate Requirements: Tropical
Soil Type: NaN
Sun Light Needs: NaN
Water Needs: NaN
Growth Rate: Fast-growing; ~20 m tall (more than 10 ft)
Ecological Role: Spreading root system used to prevent soil erosion
Traditional Uses: Human consumption (e

In [67]:
# Retrieval Step (We Will intialize a vector_store from LangChain using our Collection from Chroma)
from langchain_chroma import Chroma

vetor_store = Chroma(
    collection_name="collection1",
    embedding_function=openai_ef,
    client=chroma_client
)