In [1]:
from pathlib import Path
from langchain.schema import Document
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from supabase.client import Client, create_client
from dotenv import load_dotenv
import os

load_dotenv()

True

In [2]:
class Config:
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") 
    SUPABASE_URL = os.getenv("SUPABASE_URL")
    SUPABASE_ANON_KEY = os.getenv("SUPABASE_ANON_KEY")
    PRODUCT_COLLECTION_NAME = "lensfolia_collection"
    EMBEDDING_MODEL = "models/gemini-embedding-001"

In [3]:
supabase: Client = create_client(Config.SUPABASE_URL, Config.SUPABASE_ANON_KEY)

In [4]:
embeddings = GoogleGenerativeAIEmbeddings(
    api_key=Config.GOOGLE_API_KEY,
    model=Config.EMBEDDING_MODEL
)

In [5]:
all_docs = []

In [None]:
faq_path = Path("knowledge_base/faq/faq.md")

if faq_path.exists():
    loader = TextLoader(str(faq_path))
    faq_doc = loader.load()[0]
    
    # Split by question (h1)
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "question")]
    )
    
    faq_chunks = splitter.split_text(faq_doc.page_content)
    
    for chunk in faq_chunks:
        if not chunk.page_content.strip():
            continue
            
        question = chunk.metadata.get("question", "Unknown")
        
        combined_content = f"Q: {question}\n\nA: {chunk.page_content}"

        all_docs.append(
            Document(
                page_content=combined_content,  # Question + Answer
                metadata={
                    "doc_type": "faq",
                    "question": question,
                }
            )
        )


In [29]:
all_docs

[Document(metadata={'doc_type': 'faq', 'question': 'Apa itu LensFolia?'}, page_content='Q: Apa itu LensFolia?\n\nA: LensFolia adalah aplikasi diagnosis penyakit tanaman berbasis AI (Artificial Intelligence). Aplikasi ini membantu pengguna mengidentifikasi penyakit tanaman melalui gambar dan memberikan informasi lengkap, tips perawatan, serta rekomendasi produk.'),
 Document(metadata={'doc_type': 'faq', 'question': 'Fitur Utama LensFolia?'}, page_content='Q: Fitur Utama LensFolia?\n\nA: LensFolia memiliki tiga fitur utama:  \n**Dr. Lensi**  \nDr. Lensi adalah agen AI cerdas yang dapat:\n- Mendeteksi objek dan penyakit pada tanaman dari gambar.\n- Menggunakan pencarian web dan dokumen untuk memberikan informasi penyakit yang lebih luas.\n- Menyediakan ringkasan penyakit, tips perawatan, dan rekomendasi produk yang sesuai.  \n**Lenskipledia**  \nLenskipledia adalah ensiklopedia digital berisi informasi lengkap tentang:\n- Penyakit tanaman\n- Cara perawatan\n- Produk yang bisa digunakan un

In [9]:
docs_path = Path("knowledge_base/products/")
existing_files = {f.name for f in docs_path.glob("*.md")}

if not existing_files:
    raise FileNotFoundError(f"Missing required documents: {existing_files}")

In [10]:
for file_path in docs_path.glob("*.md"):
    group = file_path.stem  
    loader = TextLoader(str(file_path))
    doc = loader.load()[0]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "product_name")])
    product_docs = splitter.split_text(doc.page_content)
    
    for pd in product_docs:
        all_docs.append(
            Document(
                page_content=pd.page_content,
                metadata={
                    "doc_type": "product",
                    "product_name": pd.metadata["product_name"],
                    "group": group
                }
            )
        )

In [6]:
insectidisa_path = Path("knowledge_base/products/insektisida.md")

if insectidisa_path.exists():
    group = "insektisida"
    loader = TextLoader(str(insectidisa_path))
    doc = loader.load()[0]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "product_name")])
    product_docs = splitter.split_text(doc.page_content)
    
    for pd in product_docs:
        all_docs.append(
            Document(
                page_content=pd.page_content,
                metadata={
                    "doc_type": "product",
                    "product_name": pd.metadata["product_name"],
                    "group": group
                }
            )
        )


In [12]:
plants_path = Path("knowledge_base/plants/plants.md")
if plants_path.exists():
    loader = TextLoader(str(plants_path))
    plants_doc = loader.load()[0]
   
    # Split by plant (h1) and sections (h2, h3, h4)
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "plant_name"),
            ("##", "section"),
            ("###", "subsection"),
            ("####", "disease_name")
        ]
    )
    plant_chunks = splitter.split_text(plants_doc.page_content)
   
    for chunk in plant_chunks:
        # Skip empty content chunks
        if not chunk.page_content.strip():
            continue
           
        # Extract metadata
        plant_name = chunk.metadata.get("plant_name", "Unknown").lower()
        section = chunk.metadata.get("section", "General")
        subsection = chunk.metadata.get("subsection", "")
        disease_name = chunk.metadata.get("disease_name", "")
       
        # Create meaningful section identifier
        section_id = section
        if subsection:
            section_id = f"{section} - {subsection}"
        if disease_name:
            section_id = f"{section} - {disease_name}"
       
        all_docs.append(
            Document(
                page_content=chunk.page_content,
                metadata={
                    "doc_type": "plant_info",
                    "plant_name": plant_name,
                    "section": section_id,
                    "content_type": section.lower().replace(" ", "_")
                }
            )
        )

In [13]:
print(f"Total documents loaded: {len(all_docs)}")
print("\nDocument types:")
products = [d for d in all_docs if d.metadata["doc_type"] == "product"]
plants = [d for d in all_docs if d.metadata["doc_type"] == "plant_info"]
print(f"Products: {len(products)}")
print(f"Plant info: {len(plants)}")


Total documents loaded: 1987

Document types:
Products: 56
Plant info: 1931


In [7]:
all_docs

[Document(metadata={'doc_type': 'product', 'product_name': 'ANWAVIN 500 EC', 'group': 'insektisida'}, page_content='![Image of product: ANWAVIN 500 EC](https://kresna.co.id/sarikresnakimia/wp-content/uploads/2014/11/Anwavin-500-EC-1000x10001-360x360.png)  \n**Bahan Aktif:** Profenofos 500 g/l  \n**Deskripsi:** Insektisida racun kontak dan perut berbentuk pekatan yang dapat diemulsikan berwarna kuning kecoklatan untuk mengendalikan hama kutu daun ( Myzus Percisae ) dan thrips ( Thrips Parvispinus ) pada tanaman cabai.  \n**Pengaplikasian:** Penyemprotan Volume tinggi dilakukan segera setelah terlihat adanya serangan dan diulang 5 – 10 hari kemudian  \n**Petunjuk Penggunaan:**  \n| Tanaman | Gulma Sasaran | Dosis Anjuran |\n| --- | --- | --- |\n| Cabai | Ulat Grayak (Spodoptera litura) | 1-2 ml/l |\n| Cabai | Ulat Grayak (Spodoptera litura) | 1-2 ml/l |\n| Kubis | Ulat Daun (Plutella xylostella) | 1-2 ml/l |\n| Kubis | Ulat krop (Crocidoloma pavonana) | 1-2 ml/l |  \n---'),
 Document(met

In [None]:
vector_store = SupabaseVectorStore.from_documents(
    all_docs,
    embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents",
)

In [11]:
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    table_name="documents",
    query_name="match_documents",
)

In [None]:
from tqdm.auto import tqdm

table_name = "documents"

# Delete all rows from the table if it exists
try:
    result = supabase.table(table_name).delete().gte('id', '').execute()
    print(f"Cleared existing data from {table_name}")
except Exception as e:
    print(f"Table {table_name} might not exist yet: {e}")

# Create Supabase vector store
batch_size = 10  # Start small, increase if it works
vector_store = None

batches = list(range(0, len(all_docs), batch_size))
for i in tqdm(batches, desc="Processing document batches"):
    batch = all_docs[i:i+batch_size]
    batch_num = i//batch_size + 1
    doc_range = f"{i+1} to {min(i+batch_size, len(all_docs))}"
    
    try:
        if vector_store is None:
            # Create the vector store with first batch
            vector_store = SupabaseVectorStore.from_documents(
                batch,
                embeddings,
                client=supabase,
                table_name="documents",
                query_name="match_documents",
            )
        else:
            # Add subsequent batches
            vector_store.add_documents(batch)
        
    except Exception as e:
        tqdm.write(f"✗ Batch {batch_num} error: {e}")
        continue

print("✓ All documents processed!")


Table documents might not exist yet: {'message': 'invalid input syntax for type uuid: ""', 'code': '22P02', 'hint': None, 'details': None}
Processing batch 1: documents 1 to 10
✓ Batch completed successfully
Processing batch 2: documents 11 to 20
✓ Batch completed successfully
Processing batch 3: documents 21 to 30
✓ Batch completed successfully
Processing batch 4: documents 31 to 40
✓ Batch completed successfully
Processing batch 5: documents 41 to 50
✓ Batch completed successfully
Processing batch 6: documents 51 to 60
✓ Batch completed successfully
Processing batch 7: documents 61 to 70
✓ Batch completed successfully
Processing batch 8: documents 71 to 80
✓ Batch completed successfully
Processing batch 9: documents 81 to 90
✓ Batch completed successfully
Processing batch 10: documents 91 to 100
✓ Batch completed successfully
Processing batch 11: documents 101 to 110
✓ Batch completed successfully
Processing batch 12: documents 111 to 120
✓ Batch completed successfully
Processing bat

In [12]:
test_queries = [
    "apple leaf spot disease symptoms",  
]

# %%
for query in test_queries:
    print(f"\n{'='*60}")
    print(f"Query: {query}")
    print(f"{'='*60}")
   
    # Simple similarity search (top 3 results)
    results = vector_store.similarity_search(query, k=10)
   
    for i, doc in enumerate(results, 1):
        doc_type = doc.metadata['doc_type']
        print(f"\nResult {i} ({doc_type}):")
       
        if doc_type == "product":
            print(f"Product: {doc.metadata['product_name']}")
            print(f"Group: {doc.metadata['group']}")
        else:
            print(f"Plant: {doc.metadata['plant_name']}")
            print(f"Section: {doc.metadata['section']}")
       
        print(f"Content: {doc.page_content[:300]}...")


Query: apple leaf spot disease symptoms

Result 1 (plant_info):
Plant: cherry (including sour)
Section: Diseases - Cherry leaf spot Coccomyces hiemalis
Content: - Symptoms: Small, red-purple spots on upper surfaces of leaves which turn brown and may coalesce; leaves may become chlorotic if there are a few lesions present; if tree becomes severely defoliated fruit may fail to develop properly and remain light in color and watery in texture  
- Cause: Fungus ...

Result 2 (plant_info):
Plant: cashew nuts
Section: Diseases - Anthracnose Colletotrichum gloeosporoides
Content: - Symptoms: Water-soaked lesions on leaves, twigs, flowers or young apples which develop into orange-brown or red lesions;  
- Cause: Fungus  
- Comments: Disease emergence favored by rainfall and high humidity  
- Management: Management A protective coating of copper-based fungicide on susceptible ...

Result 3 (plant_info):
Plant: quince
Section: Diseases - Leaf blight (Leaf fleck) Diplocarpon maculatum
Content: - 

In [13]:
def search_plant_info(query, k=3):
    """Search plant information only using metadata filter"""
    filter_dict = {"doc_type": "plant_info"}
    return vector_store.similarity_search(query, k=k, filter=filter_dict)

def search_products(query, k=3):
    """Search products only using metadata filter"""
    filter_dict = {"doc_type": "product"}
    results = vector_store.similarity_search(query, k=k, filter=filter_dict)
    return [
        {"content": doc.page_content, "metadata": doc.metadata}
        for doc in results
    ]

def search_disease_info(query, plant_name=None, k=3):
    """Search disease-specific info using metadata filter"""
    filter_dict = {"doc_type": "plant_info"}
    
    if plant_name:
        filter_dict["plant_name"] = plant_name
    
    return vector_store.similarity_search(query, k=k, filter=filter_dict)

def search_products_by_group(query, group_name, k=3):
    """Search products by specific group using metadata filter"""
    filter_dict = {
        "doc_type": "product",
        "group": group_name
    }
    return vector_store.similarity_search(query, k=k, filter=filter_dict)

def search_faq(query, k=3):
    """Search FAQ documents only using metadata filter"""
    filter_dict = {"doc_type": "faq"}
    return vector_store.similarity_search(query, k=k, filter=filter_dict)

In [None]:
# Test FAQ search
print(f"\n{'='*60}")
print("FAQ Search Test")
print(f"{'='*60}")

faq_test_queries = [
    "bagaimana cara menggunakan dr lensi",
    "apa itu lensfolia",
    "fitur utama lensfolia"
]

for query in faq_test_queries:
    print(f"\nFAQ Query: {query}")
    faq_results = search_faq(query, k=2)
    
    for i, doc in enumerate(faq_results, 1):
        print(f"\nFAQ Result {i}:")
        print(f"{doc.page_content[:300]}...")


FAQ Search Test

FAQ Query: bagaimana cara menggunakan dr lensi

FAQ Result 1:
Q: Bagaimana cara menggunakan Dr. Lensi?

A: 1. Ambil atau unggah gambar tanaman yang sakit.
2. Dr. Lensi akan menganalisis gambar dan memberikan hasil diagnosis.
3. Lihat ringkasan penyakit, tips perawatan, dan rekomendasi produk....

FAQ Result 2:
Q: Apakah Dr. Lensi bisa mendeteksi semua jenis tanaman?

A: Model klasifikasi gambar saat ini hanya mencakup beberapa jenis tanaman. Namun, Dr. Lensi tetap dapat memberikan informasi penyakit lainnya melalui pencarian dokumen dan web....

FAQ Query: apa itu lensfolia

FAQ Result 1:
Q: Apa itu LensFolia?

A: LensFolia adalah aplikasi diagnosis penyakit tanaman berbasis AI (Artificial Intelligence). Aplikasi ini membantu pengguna mengidentifikasi penyakit tanaman melalui gambar dan memberikan informasi lengkap, tips perawatan, serta rekomendasi produk....

FAQ Result 2:
Q: Apakah LensFolia gratis?

A: Sebagian besar fitur tersedia secara gratis. Namun, beberapa f

In [15]:
# Test search
print(f"\n{'='*60}")
print("Test Search Test")
print(f"{'='*60}")

test_queries = [
    "Insektisida untuk tanaman tomat",
]

for query in test_queries:
    print(f"\nQuery: {query}")
    product_results = search_products(query, k=2)
    
    for i, doc in enumerate(product_results, 1):
        print(f"\nResult {i}:")
        print(f"Product Name: {doc['metadata'].get('product_name', 'N/A')}")
        print(f"Group: {doc['metadata'].get('group', 'N/A')}")
        print(f"Content: {doc['content'][:300]}...")


Test Search Test

Query: Insektisida untuk tanaman tomat

Result 1:
Product Name: BENTO 50 EC
Group: insektisida
Content: ![Image of product: BENTO 50 EC](https://kresna.co.id/sarikresnakimia/wp-content/uploads/2014/11/Bento-50-EC-1000x1000-360x360.png)  
**Bahan Aktif:** Sipermetrin 50 g/l  
**Deskripsi:** Insektisida racun kontak dan lambung, berbentuk pekatan berwarna kuning yang dapat membentuk emulsi dalam air unt...

Result 2:
Product Name: MATARIN 50 EC
Group: insektisida
Content: ![Image of product: MATARIN 50 EC](https://kresna.co.id/sarikresnakimia/wp-content/uploads/2014/11/Matarin-50-EC-1000x1000-360x360.png)  
**Bahan Aktif:** Lamda sihalotrin 50 g/l  
**Deskripsi:** Insektisida racun kontak dan lambung berbentuk pekatan yang dapat di emulsikan dalam air, berwarna kunin...


In [11]:
# Agent 1 output simulation: "apple leaf spot detected"
detected_plant = "apple"
detected_disease = "leaf spot"

# Agent 2: Get plant disease info
print(f"\nAgent 2 - Plant disease info for {detected_plant} {detected_disease}:")
plant_results = search_disease_info(f"{detected_plant} {detected_disease}", detected_plant)

for doc in plant_results[:1]:
    print(f"Plant: {doc.metadata['plant_name']}")
    print(f"Content: {doc.page_content[:400]}...")

# Agent 3: Get product recommendations
print(f"\nAgent 3 - Product recommendations for {detected_disease}:")
product_results = search_products(f"fungicide {detected_disease}")

for doc in product_results[:1]:
    print(f"Product: {doc.metadata['product_name']}")
    print(f"Group: {doc.metadata['group']}")
    print(f"Content: {doc.page_content[:400]}...")


Agent 2 - Plant disease info for apple leaf spot:


KeyboardInterrupt: 