In [8]:
pip install langchain langchain-openai langchain-community pymongo python-dotenv sentence-transformers faiss-cpu rank_bm25 tavily-python

Collecting langchain
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.30-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-win_amd64.whl.metadata (5.2 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting tavily-python
  Downloading tavily_python-0.7.10-py3-none-any.whl.metadata (7.5 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.74-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain)
  Downloading langchain_text_splitters-0.3.9-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Downloading langsmith-0.4.14-py3-none-any.whl.metadata (14 kB)
Collecting SQLAlchemy<3,>=1.4 (from langc

In [1]:
import requests
import pandas as pd
import os
import json
from pymongo import MongoClient
from dotenv import load_dotenv

In [7]:
import requests
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables (make sure you have a .env file)
load_dotenv()

# 1. Define the API endpoint and parameters
# data.gov.in resource endpoint that is queried below.
base_url = "https://api.data.gov.in/resource/4554a3c8-74e3-4f93-8727-8fd92161e345"

# The key comes from the environment so it never appears in the notebook source.
api_key = os.getenv("SOIL_MOISTURE_API_KEY")

# Check if the API key was loaded correctly
if not api_key:
    print("❌ Error: SOIL_MOISTURE_API_KEY not found in environment variables.")
    print("Please make sure you have a .env file with the key.")
else:
    params = {
        "api-key": api_key,
        "format": "json",
        "filters[State]": "Uttar Pradesh",
        "filters[District]": "Kanpur Nagar",
        "filters[Year]": "2025",
        "filters[Month]": "06",
        "limit": "100"
    }

    def _redact(message):
        """Return `message` as text with the API key masked.

        requests error messages can embed the full request URL, and the key is
        part of the query string, so printing them verbatim would store the
        secret in the notebook output.
        """
        return str(message).replace(api_key, "***")

    # 2. Make the request to the API
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()  # Raises HTTPError for 4xx / 5xx responses

        # 3. Process and display the data
        data = response.json()

        if data.get('records'):
            df = pd.DataFrame(data['records'])
            print("💧 Daily Soil Moisture Data for Kanpur Nagar, June 2025")
            print(df[['Date', 'Avg_smlvl_at15cm']])
        else:
            print("No records found for Kanpur Nagar in June 2025.")
            print("Please check if the district name is spelled correctly (e.g., 'Kanpur').")

    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error: {_redact(err)}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {_redact(e)}")

💧 Daily Soil Moisture Data for Kanpur Nagar, June 2025
          Date  Avg_smlvl_at15cm
0   2025-06-08          9.394210
1   2025-06-14          8.799874
2   2025-06-21         19.762478
3   2025-06-29         20.766949
4   2025-06-05          9.822603
5   2025-06-06          9.672238
6   2025-06-13          8.884343
7   2025-06-18          9.538058
8   2025-06-20         21.901679
9   2025-06-24         20.022704
10  2025-06-26         18.265095
11  2025-06-07          9.531424
12  2025-06-11          9.057092
13  2025-06-19          9.421376
14  2025-06-27         17.463964
15  2025-06-28         16.808421
16  2025-06-30         21.395722
17  2025-06-03         10.147682
18  2025-06-04          9.975444
19  2025-06-09          9.271029
20  2025-06-15          9.852208
21  2025-06-22         20.299899
22  2025-06-23         20.329713
23  2025-06-01         10.448284
24  2025-06-02         10.311090
25  2025-06-10          9.159857
26  2025-06-12          8.964108
27  2025-06-16       

In [None]:
import os
import json
import logging
from pymongo import MongoClient
from dotenv import load_dotenv

# LangChain components
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from pydantic import BaseModel, Field

# --- 1. CONFIGURATION & SETUP ---
load_dotenv()

# Configure logging: the log file is written as UTF-8; the stream handler uses
# whatever encoding the console stream provides.
file_handler = logging.FileHandler("rag_log.txt", encoding="utf-8")
stream_handler = logging.StreamHandler()
# force=True: basicConfig() does nothing when the root logger already has handlers,
# which is the case whenever this cell is re-run in the same kernel. Forcing the
# reconfiguration removes and closes the stale handlers so these two take effect.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(message)s',
                    handlers=[file_handler, stream_handler],
                    force=True)

# MongoDB Connection
# Connection string is read from the environment; it is None when MONGODB_URI is unset.
MONGO_CONNECTION_STRING = os.getenv("MONGODB_URI")
DB_NAME = "soil_and_health"
CLAUSE_COLLECTION_NAME = "clauses"

# Models
OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small'
LLM_MODEL = "gpt-4o"

# --- 2. DATA LOADING AND INDEXING ---
def load_data_from_mongo(collection):
    """Fetch every document of a MongoDB collection as a list.

    Args:
        collection: Collection-like object exposing ``name`` and ``find``.

    Returns:
        A list with one entry per document, projected down to the
        ``clause_text`` and ``metadata`` fields (``_id`` is excluded).
    """
    logging.info(f"Loading documents from '{collection.name}' collection...")
    match_all = {}
    wanted_fields = {"clause_text": 1, "metadata": 1, "_id": 0}
    cursor = collection.find(match_all, wanted_fields)
    return [record for record in cursor]

def setup_retrievers(documents):
    """Creates a hybrid retriever (Dense + Sparse) from the documents.

    Args:
        documents: Iterable of dicts. The text is read from ``clause_text``
            (falling back to ``page_content``, then to an empty string) and the
            metadata from ``metadata`` (an empty dict when missing or None).

    Returns:
        An EnsembleRetriever that combines a BM25 retriever and a FAISS-backed
        retriever with equal weights, each configured with k=5.
    """
    logging.info("Setting up retrievers...")

    # Build the Document objects without touching the input dicts: renaming
    # 'clause_text' in place would make a second call on the same list (e.g. a
    # re-run of the notebook cell) index empty text.
    langchain_documents = [
        Document(
            page_content=d.get('clause_text', d.get('page_content', '')),
            metadata=d.get('metadata') or {},
        )
        for d in documents
    ]

    # Sparse (keyword) retriever.
    bm25_retriever = BM25Retriever.from_documents(langchain_documents)
    bm25_retriever.k = 5

    # Dense (embedding) retriever backed by an in-memory FAISS index.
    embedding_function = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL)
    vectorstore = FAISS.from_documents(langchain_documents, embedding_function)
    faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, faiss_retriever],
        weights=[0.5, 0.5]
    )
    logging.info("✅ Hybrid retriever is ready.")
    return ensemble_retriever

# --- 3. RAG CHAIN COMPONENTS ---

class RefinedQuery(BaseModel):
    # Shape of the query refiner's output: one rewritten query per retrieval backend.
    # Documented with comments rather than a class docstring so the schema generated
    # from this model is left exactly as it was.

    # Query text for the database retriever.
    db_query: str = Field(
        description="A detailed, rewritten query for a vector database search.",
    )
    # Query text for the web search tool; may be an empty string.
    web_query: str = Field(
        description="A concise query for a web search engine, or an empty string if not needed.",
    )

# Prompt for the refinement step. {format_instructions} and {query} are template
# variables that must be supplied before the prompt is rendered.
query_refiner_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert at rewriting user queries for a vector database and a web search engine. "
             "Based on the user query, generate a database query and a web search query to retrieve relevant information.\n"
             "{format_instructions}"),
    ("user", "User Query: {query}")
])

# The result limit of TavilySearchResults is named `max_results`; `k` is not one of
# the tool's fields, so the previous `k=3` did not cap the results at three.
web_search_tool = TavilySearchResults(max_results=3)

# Prompt for the answering step. {context} carries the retrieved text and {query}
# the user's question.
final_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert agricultural assistant. Answer the user's question based *only* on the provided context from our database and web search results.\n"
             "If the context does not contain the answer, state that clearly. Be concise and helpful.\n\n"
             "--- CONTEXT --- \n{context}"),
    ("user", "Question: {query}")
])

# --- 4. MAIN EXECUTION ---
if __name__ == "__main__":
    # Read all clauses into memory. load_data_from_mongo returns a plain list, so the
    # client is closed as soon as the read finishes instead of staying open.
    try:
        with MongoClient(MONGO_CONNECTION_STRING) as client:
            db = client[DB_NAME]
            clauses_collection = db[CLAUSE_COLLECTION_NAME]
            documents = load_data_from_mongo(clauses_collection)
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB or load data: {e}")
        # exit() is a helper meant for the interactive shell; SystemExit stops the
        # run explicitly and reports a failure status.
        raise SystemExit(1)

    if not documents:
        logging.error("No documents loaded from the database. Halting.")
        raise SystemExit(1)

    hybrid_retriever = setup_retrievers(documents)
    llm = ChatOpenAI(model=LLM_MODEL, temperature=0)

    # --- Build the Full RAG Chain ---

    # 1. Query refiner: prompt -> LLM -> parsed JSON with 'db_query' / 'web_query'.
    query_parser = JsonOutputParser(pydantic_object=RefinedQuery)
    refiner_prompt_with_instructions = query_refiner_prompt.partial(
        format_instructions=query_parser.get_format_instructions()
    )
    refiner_chain = refiner_prompt_with_instructions | llm | query_parser

    # 2. Retrieval step
    def retrieve_context(state: dict):
        """Combines DB retrieval and web search based on the refined queries.

        Args:
            state: Chain state holding the original 'query' and the refiner's
                'refined_queries' output.

        Returns:
            A single string with a database section and a web-search section.
        """
        refined = state.get('refined_queries')
        if not isinstance(refined, dict):
            refined = {}
        # The refiner's JSON comes from the LLM, so a key may be missing or empty;
        # fall back to the user's original question for the database search.
        db_query = refined.get('db_query') or state['query']
        web_query = refined.get('web_query') or ""

        db_context_docs = hybrid_retriever.invoke(db_query)
        db_context = "\n\n".join([doc.page_content for doc in db_context_docs])

        web_context = ""
        if web_query:
            logging.info(f"Performing web search for: '{web_query}'")
            web_results = web_search_tool.invoke(web_query)
            if isinstance(web_results, list):
                web_context = "\n\n".join(
                    item['content']
                    for item in web_results
                    if isinstance(item, dict) and item.get('content')
                )
            elif web_results:
                # Anything other than a list of result dicts (e.g. an error message
                # returned as text) cannot be indexed by 'content'.
                logging.warning(f"Web search returned an unexpected payload: {web_results}")

        return f"--- From Database ---\n{db_context}\n\n--- From Web Search ---\n{web_context}"

    def log_context(state: dict):
        """Logs the query and assembled context, then passes the state through unchanged."""
        logging.info(f"--- LOGGING CONTEXT ---\nQuery: {state['query']}\nContext: {state['context']}\n--- END LOG ---")
        return state

    # 3. Assemble the full chain: refine -> retrieve -> log -> answer.
    full_rag_chain = (
        RunnablePassthrough.assign(refined_queries=refiner_chain)
        | RunnablePassthrough.assign(context=retrieve_context)
        | RunnableLambda(log_context)
        | final_prompt
        | llm
        | StrOutputParser()
    )

    # --- 5. RUN AN EXAMPLE ---
    logging.info("\n" + "="*50 + "\n🚀 RAG System Ready. Ask a question.\n" + "="*50)

    user_question = "What is Azotobacter used in? for what crops and where are those crops grown?"

    # The chain takes a dictionary with the question under 'query'.
    final_answer = full_rag_chain.invoke({"query": user_question})

    print("\n" + "="*50 + "\n✅ FINAL ANSWER:\n" + "="*50)
    print(final_answer)

2025-08-17 00:09:33,825 - Loading documents from 'clauses' collection...
2025-08-17 00:09:35,327 - Setting up retrievers...
2025-08-17 00:09:41,641 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-08-17 00:09:44,910 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-08-17 00:09:46,362 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\adwik\AppData\Local\Programs\Python\Python311\Lib\logging\__init__.py", line 1113, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\adwik\AppData\Local\Programs\Python\Python311\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 26: character maps to <undefined>
Call stack:



✅ FINAL ANSWER:
Azotobacter is used as a biofertilizer due to its ability to fix atmospheric nitrogen, produce growth hormones, and solubilize phosphate, which improves plant growth and soil fertility. It is commonly found in the rhizosphere of various non-legume crops, including cotton, wheat, rice, and vegetables. These crops are grown in a variety of regions, as Azotobacter strains have been isolated from neutral to alkaline soils worldwide. Specifically, it has been reported in the rhizosphere of crops such as maize, sugarcane, rice, wheat, bajra, millets, plantation crops, and vegetables.


: 