In [16]:
# Core imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr

# LangChain and ML imports
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader, UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

# Visualization imports (optional)
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go


In [17]:
# Model configuration - using cost-effective model
MODEL = "gpt-4o-mini"
db_name = "vector_db"

# Load environment variables
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

# Verify API key is loaded
if not os.getenv('OPENAI_API_KEY'):
    print("⚠️ Warning: OpenAI API key not found!")
else:
    print("✅ OpenAI API key loaded successfully")


✅ OpenAI API key loaded successfully


In [18]:
# Document loading functions for different platforms
def load_txt_folder(path, site_tag):
    """Load text files from a directory"""
    try:
        loader = DirectoryLoader(
            path=path,
            glob="**/*.txt",
            loader_cls=lambda p: TextLoader(p, encoding="utf-8")
        )
        docs = loader.load()
        for d in docs:
            d.metadata["source_site"] = site_tag
        print(f"✅ Loaded {len(docs)} TXT files from {site_tag}")
        return docs
    except Exception as e:
        print(f"❌ Error loading TXT from {path}: {str(e)}")
        return []

def load_html_folder(path, site_tag):
    """Load HTML files from a directory"""
    try:
        loader = DirectoryLoader(
            path=path,
            glob="**/*.html",
            loader_cls=UnstructuredHTMLLoader
        )
        docs = loader.load()
        for d in docs:
            d.metadata["source_site"] = site_tag
        print(f"✅ Loaded {len(docs)} HTML files from {site_tag}")
        return docs
    except Exception as e:
        print(f"❌ Error loading HTML from {path}: {str(e)}")
        return []

def load_pdf_folder(path, site_tag):
    """Load PDF files from a directory"""
    try:
        loader = DirectoryLoader(
            path=path,
            glob="**/*.pdf",
            loader_cls=PyPDFLoader
        )
        docs = loader.load()
        for d in docs:
            d.metadata["source_site"] = site_tag
        print(f"✅ Loaded {len(docs)} PDF files from {site_tag}")
        return docs
    except Exception as e:
        print(f"❌ Error loading PDF from {path}: {str(e)}")
        return []


In [19]:
# Root folders – create directories if they don't exist
ROOT_MOSDAC_PDF = "data_ingestion/mosdac_data/documents"
ROOT_MOSDAC_WEB = "data_ingestion/mosdac_data/web_pages"
ROOT_VEDAS_WEB = "data_ingestion/vedas_data/web_pages"
ROOT_BHUVAN_WEB = "data_ingestion/bhuvan_data/web_pages"

# Create directories if they don't exist
directories = [ROOT_MOSDAC_PDF, ROOT_MOSDAC_WEB, ROOT_VEDAS_WEB, ROOT_BHUVAN_WEB]
for directory in directories:
    os.makedirs(directory, exist_ok=True)
    print(f"📁 Directory ready: {directory}")

# Load documents from all platforms
print("\n🔄 Loading documents from all platforms...")

mosdac_pdfs = load_pdf_folder(ROOT_MOSDAC_PDF, "mosdac")
mosdac_txt = load_txt_folder(ROOT_MOSDAC_WEB, "mosdac")
mosdac_html = load_html_folder(ROOT_MOSDAC_WEB, "mosdac")

vedas_txt = load_txt_folder(ROOT_VEDAS_WEB, "vedas")
vedas_html = load_html_folder(ROOT_VEDAS_WEB, "vedas")

bhuvan_txt = load_txt_folder(ROOT_BHUVAN_WEB, "bhuvan")
bhuvan_html = load_html_folder(ROOT_BHUVAN_WEB, "bhuvan")

# Combine all documents
documents_all = mosdac_pdfs + mosdac_txt + mosdac_html + vedas_txt + vedas_html + bhuvan_txt + bhuvan_html

print(f"\n📊 Total documents loaded: {len(documents_all)}")
if documents_all:
    print("📋 Sample metadata:", documents_all[0].metadata)
    
    # Count by platform
    platform_counts = {}
    for doc in documents_all:
        platform = doc.metadata.get('source_site', 'unknown')
        platform_counts[platform] = platform_counts.get(platform, 0) + 1
    
    print("📈 Documents by platform:")
    for platform, count in platform_counts.items():
        print(f"   {platform.upper()}: {count} documents")
else:
    print("⚠️ No documents found! Please add files to the data directories.")


📁 Directory ready: data_ingestion/mosdac_data/documents
📁 Directory ready: data_ingestion/mosdac_data/web_pages
📁 Directory ready: data_ingestion/vedas_data/web_pages
📁 Directory ready: data_ingestion/bhuvan_data/web_pages

🔄 Loading documents from all platforms...


Multiple definitions in dictionary at byte 0x30f8cb for key /Im1027
Multiple definitions in dictionary at byte 0x30f8dc for key /Im1027


✅ Loaded 1265 PDF files from mosdac
✅ Loaded 220 TXT files from mosdac
✅ Loaded 0 HTML files from mosdac
✅ Loaded 233 TXT files from vedas
✅ Loaded 0 HTML files from vedas
✅ Loaded 6 TXT files from bhuvan
✅ Loaded 0 HTML files from bhuvan

📊 Total documents loaded: 1724
📋 Sample metadata: {'producer': 'Nitro PDF PrimoPDF', 'creator': 'PrimoPDF http://www.primopdf.com', 'creationdate': '2012-03-20T14:49:18-05:30', 'moddate': '2012-03-20T14:49:18-05:30', 'title': 'Microsoft Word - Analysed-Winds', 'author': 'admin', 'source': 'data_ingestion\\mosdac_data\\documents\\Analysed-Winds.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1', 'source_site': 'mosdac'}
📈 Documents by platform:
   MOSDAC: 1485 documents
   VEDAS: 233 documents
   BHUVAN: 6 documents


In [20]:
# Text splitting with optimized parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)

print("🔄 Splitting documents into chunks...")
chunks_all = text_splitter.split_documents(documents_all)
print(f"📊 Total chunks created: {len(chunks_all)}")

# Analyze chunks by platform
if chunks_all:
    chunk_counts = {}
    for chunk in chunks_all:
        platform = chunk.metadata.get('source_site', 'unknown')
        chunk_counts[platform] = chunk_counts.get(platform, 0) + 1
    
    print("📈 Chunks by platform:")
    for platform, count in chunk_counts.items():
        print(f"   {platform.upper()}: {count} chunks")
    
    # Show sample chunk
    first_chunk = chunks_all[0]
    print(f"\n📋 Sample chunk metadata: {first_chunk.metadata}")
    print(f"📝 Sample content preview: {first_chunk.page_content[:200]}...")


🔄 Splitting documents into chunks...
📊 Total chunks created: 4596
📈 Chunks by platform:
   MOSDAC: 3956 chunks
   VEDAS: 634 chunks
   BHUVAN: 6 chunks

📋 Sample chunk metadata: {'producer': 'Nitro PDF PrimoPDF', 'creator': 'PrimoPDF http://www.primopdf.com', 'creationdate': '2012-03-20T14:49:18-05:30', 'moddate': '2012-03-20T14:49:18-05:30', 'title': 'Microsoft Word - Analysed-Winds', 'author': 'admin', 'source': 'data_ingestion\\mosdac_data\\documents\\Analysed-Winds.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1', 'source_site': 'mosdac'}
📝 Sample content preview: GLOBAL ANALYSED OCEAN SURFACE WIND PRODUCTS 
 
 
Description 
The analysed winds have been generated at 0.5 
0×0.5 0 interval over the global oceans. For the generation of 
these analysed winds produc...


In [21]:
# Initialize embeddings and vector store
print("🔄 Setting up vector store...")

embeddings = OpenAIEmbeddings()

# Optional: start from a clean collection if you want to rebuild
if os.path.exists(db_name):
    print("🗑️ Removing existing vector store...")
    try:
        old_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
        old_store.delete_collection()
        print("✅ Old vector store removed")
    except Exception as e:
        print(f"⚠️ Could not remove old store: {e}")

print("🔄 Creating new vector store...")
vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

# Batch insert with progress tracking
if chunks_all:
    BATCH_SIZE = 64
    total_chunks = len(chunks_all)
    
    print(f"📊 Inserting {total_chunks} chunks in batches of {BATCH_SIZE}...")
    
    for i in range(0, total_chunks, BATCH_SIZE):
        batch = chunks_all[i:i+BATCH_SIZE]
        vectorstore.add_documents(batch)
        
        progress = min(i + BATCH_SIZE, total_chunks)
        percentage = (progress / total_chunks) * 100
        print(f"✅ Progress: {progress}/{total_chunks} chunks ({percentage:.1f}%)")
    
    final_count = vectorstore._collection.count()
    print(f"🎉 Vector store created successfully!")
    print(f"📊 Final collection size: {final_count}")
else:
    print("❌ No chunks to insert into vector store!")


🔄 Setting up vector store...
🗑️ Removing existing vector store...
✅ Old vector store removed
🔄 Creating new vector store...
📊 Inserting 4596 chunks in batches of 64...
✅ Progress: 64/4596 chunks (1.4%)
✅ Progress: 128/4596 chunks (2.8%)
✅ Progress: 192/4596 chunks (4.2%)
✅ Progress: 256/4596 chunks (5.6%)
✅ Progress: 320/4596 chunks (7.0%)
✅ Progress: 384/4596 chunks (8.4%)
✅ Progress: 448/4596 chunks (9.7%)
✅ Progress: 512/4596 chunks (11.1%)
✅ Progress: 576/4596 chunks (12.5%)
✅ Progress: 640/4596 chunks (13.9%)
✅ Progress: 704/4596 chunks (15.3%)
✅ Progress: 768/4596 chunks (16.7%)
✅ Progress: 832/4596 chunks (18.1%)
✅ Progress: 896/4596 chunks (19.5%)
✅ Progress: 960/4596 chunks (20.9%)
✅ Progress: 1024/4596 chunks (22.3%)
✅ Progress: 1088/4596 chunks (23.7%)
✅ Progress: 1152/4596 chunks (25.1%)
✅ Progress: 1216/4596 chunks (26.5%)
✅ Progress: 1280/4596 chunks (27.9%)
✅ Progress: 1344/4596 chunks (29.2%)
✅ Progress: 1408/4596 chunks (30.6%)
✅ Progress: 1472/4596 chunks (32.0%)
✅ Pr

In [22]:
# Initialize LLM and retriever
print("🔄 Setting up RAG chain...")

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Enhanced prompt template
rag_prompt_template = """You are ASTROGEO AI, a specialized assistant for Indian space and earth observation data platforms.

You have access to information from three major platforms:
- MOSDAC: Meteorological & Oceanographic Satellite Data Archival Centre
- VEDAS: Visualization of Earth observation Data and Archival System  
- BHUVAN: Indian Geo-platform for visualization and analysis

Instructions:
1. Answer questions based on the provided context documents
2. If information is available in the documents, provide detailed, comprehensive responses
3. Always mention which platform(s) the information comes from (MOSDAC, VEDAS, or BHUVAN)
4. If documents don't contain the answer, use general knowledge but clearly state this
5. Be helpful, accurate, and technical when appropriate
6. For data access questions, provide specific steps and requirements

Context from documents:
{context}

Chat History:
{chat_history}

User Question: {question}

Response:"""

RAG_PROMPT = PromptTemplate.from_template(rag_prompt_template)

# Create the conversational RAG chain
rag_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": RAG_PROMPT},
    return_source_documents=True
)

print("✅ RAG chain setup complete!")


🔄 Setting up RAG chain...
✅ RAG chain setup complete!


In [23]:
def is_relevant(question, docs):
    """Check if retrieved documents are relevant to the question"""
    if not docs:
        return False
    
    # Take the first document for relevance check
    doc_content = docs[0].page_content[:1500] if docs else ""
    
    relevance_prompt = f"""Analyze if the following document content is relevant to answer the user's question.
    Answer only "Yes" or "No".
    
    Question: {question}
    
    Document Content: {doc_content}
    
    Relevance Assessment:"""
    
    try:
        check = llm.invoke(relevance_prompt)
        return "Yes" in check.content
    except Exception as e:
        print(f"⚠️ Relevance check error: {e}")
        return True  # Default to using RAG if check fails

# Fallback prompt for general knowledge responses
fallback_prompt_template = """You are ASTROGEO AI, a helpful assistant specializing in Indian space and earth observation platforms (MOSDAC, VEDAS, BHUVAN).

The user's question doesn't seem to be directly related to the documents in your knowledge base, so please answer using your general knowledge while staying within your area of expertise.

If the question is about:
- Satellite data, remote sensing, or earth observation
- Indian space programs (ISRO)
- Weather and climate data
- GIS and mapping
- Data access and processing

Please provide a helpful response. If it's completely outside your domain, politely redirect to your specializations.

Question: {question}

Response:"""

FALLBACK_PROMPT = PromptTemplate.from_template(fallback_prompt_template)

def smart_bot(message, history):
    """Main chatbot function with smart routing"""
    try:
        # Retrieve relevant documents
        docs = retriever.get_relevant_documents(message)
        
        # Check relevance and route accordingly
        if docs and is_relevant(message, docs):
            # Use RAG chain for document-based response
            result = rag_chain.invoke({"question": message})
            
            # Add source information
            sources = set()
            if hasattr(result, 'source_documents') and result['source_documents']:
                for doc in result['source_documents']:
                    source_site = doc.metadata.get('source_site', 'unknown')
                    sources.add(source_site.upper())
            
            response = result["answer"]
            if sources:
                response += f"\n\n*Sources: {', '.join(sources)}*"
            
            return response
        else:
            # Use fallback for general knowledge
            chain = FALLBACK_PROMPT | llm
            result = chain.invoke({"question": message})
            return result.content + "\n\n*Note: This response is based on general knowledge as no relevant documents were found in the knowledge base.*"
            
    except Exception as e:
        return f"❌ Sorry, I encountered an error: {str(e)}. Please try rephrasing your question."

print("✅ Smart bot function ready!")


✅ Smart bot function ready!


In [24]:
# Create and launch the Gradio interface
print("🚀 Launching ASTROGEO AI Chat Interface...")

# Custom CSS for better appearance
custom_css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif !important;
}
"""

# Create the chat interface
interface = gr.ChatInterface(
    fn=smart_bot,
    title="🛰️ ASTROGEO AI — Multi-Platform Earth Observation Assistant",
    description="""
    **Powered by MOSDAC + VEDAS + BHUVAN Knowledge Base**
    
    Ask me about:
    • MOSDAC satellite data and services
    • VEDAS earth observation and visualization
    • BHUVAN geospatial data and mapping
    • Data access procedures and formats
    • Indian space and remote sensing programs
    """,
    theme="soft",
    css=custom_css,
    examples=[
        "What types of data are available in MOSDAC?",
        "How do I access satellite imagery from VEDAS?",
        "What mapping services does BHUVAN provide?",
        "How can I download INSAT-3D data?",
        "What are the data formats supported by these platforms?",
        "Tell me about Indian earth observation satellites"
    ],
    cache_examples=False,
    analytics_enabled=False
)

# Launch the interface
try:
    interface.launch(
        share=True,  # Creates public link
        server_name="0.0.0.0",  # Makes it accessible on network
        server_port=7860,  # Default Gradio port
        show_error=True
    )
    print("✅ Interface launched successfully!")
except Exception as e:
    print(f"❌ Launch error: {e}")
    # Fallback launch without share
    interface.launch(show_error=True)


🚀 Launching ASTROGEO AI Chat Interface...


  self.chatbot = Chatbot(


❌ Launch error: Cannot find empty port in range: 7860-7860. You can specify a different port by setting the GRADIO_SERVER_PORT environment variable or passing the `server_port` parameter to `launch()`.
* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


In [25]:
# Test the system with sample queries
def test_system():
    """Test the chatbot with sample queries"""
    test_queries = [
        "What is MOSDAC?",
        "How do I access VEDAS data?",
        "What services does BHUVAN provide?",
        "Tell me about INSAT satellites",
        "What is the weather like today?"  # This should trigger fallback
    ]
    
    print("🧪 Testing the system with sample queries...")
    
    for i, query in enumerate(test_queries, 1):
        print(f"\n{'='*50}")
        print(f"Test {i}: {query}")
        print('='*50)
        
        try:
            response = smart_bot(query, [])
            print(f"Response: {response[:300]}...")
        except Exception as e:
            print(f"❌ Error: {e}")

# Uncomment the line below to run tests
# test_system()

print("\n🎉 ASTROGEO AI is ready!")
print("📊 System Statistics:")
print(f"   • Total documents: {len(documents_all)}")
print(f"   • Total chunks: {len(chunks_all)}")
print(f"   • Vector store size: {vectorstore._collection.count()}")
print(f"   • Model: {MODEL}")



🎉 ASTROGEO AI is ready!
📊 System Statistics:
   • Total documents: 1724
   • Total chunks: 4596
   • Vector store size: 4596
   • Model: gpt-4o-mini


  docs = retriever.get_relevant_documents(message)


In [26]:
##heyy