In [1]:
# Core Python imports
import os
from dotenv import load_dotenv
#version1223

# Core ML/Data imports
import gradio as gr
import numpy as np

# LangChain - RAG knowledge base
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader, UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

print("✅ Core libraries imported.")


  from .autonotebook import tqdm as notebook_tqdm


✅ Core libraries imported.


In [None]:
# Set API Keys - Using HF GPT-OSS (Free) instead of OpenAI
import os
from dotenv import load_dotenv

# Load environment variables from the local .env file. override=True lets the
# values in .env replace variables that are already set in the process.
load_dotenv(override=True)

# Read the Hugging Face token directly from the environment.
hf_key = os.getenv("HF_TOKEN")  # Make sure .env has HF_TOKEN=your_token

# Export the token under the second variable name this notebook sets.
# os.environ only accepts strings, so a missing token (None) must be skipped
# here; otherwise the assignment raises TypeError before the check below can
# print its warning.
if hf_key:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_key

print("✅ Using Hugging Face GPT-OSS 120B (Free)")

# Verify HF token
if hf_key and hf_key.startswith("hf_"):
    # Show only the last 4 characters so saved cell output does not leak
    # a large part of the secret.
    print(f"✅ Hugging Face Token set: hf_...{hf_key[-4:]}")
else:
    print("⚠️ Hugging Face Token invalid or missing! Check your .env file.")


✅ Using Hugging Face GPT-OSS 120B (Free)
✅ Hugging Face Token set: hf_GNQRSqB...mvKy


In [4]:
# Directory layout: one entry per (platform, content kind) pair.
DATA_DIRS = dict(
    mosdac_pdf="data_ingestion/mosdac_data/documents",
    mosdac_web="data_ingestion/mosdac_data/web_pages",
    vedas_web="data_ingestion/vedas_data/web_pages",
    bhuvan_web="data_ingestion/bhuvan_data/web_pages",
)

# Create any directory that is missing; existing ones are left untouched.
for d in DATA_DIRS.values():
    os.makedirs(d, exist_ok=True)

print("📁 Data directories ready.")

# Define universal loader
def load_docs(path, glob_pattern, loader_func, tag):
    """Load all files under ``path`` that match ``glob_pattern`` and tag them.

    Args:
        path: Directory handed to ``DirectoryLoader``.
        glob_pattern: Glob expression selecting the files to load.
        loader_func: Class or callable passed to ``DirectoryLoader`` as
            ``loader_cls``.
        tag: Value written to ``metadata["source_site"]`` of every loaded
            document.

    Returns:
        The list of loaded documents, or an empty list if any step raised.
    """
    try:
        loaded = DirectoryLoader(
            path=path,
            glob=glob_pattern,
            loader_cls=loader_func,
        ).load()
        for document in loaded:
            document.metadata["source_site"] = tag
        print(f"✅ Loaded {len(loaded)} files from {tag}.")
    except Exception as err:
        # Best effort: report the problem and let the caller continue with nothing.
        print(f"❌ Error loading from {path}: {str(err)}")
        return []
    return loaded


📁 Data directories ready.


In [6]:
# Load each type by platform
def _utf8_text_loader(file_path):
    """Create a TextLoader for ``file_path`` that always decodes as UTF-8."""
    return TextLoader(file_path, encoding="utf-8")


mosdac_pdfs = load_docs(DATA_DIRS['mosdac_pdf'], "**/*.pdf", PyPDFLoader, "mosdac")
mosdac_txt = load_docs(DATA_DIRS['mosdac_web'], "**/*.txt", _utf8_text_loader, "mosdac")
mosdac_html = load_docs(DATA_DIRS['mosdac_web'], "**/*.html", UnstructuredHTMLLoader, "mosdac")
vedas_txt = load_docs(DATA_DIRS['vedas_web'], "**/*.txt", _utf8_text_loader, "vedas")
vedas_html = load_docs(DATA_DIRS['vedas_web'], "**/*.html", UnstructuredHTMLLoader, "vedas")
bhuvan_txt = load_docs(DATA_DIRS['bhuvan_web'], "**/*.txt", _utf8_text_loader, "bhuvan")
bhuvan_html = load_docs(DATA_DIRS['bhuvan_web'], "**/*.html", UnstructuredHTMLLoader, "bhuvan")

# Combine everything into one flat list, keeping the load order above.
documents_all = [
    *mosdac_pdfs,
    *mosdac_txt,
    *mosdac_html,
    *vedas_txt,
    *vedas_html,
    *bhuvan_txt,
    *bhuvan_html,
]
print(f"📊 Total documents loaded: {len(documents_all)}")


Multiple definitions in dictionary at byte 0x30f8cb for key /Im1027
Multiple definitions in dictionary at byte 0x30f8dc for key /Im1027


✅ Loaded 1265 files from mosdac.
✅ Loaded 220 files from mosdac.
✅ Loaded 0 files from mosdac.
✅ Loaded 233 files from vedas.
✅ Loaded 0 files from vedas.
✅ Loaded 6 files from bhuvan.
✅ Loaded 0 files from bhuvan.
📊 Total documents loaded: 1724


In [7]:
# Split the loaded documents into overlapping chunks for the vector DB.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
)
chunks_all = splitter.split_documents(documents_all)
print(f"📊 Total chunks created: {len(chunks_all)}")


📊 Total chunks created: 4596


In [8]:
# Cell 6: Vector Store with RAG - WINDOWS PERMISSION FIX
import shutil
import os
import stat
from huggingface_hub import InferenceClient

# Step 1: Windows-compatible folder deletion
def force_delete_folder(path):
    """Remove the directory tree at ``path``, retrying entries that fail.

    When ``shutil.rmtree`` cannot remove an entry, the entry is made
    writable and the failed operation is retried once. Any exception is
    reported with a hint instead of being raised. Returns ``None``.
    """
    def _make_writable_and_retry(failed_op, target, _exc_info):
        # Nothing to retry if the entry has already disappeared.
        if not os.path.exists(target):
            return
        os.chmod(target, stat.S_IWRITE)
        failed_op(target)

    try:
        if not os.path.exists(path):
            print(f"📁 {path} doesn't exist - creating fresh")
            return
        shutil.rmtree(path, onerror=_make_writable_and_retry)
        print(f"🗑️ Successfully deleted {path}")
    except Exception as problem:
        print(f"⚠️ Manual deletion needed: {problem}")
        print("💡 Solution: Restart your Jupyter kernel, then run this cell again")

# Delete old vector DB so the store below is rebuilt from scratch on every run.
# db_name is reused further down as the Chroma persist_directory.
db_name = "vector_db"
force_delete_folder(db_name)

# Step 2: Custom HF Embeddings with consistent dimensions
class HFEmbeddings:
    """Embedding adapter backed by the Hugging Face Inference API.

    Provides the ``embed_documents`` / ``embed_query`` pair that the vector
    store in this notebook receives as its ``embedding_function``.

    Args:
        client: Object exposing ``feature_extraction(text, model=...)``.
        model: Model id sent with every request.
        dim: Length of the fallback vector used when a request fails.
        max_chars: Texts are cut to this many characters before sending
            (kept short for the free tier).
    """

    def __init__(self, client, model="sentence-transformers/all-mpnet-base-v2",
                 dim=768, max_chars=500):
        self.client = client
        self.model = model  # default model: 768 dimensions
        self.dim = dim
        self.max_chars = max_chars

    def _fallback(self):
        """Return the constant placeholder vector used when embedding fails."""
        return [0.1] * self.dim

    def _embed_one(self, content):
        """Embed one string and return a flat ``list[float]``.

        ``InferenceClient.feature_extraction`` returns a numpy array, not a
        list, so the response is normalised through numpy. The previous
        ``isinstance(emb, list)`` check never matched an array, which sent
        every document to the constant fallback vector.

        Raises:
            ValueError: If the response is empty or not at least 1-D.
        """
        raw = self.client.feature_extraction(content[: self.max_chars], model=self.model)
        vector = np.asarray(raw, dtype=float)
        # Nested responses such as (1, dim): keep the first row, exactly as
        # the original list-based code did with ``emb[0]``.
        while vector.ndim > 1:
            vector = vector[0]
        if vector.ndim == 0 or vector.size == 0:
            raise ValueError("empty embedding response")
        return vector.tolist()

    def embed_documents(self, texts):
        """Embed each text (or object with ``page_content``) in ``texts``.

        Returns one vector per input, in order. A failed item is logged and
        replaced by the fallback vector so a single error does not abort the
        whole batch.
        """
        embeddings = []
        print(f"📊 Processing {len(texts)} documents for embeddings...")

        for i, text in enumerate(texts):
            try:
                # Extract page_content if it's a Document object
                content = text.page_content if hasattr(text, 'page_content') else str(text)
                embeddings.append(self._embed_one(content))
            except Exception as e:
                print(f"⚠️ Error for text {i}: {e}")
                embeddings.append(self._fallback())  # Fallback

            if (i + 1) % 10 == 0:
                print(f"✅ Processed {i + 1}/{len(texts)} embeddings")

        return embeddings

    def embed_query(self, text):
        """Embed a single query string; returns the fallback vector on failure."""
        try:
            return self._embed_one(text)
        except Exception as e:
            # Previously a bare ``except:`` hid the reason for the fallback.
            print(f"⚠️ Query embedding failed, using fallback vector: {e}")
            return self._fallback()

# Step 3: Initialize client and embeddings
client = InferenceClient(token=os.getenv('HF_TOKEN'))
embeddings = HFEmbeddings(client)

# Step 4: Create fresh vector store
try:
    vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)
    print("✅ Fresh vector store created")

    # Step 5: Add your MOSDAC/VEDAS/BHUVAN documents in batches
    if 'chunks_all' in globals() and chunks_all:
        BATCH_SIZE = 10  # Small batches for free tier
        total_chunks = min(100, len(chunks_all))  # Limit for testing
        added_chunks = 0  # chunks actually stored (failed batches excluded)

        print(f"📊 Adding {total_chunks} MOSDAC/VEDAS/BHUVAN chunks...")

        for i in range(0, total_chunks, BATCH_SIZE):
            # Clamp the slice so the testing limit holds even when it is not
            # a multiple of BATCH_SIZE.
            batch = chunks_all[i:min(i + BATCH_SIZE, total_chunks)]

            try:
                vectorstore.add_documents(batch)
                added_chunks += len(batch)
                progress = min(i + BATCH_SIZE, total_chunks)
                print(f"✅ Added batch {i//BATCH_SIZE + 1}: {progress}/{total_chunks}")

            except Exception as e:
                print(f"❌ Batch {i//BATCH_SIZE + 1} failed: {e}")
                continue

        # Report what was really stored; the old message claimed full
        # success even when batches had failed.
        if added_chunks == total_chunks:
            print(f"🎉 Vector store created with {total_chunks} MOSDAC/VEDAS/BHUVAN documents!")
        else:
            print(f"⚠️ Vector store created with only {added_chunks}/{total_chunks} MOSDAC/VEDAS/BHUVAN documents (some batches failed)")

    else:
        print("❌ No chunks_all found. Please run document loading cells first.")

except Exception as e:
    # NOTE: if Chroma() itself fails, `vectorstore` stays undefined and later
    # cells that use it will raise NameError.
    print(f"❌ Vector store creation failed: {e}")
    print("💡 Try restarting your kernel and running this cell again")


🗑️ Successfully deleted vector_db
✅ Fresh vector store created
📊 Adding 100 MOSDAC/VEDAS/BHUVAN chunks...
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 1: 10/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 2: 20/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 3: 30/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 4: 40/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 5: 50/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 6: 60/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 7: 70/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 8: 80/100
📊 Processing 10 documents for embeddings...
✅ Processed 10/10 embeddings
✅ Added batch 9: 90/100
📊 Processing 10 docum

In [9]:
# Setup FREE GPT-OSS 120B from Hugging Face
from huggingface_hub import InferenceClient

class FreeGPTOSS:
    """Small LLM wrapper around the Hugging Face Inference API.

    ``invoke`` always returns an object with a ``content`` attribute, on
    success and on failure, so callers can read ``.content`` unconditionally.
    """

    class Response:
        """Response object similar to OpenAI format: the text is in ``content``."""

        def __init__(self, content):
            self.content = content

    def __init__(self, hf_token):
        self.client = InferenceClient(token=hf_token)
        self.model = "openai/gpt-oss-120b"  # Free GPT-OSS model

    def invoke(self, prompt):
        """Generate text for ``prompt``.

        Returns:
            ``Response`` holding the generated text, or ``"Error: <message>"``
            when the request raises.
        """
        # Response lives on the class. It used to be defined inside the try
        # block after the API call, so a failing call hit the except branch
        # with Response still unbound and raised UnboundLocalError.
        try:
            response = self.client.text_generation(
                prompt=prompt,
                model=self.model,
                max_new_tokens=500,
                temperature=0.7
            )
        except Exception as e:
            return self.Response(f"Error: {str(e)}")
        return self.Response(response)

# Initialize free LLM
llm = FreeGPTOSS(os.getenv('HF_TOKEN'))
# Retriever over the vector store; search_kwargs={"k": 4} is forwarded to as_retriever.
# NOTE(review): `vectorstore` comes from the vector-store cell; if that cell's
# try block failed, this line raises NameError.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Simplified RAG without LangChain memory (to avoid OpenAI dependency)
def simple_rag_query(question):
    """Answer ``question`` from retrieved document context.

    Fetches documents from the module-level ``retriever``, joins the text of
    the first three into a context block, and sends the prompt to the
    module-level ``llm``.

    Returns:
        The LLM's ``content`` string, or ``"Error: <message>"`` if any step
        raised.
    """
    try:
        # Retrievers are invoked through the Runnable interface; the older
        # get_relevant_documents() entry point is deprecated in LangChain.
        docs = retriever.invoke(question)
        context = "\n".join([doc.page_content for doc in docs[:3]])

        prompt = f"""You are ASTROGEO AI, a specialist in Indian space and earth observation platforms.

Context from documents:
{context}

User Question: {question}

Provide a detailed response based on the context above."""

        response = llm.invoke(prompt)
        return response.content
    except Exception as e:
        return f"Error: {str(e)}"

print("✅ Free GPT-OSS 120B setup complete!")


✅ Free GPT-OSS 120B setup complete!


In [10]:
# Cell 8: FASTEST Free Space Image Generator - REPLACE YOUR CURRENT ONE
from huggingface_hub import InferenceClient

class UltraFastImageGen:
    """Text-to-image helper that sends prompts to a single fixed model."""

    def __init__(self, hf_token):
        self.client = InferenceClient(token=hf_token)
        self.model = "runwayml/stable-diffusion-v1-5"  # PROVEN working free model

    def generate(self, prompt):
        """Generate an image for ``prompt``.

        The prompt gets a fixed space-photography suffix and is then cut to
        400 characters before being sent.

        Returns:
            ``(image, status_message)`` on success, ``(None, error_message)``
            when the request raises.
        """
        enhanced = f"{prompt}, space photography, ultra detailed, cinematic, 8K, photorealistic"
        request_text = enhanced[:400]  # Keep short for speed

        try:
            picture = self.client.text_to_image(prompt=request_text, model=self.model)
        except Exception as failure:
            return None, f"❌ Failed: {str(failure)}"
        return picture, "✅ Generated with Stable Diffusion v1.5 (FREE)"

# Module-level image generator instance, built with the HF_TOKEN environment variable.
# NOTE: the name is `image_generator`; code that generates images must use this name.
image_generator = UltraFastImageGen(os.getenv('HF_TOKEN'))
print("✅ Ultra-fast free image generator ready!")


✅ Ultra-fast free image generator ready!


In [11]:
# Speech-to-Text using Hugging Face Whisper
from huggingface_hub import InferenceClient

class AstroGeoASR:
    """Speech-to-text helper backed by Hugging Face Whisper models."""

    def __init__(self, hf_token):
        self.client = InferenceClient(token=hf_token)
        # Short keys accepted by transcribe(model=...) mapped to model ids.
        self.models = {
            "whisper": "openai/whisper-large-v3",
            "whisper_small": "openai/whisper-small"
        }

    @staticmethod
    def _extract_text(result):
        """Pull the transcription out of the response.

        Accepts a plain string, a dict-like object with a ``"text"`` key, or
        an object with a ``text`` attribute. Returns ``None`` if no text is
        found.
        """
        if isinstance(result, str):
            return result
        if isinstance(result, dict):
            return result.get("text")
        return getattr(result, "text", None)

    def transcribe(self, audio_file, model="whisper"):
        """Transcribe the audio file at path ``audio_file``.

        Args:
            audio_file: Path of the recording to read.
            model: Key into ``self.models``.

        Returns:
            The transcription, ``"No transcription available"`` if the
            response has no text, or a string starting with
            ``"❌ ASR failed:"`` if anything raised.
        """
        try:
            with open(audio_file, "rb") as f:
                audio_bytes = f.read()
            # The payload is passed positionally: the client's parameter is
            # named `audio`, so the former `data=` keyword made every call fail.
            result = self.client.automatic_speech_recognition(
                audio_bytes,
                model=self.models[model]
            )
            text = self._extract_text(result)
            return "No transcription available" if text is None else text
        except Exception as e:
            return f"❌ ASR failed: {str(e)}"

# Module-level speech-to-text helper, built with the HF_TOKEN environment variable.
asr = AstroGeoASR(os.getenv('HF_TOKEN'))
print("✅ Speech-to-Text (Whisper) ready")


✅ Speech-to-Text (Whisper) ready


In [12]:
# Cell 10: Smart Bot with Conversational Style
def _history_to_messages(history, max_turns=6):
    """Convert prior chat turns into chat-completion messages.

    Accepts ``(user_text, assistant_text)`` pairs or ``{"role", "content"}``
    dicts; anything else is skipped. Only the last ``max_turns`` entries are
    kept so the request stays small.
    """
    messages = []
    for turn in list(history or [])[-max_turns:]:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            user_text, bot_text = turn
            if isinstance(user_text, str) and user_text:
                messages.append({"role": "user", "content": user_text})
            if isinstance(bot_text, str) and bot_text:
                messages.append({"role": "assistant", "content": bot_text})
    return messages


def smart_bot(message, history, asr_result=None):
    """Smart conversational bot - natural responses like a normal chatbot.

    Args:
        message: The user's current message.
        history: Earlier turns of the conversation. They are now included in
            the request so the model can follow up on previous questions
            (this argument used to be ignored).
        asr_result: Optional speech transcription; when given it is echoed
            at the top of the reply as ``[From audio]: ...``.

    Returns:
        The reply text, or a fixed introduction followed by the error
        message if anything raised.
    """
    # Initialize response
    response = ""

    try:
        # Add ASR transcription if available
        if asr_result:
            response += f"[From audio]: {asr_result}\n\n"

        # Use conversational format with improved prompt
        from huggingface_hub import InferenceClient
        client = InferenceClient(token=os.getenv('HF_TOKEN'))

        # System prompt first, then earlier turns, then the new user message.
        messages = [
            {
                "role": "system",
                "content": """You are ASTROGEO AI, a smart and knowledgeable assistant specializing in space technology and earth observation. 

Instructions for responses:
- Be conversational and friendly, like a smart expert friend
- Use natural paragraph format, avoid tables or bullet points
- Give comprehensive but readable explanations
- Include technical details when relevant but explain them clearly
- Be specific and accurate about MOSDAC, VEDAS, BHUVAN, satellites, and space technology
- Sound professional yet approachable"""
            }
        ]
        messages.extend(_history_to_messages(history))
        messages.append({"role": "user", "content": message})

        # SMART CONVERSATIONAL SYSTEM PROMPT
        completion = client.chat.completions.create(
            model="openai/gpt-oss-120b",
            messages=messages,
            max_tokens=400,
            temperature=0.7
        )

        # Extract natural response
        ai_response = completion.choices[0].message.content
        response += ai_response

        return response

    except Exception as e:
        return f"I'm ASTROGEO AI, your smart space technology assistant. I can help with questions about satellites, MOSDAC, VEDAS, BHUVAN, and space observation. Error: {str(e)}"

print("✅ Updated to conversational smart bot style!")
print("✅ Responses will be natural paragraphs, not tables!")


✅ Updated to conversational smart bot style!
✅ Responses will be natural paragraphs, not tables!


In [13]:
# Complete Gradio Interface with Multi-Modal Support
def create_astrogeo_interface():
    """Build the Gradio Blocks UI for text, speech and image interaction.

    The Send button runs ``process_all``, which optionally transcribes the
    recorded audio with the module-level ``asr``, gets a reply from
    ``smart_bot``, and generates an image with the module-level
    ``image_generator`` when the input contains a trigger word.

    Returns:
        The assembled ``gr.Blocks`` interface (not yet launched).
    """
    with gr.Blocks(title="🛰️ ASTROGEO AI Pro - Free Edition") as iface:
        gr.Markdown("""
        # 🛰️ ASTROGEO AI Pro - Free Edition
        ## Multi-Modal System: Text + Image + Speech
        **Powered by Free Hugging Face Models**
        - 🤖 **Text:** GPT-OSS 120B (Free)
        - 🎨 **Images:** FLUX & SDXL (Free)
        - 🎤 **Speech:** Whisper Large (Free)
        """)

        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(label="🛰️ ASTROGEO AI", height=500)

                with gr.Row():
                    msg = gr.Textbox(
                        placeholder="Ask about MOSDAC, VEDAS, BHUVAN...",
                        label="Text Input",
                        scale=3
                    )
                    audio_in = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="🎤 Speech Input",
                        scale=1
                    )

                with gr.Row():
                    model_choice = gr.Dropdown(
                        choices=["flux", "sdxl"],
                        value="flux",
                        label="🎨 Image Model"
                    )
                    send_btn = gr.Button("🚀 Send", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

            with gr.Column(scale=1):
                gen_img = gr.Image(label="🖼️ Generated Image", visible=True)
                status = gr.Textbox(label="📊 Status", visible=True, lines=3)

        # Processing function
        def process_all(msg_text, chat_hist, audio_path, img_model):
            """Handle one Send click.

            Returns values for (chatbot, msg, audio_in, gen_img, status).
            """
            # Bind these before the try block so the except handler can
            # always use them (input_text used to be unbound there when an
            # early step raised).
            chat_hist = list(chat_hist or [])
            input_text = ""
            try:
                # Handle speech input
                asr_text = None
                asr_error = None
                if audio_path:
                    asr_text = asr.transcribe(audio_path)
                    # transcribe() reports failures as a string starting with
                    # "❌ ASR failed"; don't treat that as something the user said.
                    if asr_text and asr_text.startswith("❌ ASR failed"):
                        asr_error, asr_text = asr_text, None

                # Use text or speech input
                input_text = (msg_text or "").strip() or (asr_text or "")
                if not input_text:
                    return chat_hist, "", None, None, asr_error or "❌ No input provided"

                # Get text response
                response = smart_bot(input_text, chat_hist, asr_text)

                # Generate image if requested
                img, img_status = None, "No image generation requested"
                if any(word in input_text.lower() for word in ["image", "visualize", "generate", "show", "create"]):
                    # The generator object is `image_generator` and its
                    # generate() takes only the prompt. The old call used an
                    # undefined `image_gen` with an extra argument, so every
                    # image request ended in the except branch.
                    # NOTE: img_model from the dropdown is not applied, as the
                    # generator is bound to a single model.
                    img, img_status = image_generator.generate(input_text)

                # Update chat
                chat_hist = chat_hist + [(input_text, response)]

                # Status update
                status_msg = f"✅ Response generated\n🎨 {img_status}\n📊 Using Free HF Models"
                if asr_error:
                    status_msg += f"\n{asr_error}"

                return chat_hist, "", None, img, status_msg

            except Exception as e:
                error_msg = f"❌ Error: {str(e)}"
                chat_hist = chat_hist + [(input_text or "Error", error_msg)]
                return chat_hist, "", None, None, f"❌ Error: {str(e)}"

        # Event handlers
        send_btn.click(
            process_all,
            inputs=[msg, chatbot, audio_in, model_choice],
            outputs=[chatbot, msg, audio_in, gen_img, status]
        )

        clear_btn.click(
            fn=lambda: ([], "", None, None, "Ready for questions..."),
            outputs=[chatbot, msg, audio_in, gen_img, status]
        )

    return iface

# Create interface (this only builds the Blocks object; launching is a separate step)
iface = create_astrogeo_interface()
print("✅ Complete multimodal interface ready (100% FREE)")


✅ Complete multimodal interface ready (100% FREE)


  chatbot = gr.Chatbot(label="🛰️ ASTROGEO AI", height=500)


In [14]:
# Launch the app. No server_port is passed, so this does NOT select a random
# port (the recorded output shows http://127.0.0.1:7860).
# share=True also publishes a temporary public *.gradio.live URL (see the
# recorded output), so anyone with that link can reach this app and use the
# configured HF token through it. Set share=False for local-only use.
iface.launch(
    share=True,
    inbrowser=True,
    show_error=True
)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://ae26a409a3f45c230d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


