In [48]:
# =====================================
# MODULE 1: DEPENDENCIES AND IMPORTS
# =====================================
# Run this cell first to install and import all required dependencies

# Install dependencies (run once)
# !pip install langchain langchain-community chromadb pypdf sentence-transformers faiss-cpu numpy huggingface-hub requests torch
# ollama run qwen3:0.6b 

import os
import time
from typing import List, Dict, Any
import logging
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Core libraries
import chromadb
from chromadb.config import Settings
import faiss
import numpy as np

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document


# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ All dependencies imported successfully!")
print("📁 Make sure your PDF files are in the './gnidp_pdfs/' directory")
print("🤖 Make sure Ollama is running with: ollama serve")
print("📦 Make sure gemma3:4b model is pulled: ollama pull gemma3:4b")

✅ All dependencies imported successfully!
📁 Make sure your PDF files are in the './gnidp_pdfs/' directory
🤖 Make sure Ollama is running with: ollama serve
📦 Make sure gemma3:4b model is pulled: ollama pull gemma3:4b


In [49]:
# Additional imports for caching
from diskcache import Cache
from functools import lru_cache
from typing import List, Dict, Any, Optional  # Add Optional to existing imports
import hashlib
import json

# Cache module
class QueryCache:
    """Handles multi-level caching for GNIDP RAG system"""
    
    def __init__(self, cache_dir: str = "cache", ttl: int = 3600):
        self.memory_cache: Dict[str, Any] = {}
        self.cache_dir = cache_dir
        self.ttl = ttl  # Time-to-live in seconds
        os.makedirs(cache_dir, exist_ok=True)
        self.disk_cache = Cache(cache_dir)
        
    def _generate_cache_key(self, query: str) -> str:
        """Generate consistent cache key from query"""
        return hashlib.sha256(query.lower().strip().encode()).hexdigest()
    
    @lru_cache(maxsize=1000)
    def _get_from_memory(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get result from memory cache with LRU strategy"""
        result = self.memory_cache.get(cache_key)
        if result and (time.time() - result['timestamp']) < self.ttl:
            return result['data']
        return None
    
    def get(self, query: str) -> Optional[Dict[str, Any]]:
        """Get cached result using multi-level cache strategy"""
        cache_key = self._generate_cache_key(query)
        
        # Try memory cache first
        result = self._get_from_memory(cache_key)
        if result:
            return result
        
        # Try disk cache
        result = self.disk_cache.get(cache_key)
        if result:
            # Promote to memory cache
            self.set(query, result)
            return result
            
        return None
    
    def set(self, query: str, result: Dict[str, Any]) -> None:
        """Store result in both memory and disk cache"""
        cache_key = self._generate_cache_key(query)
        
        # Memory cache
        self.memory_cache[cache_key] = {
            'data': result,
            'timestamp': time.time()
        }
        
        # Disk cache
        self.disk_cache.set(cache_key, result, expire=self.ttl)
    
    def clear(self) -> None:
        """Clear all caches"""
        self.memory_cache.clear()
        self.disk_cache.clear()
        
    def get_stats(self) -> Dict[str, Any]:
        """Get cache statistics"""
        return {
            'memory_cache_size': len(self.memory_cache),
            'disk_cache_size': len(self.disk_cache),
            'cache_dir': self.cache_dir,
            'ttl': self.ttl
        }

In [50]:
# =====================================
# MODULE 2: CONFIGURATION SETTINGS
# =====================================
# Configure all parameters for the RAG system

class GNIDPConfig:
    """Configuration class for GNIDP RAG Chatbot"""
    
    def __init__(self):
        # File paths
        self.PDF_DIRECTORY = r"C:\Users\ACER\Documents\NIC_intern\Little Andaman\D_set"  # UPDATE THIS PATH TO YOUR PDF DIRECTORY
        self.VECTORSTORE_DIR = r"C:\Users\ACER\Documents\NIC_intern\Little Andaman\V_set"  # Vector store persistence directory
        self.VECTORSTORE_FILENAME = "LA_vectorstore"  # Removed .faiss extension
        self.REBUILD_VECTORSTORE = False # Changed to True to force rebuild
        
        # Model settings
        self.OLLAMA_MODEL = "qwen3:0.6b"  # Ollama model for LLM
        self.EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
        
        # Performance-optimized chunking parameters
        self.CHUNK_SIZE = 500  # Down from 1000
        self.CHUNK_OVERLAP = 20  # Reduced from 50
        
        # Vector store settings
        self.VECTORSTORE_TYPE = "faiss"  # "faiss" for speed, "chroma" for persistence
        self.RETRIEVAL_K = 1  # Reduced from 3 for faster retrieval
        
        # LLM parameters
        self.TEMPERATURE = 0.2
        self.TOP_K = 1
        self.TOP_P = 0.9
        self.NUM_CTX = 512  # Reduced context window for faster processing
        
        # GNIDP-specific keywords for filtering
        self.GNIDP_KEYWORDS = [
            # Island Names & Geography
    'little andaman', 'little andaman island', 'south andaman', 'andaman islands',
    'bay of bengal', 'andaman sea', 'ten degree channel', 'car nicobar',
    
    # Administrative & Locations
    'hut bay', 'dugong creek', 'butler bay', 'netaji nagar', 'harminder bay',
    'jackson creek', 'whisper wave', 'kala pathar', 'white surf', 'lalaji bay',
    'rampur', 'govind nagar', 'om shanti', 'khari nadi', 'bumila',
    
    # Beaches & Coastal Areas
    'butler bay beach', 'kala pathar beach', 'white surf beach', 'whisper wave beach',
    'netaji nagar beach', 'harminder bay beach', 'lalaji bay beach', 'om shanti beach',
    'beach', 'coastline', 'shore', 'bay', 'creek', 'surf point',
    
    # Indigenous Communities & Tribes
    'onge', 'onge tribe', 'indigenous people', 'tribal community', 'primitive tribe',
    'particularly vulnerable tribal group', 'pvtg', 'aboriginal', 'native population',
    'tribal rights', 'tribal land', 'forest rights', 'customary rights',
    
    # Wildlife & Biodiversity
    'elephant', 'asian elephant', 'wild elephant', 'elephant corridor',
    'saltwater crocodile', 'estuarine crocodile', 'sea turtle', 'olive ridley',
    'leatherback turtle', 'green turtle', 'hawksbill turtle', 'turtle nesting',
    'dugong', 'dolphins', 'whales', 'coral reef', 'mangroves', 'tropical forest',
    'endemic species', 'biodiversity', 'wildlife sanctuary', 'marine life',
    'bird watching', 'migratory birds', 'nesting sites', 'breeding grounds',
    
    # Flora & Forest
    'tropical rainforest', 'evergreen forest', 'littoral forest', 'mangrove forest',
    'padauk', 'gurjan', 'mahua', 'bamboo', 'rattan', 'medicinal plants',
    'timber', 'forest cover', 'deforestation', 'forest conservation',
    
    # Marine & Aquatic
    'coral reef', 'coral bleaching', 'marine ecosystem', 'seagrass beds',
    'fishing', 'marine protected area', 'coastal zone', 'tidal zone',
    'marine biodiversity', 'reef fish', 'shark', 'ray', 'barracuda',
    
    # Tourism & Activities
    'eco-tourism', 'beach tourism', 'adventure tourism', 'surfing', 'scuba diving',
    'snorkeling', 'water sports', 'jungle trekking', 'elephant safari',
    'nature walk', 'bird watching', 'fishing', 'boating', 'swimming',
    'tourist spots', 'tourist attractions', 'accommodation', 'resorts', 'hotels',
    
    # Transportation & Connectivity
    'helicopter', 'helicopter service', 'ship', 'ferry', 'boat service',
    'inter-island connectivity', 'port blair', 'hut bay jetty', 'landing ground',
    'road network', 'transportation', 'accessibility', 'remote location',
    
    # Development & Infrastructure
    'infrastructure development', 'road construction', 'jetty development',
    'tourism infrastructure', 'sustainable development', 'carrying capacity',
    'development pressure', 'urbanization', 'settlement', 'village development',
    
    # Environmental Concerns
    'environmental impact', 'climate change', 'sea level rise', 'coastal erosion',
    'tsunami', 'cyclone', 'natural disaster', 'vulnerability', 'adaptation',
    'conservation', 'protection', 'sustainable use', 'ecosystem services',
    'environmental degradation', 'pollution', 'waste management',
    
    # Research & Conservation
    'research station', 'scientific study', 'ecological research', 'marine research',
    'conservation project', 'wildlife protection', 'habitat conservation',
    'species protection', 'breeding program', 'monitoring', 'survey',
    
    # Cultural & Heritage
    'tribal culture', 'traditional knowledge', 'cultural heritage', 'folklore',
    'traditional practices', 'cultural preservation', 'ethnography',
    'anthropological study', 'cultural rights', 'cultural identity',
    
    # Agriculture & Livelihood
    'coconut plantation', 'rubber plantation', 'arecanut', 'paddy cultivation',
    'kitchen garden', 'shifting cultivation', 'traditional agriculture',
    'livelihood', 'subsistence farming', 'fishing community', 'forest produce',
    
    # Government & Administration
    'andaman nicobar administration', 'district collector', 'forest department',
    'tribal welfare', 'panchayat', 'local government', 'revenue village',
    'land records', 'government schemes', 'welfare programs',
    
    # Legal & Policy
    'forest rights act', 'tribal rights', 'environmental clearance',
    'coastal regulation zone', 'crz', 'wildlife protection act',
    'forest conservation act', 'island protection regulation', 'land use policy',
    
    # Challenges & Issues
    'human-elephant conflict', 'human-wildlife conflict', 'encroachment',
    'illegal logging', 'poaching', 'overfishing', 'tourism pressure',
    'waste disposal', 'sewage treatment', 'water scarcity', 'power supply',
    
    # Natural Resources
    'freshwater', 'groundwater', 'natural springs', 'water resources',
    'renewable energy', 'solar energy', 'wind energy', 'natural gas',
    'mineral resources', 'sand mining', 'stone quarrying',
    
    # Weather & Climate
    'tropical climate', 'monsoon', 'rainfall', 'humidity', 'temperature',
    'seasonal variation', 'dry season', 'wet season', 'weather pattern',
    
    # Specific Projects & Initiatives
    'eco-development', 'community-based tourism', 'sustainable tourism',
    'conservation education', 'awareness program', 'capacity building',
    'participatory management', 'stakeholder engagement',
    
    # Infrastructure & Facilities Count/Numbers
    'number of buses', 'bus service', 'public transport', 'bus routes', 'bus frequency',
    'how many buses', 'bus schedule', 'bus timings', 'local transport',
    
    'number of hospitals', 'hospital facilities', 'medical facilities', 'healthcare',
    'how many hospitals', 'primary health center', 'phc', 'dispensary', 'clinic',
    'medical services', 'emergency medical', 'ambulance service',
    
    'number of schools', 'educational institutions', 'primary school', 'high school',
    'how many schools', 'education facilities', 'school enrollment', 'teachers',
    'educational infrastructure', 'literacy rate', 'anganwadi', 'pre-school',
    
    'number of shops', 'retail stores', 'market', 'shopping facilities', 'grocery stores',
    'how many stores', 'local market', 'weekly market', 'cooperative society',
    'fair price shop', 'ration shop', 'commercial establishments',
    
    'number of atms', 'banking facilities', 'bank branches', 'how many atms',
    'financial services', 'post office', 'money transfer', 'digital banking',
    'cooperative bank', 'self help groups', 'microfinance',
    
    'number of villages', 'settlements', 'hamlets', 'revenue villages',
    'how many villages', 'village population', 'household count', 'families',
    'inhabited villages', 'tribal settlements', 'forest villages',
    
    'veterinary hospital', 'vet clinic', 'animal hospital', 'livestock care',
    'veterinary services', 'animal health', 'cattle treatment', 'pet care',
    'veterinary doctor', 'animal welfare', 'vaccination program',
    
    'fire station', 'fire brigade', 'fire service', 'emergency services',
    'fire safety', 'disaster management', 'rescue services', 'fire department',
    
    # Population & Demographics
    'population', 'total population', 'population density', 'demographic data',
    'population census', 'household size', 'age distribution', 'gender ratio',
    'tribal population', 'onge population', 'settler population', 'migrant population',
    'population growth', 'birth rate', 'death rate', 'literacy statistics',
    
    # Area & Land Measurements
    'total area', 'land area', 'geographical area', 'square kilometers', 'sq km',
    'hectares', 'area in hectares', 'island area', 'land measurement',
    'total land', 'usable land', 'cultivable land', 'agricultural area',
    
    'forest area', 'forest cover', 'forest percentage', 'wooded area',
    'reserve forest', 'protected forest', 'forest land', 'tree cover',
    'forest density', 'forest statistics', 'deforestation rate',
    
    'coastal area', 'shoreline length', 'coastline', 'beach area',
    'marine area', 'territorial waters', 'exclusive economic zone',
    
    # Road Infrastructure & Distances
    'road length', 'total road length', 'road network', 'road infrastructure',
    'paved roads', 'unpaved roads', 'motorable roads', 'all weather roads',
    'road condition', 'road connectivity', 'highway', 'state highway',
    'village roads', 'forest roads', 'beach roads',
    
    'distance to port blair', 'distance from port blair', 'distance to hut bay',
    'distance between villages', 'travel distance', 'road distance',
    'aerial distance', 'how far', 'travel time', 'journey time',
    'distance to airport', 'distance to jetty', 'distance to hospital',
    'distance to school', 'nearest town', 'nearest city',
    
    # Utilities & Services Count
    'electricity connections', 'power supply', 'solar installations', 'generators',
    'water supply', 'water connections', 'bore wells', 'hand pumps',
    'water treatment plants', 'sewage treatment', 'waste management',
    
    'telephone connections', 'mobile towers', 'internet connectivity',
    'broadband', 'digital infrastructure', 'communication facilities',
    
    'police station', 'police post', 'security', 'law and order',
    'coast guard', 'border security', 'checkpoints',
    
    # Specific Facility Numbers/Statistics
    'number of beds', 'hospital beds', 'bed capacity', 'icu beds',
    'number of doctors', 'medical staff', 'nurses', 'paramedics',
    'number of teachers', 'student teacher ratio', 'enrollment numbers',
    'number of vehicles', 'registered vehicles', 'two wheelers', 'four wheelers',
    'number of boats', 'fishing boats', 'motorboats', 'traditional boats',
    
    # Economic Data
    'per capita income', 'household income', 'poverty rate', 'employment rate',
    'unemployment', 'income statistics', 'economic indicators', 'gdp',
    'budget allocation', 'government expenditure', 'development funds',
    
    # Fair Price Shops & Public Distribution System
    'fair price shop', 'fps', 'ration shop', 'public distribution system', 'pds',
    'number of fps', 'how many fair price shops', 'ration dealers', 'pds outlets',
    'subsidized food', 'food grains', 'kerosene', 'sugar distribution',
    'monthly quota', 'food security', 'essential commodities',
    
    # Cardholders & Beneficiaries
    'ration cardholders', 'apl cardholders', 'bpl cardholders', 'aay cardholders',
    'above poverty line', 'below poverty line', 'antyodaya anna yojana',
    'number of beneficiaries', 'eligible families', 'cardholder statistics',
    'beneficiary count', 'scheme beneficiaries', 'welfare recipients',
    'food subsidy beneficiaries', 'targeted beneficiaries', 'coverage ratio',
    
    'aadhar cardholders', 'voter id cards', 'identity cards', 'documentation',
    'jan aushadhi beneficiaries', 'health insurance beneficiaries',
    'pension beneficiaries', 'scholarship recipients', 'self help group members',
    
    # Electricity Department & Infrastructure
    'electricity department', 'power department', 'electrical division',
    'power generation capacity', 'installed capacity', 'power consumption',
    'electricity production', 'power demand', 'load shedding', 'power cuts',
    'electrical connections', 'domestic connections', 'commercial connections',
    'industrial connections', 'street lighting', 'power distribution',
    'transformer capacity', 'grid capacity', 'power lines', 'electrical poles',
    'meter readings', 'electricity bills', 'tariff rates', 'power subsidy',
    'renewable energy capacity', 'solar power generation', 'wind power',
    'diesel generators', 'backup power', 'uninterrupted power supply',
    'power outages', 'electrical faults', 'maintenance schedule',
    
    # Water Department & Supply System
    'water department', 'public health engineering', 'phed', 'water supply department',
    'water treatment plant', 'wtp capacity', 'water production capacity',
    'daily water production', 'water storage capacity', 'reservoir capacity',
    'overhead tank capacity', 'underground tank capacity', 'water distribution',
    'pipe network', 'water connections', 'household water connections',
    'commercial water connections', 'institutional connections',
    'water supply hours', 'water quality', 'water testing', 'bacteriological testing',
    'chemical testing', 'water treatment', 'chlorination', 'filtration',
    'water tanker supply', 'emergency water supply', 'water shortage',
    'water conservation', 'rainwater harvesting', 'water recycling',
    'sewage treatment capacity', 'wastewater treatment', 'sewage disposal',
    
    # Fresh Water Resources & Capacity
    'freshwater sources', 'freshwater availability', 'freshwater reserves',
    'groundwater', 'groundwater table', 'water table level', 'aquifer capacity',
    'natural springs', 'stream flow', 'surface water', 'water bodies',
    'pond capacity', 'lake capacity', 'river flow', 'creek flow',
    'well capacity', 'bore well depth', 'bore well yield', 'water extraction',
    'sustainable yield', 'water recharge', 'monsoon recharge', 'infiltration rate',
    'water balance', 'water budget', 'water stress', 'water scarcity',
    'per capita water availability', 'daily water requirement', 'water demand',
    'drinking water', 'potable water', 'safe drinking water', 'water purification',
    'water storage tanks', 'community tanks', 'individual storage',
    
    # Population Sustenance & Carrying Capacity
    'carrying capacity', 'population carrying capacity', 'sustainable population',
    'population pressure', 'overpopulation', 'population limit', 'ecological footprint',
    'resource availability per capita', 'land per capita', 'water per capita',
    'food security', 'food self sufficiency', 'food production capacity',
    'agricultural productivity', 'crop yield', 'food grains production',
    'fish production', 'protein availability', 'nutritional security',
    'calorie availability', 'malnutrition rate', 'undernourishment',
    'food distribution', 'food access', 'food affordability', 'food wastage',
    
    'livelihood sustainability', 'employment capacity', 'job opportunities',
    'income generation', 'economic sustainability', 'resource depletion',
    'environmental degradation', 'ecological balance', 'natural resource management',
    'waste generation', 'waste disposal capacity', 'pollution load',
    'carbon footprint', 'environmental impact per capita',
    
    'healthcare capacity', 'patient load', 'doctor patient ratio',
    'educational capacity', 'student capacity', 'infrastructure load',
    'housing capacity', 'accommodation availability', 'settlement density',
    'transportation capacity', 'traffic load', 'road capacity',
    
    # Government Schemes & Programs
    'mgnrega beneficiaries', 'pmay beneficiaries', 'pradhan mantri awas yojana',
    'swachh bharat mission', 'toilet construction', 'ujjwala yojana',
    'lpg connections', 'ayushman bharat', 'health insurance coverage',
    'pradhan mantri jan dhan yojana', 'bank account holders',
    'digital india', 'skill development', 'startup schemes',
    'kisan credit cards', 'crop insurance', 'pension schemes',
    'widow pension', 'disability pension', 'old age pension',
    
    # Smart Island & NITI Aayog Development Project
    'smart island project', 'smart island initiative', 'digital island', 'smart city',
    'niti aayog proposal', 'niti aayog project', 'niti aayog vision document',
    'sustainable development of little andaman island', 'vision document',
    'little andaman development plan', 'little andaman project', 'mega project',
    'megacity plan', 'megacity project', 'smart island development',
    
    # Greenfield Coastal City Development
    'greenfield coastal city', 'new coastal city', 'planned city', 'modern city',
    'urban development', 'city planning', 'master plan', 'development zones',
    'free trade zone', 'ftz', 'special economic zone', 'sez', 'trade hub',
    'maritime hub', 'startup hub', 'financial district', 'business district',
    'commercial zone', 'industrial zone', 'residential zone', 'tourism zone',
    
    # Three Development Zones
    'zone 1', 'zone 2', 'zone 3', 'development zones', 'zoning plan',
    'financial district', 'medi metropolis', 'aerocity', 'hospital district',
    'leisure zone', 'movie metropolis', 'film city', 'entertainment district',
    'residential areas', 'housing development', 'township development',
    
    # Infrastructure Development Components
    'underwater resorts', 'underwater hotels', 'marine tourism', 'luxury resorts',
    'casinos', 'gaming', 'entertainment complex', 'golf courses', 'sports facilities',
    'convention centers', 'conference facilities', 'exhibition centers',
    'cruise terminals', 'marina development', 'yacht harbors', 'water sports facilities',
    'theme parks', 'amusement parks', 'recreational facilities',
    
    # Connectivity & Transportation
    'airport development', 'runway expansion', 'aviation infrastructure',
    'seaplane services', 'helicopter services', 'inter-island connectivity',
    'ferry services', 'high-speed connectivity', 'broadband infrastructure',
    'digital connectivity', 'submarine cables', 'satellite connectivity',
    'road network expansion', 'highway development', 'bridge construction',
    
    # Comparison with Global Cities
    'singapore model', 'hong kong model', 'compete with singapore', 'compete with hong kong',
    'international trade hub', 'global city', 'world-class infrastructure',
    'international standards', 'global competitiveness', 'trade activity',
    
    # Strategic Location & Geopolitics
    'strategic location', 'indian ocean region', 'ior', 'geopolitical importance',
    'maritime security', 'naval base', 'strategic assets', 'security concerns',
    'china containment', 'look east policy', 'act east policy', 'indo-pacific',
    
    # Project Status & Timeline
    'project status', 'project suspended', 'project cancelled', 'project timeline',
    'implementation plan', 'project phases', 'development stages', 'project funding',
    'investment requirements', 'budget allocation', 'cost estimates', 'financial viability',
    
    # Technology & Innovation
    'smart technology', 'iot implementation', 'artificial intelligence', 'digital governance',
    'e-governance', 'smart utilities', 'smart grid', 'renewable energy integration',
    'waste management technology', 'water management systems', 'traffic management',
    'smart mobility', 'electric vehicles', 'sustainable transport', 'green technology',
    
    # Environmental Concerns & Sustainability
    'environmental impact assessment', 'eia', 'environmental clearance',
    'forest clearance', 'coastal regulation zone clearance', 'crz clearance',
    'biodiversity impact', 'ecological impact', 'carbon footprint', 'green development',
    'sustainable tourism', 'eco-friendly development', 'environmental monitoring',
    'pollution control', 'waste management', 'sewage treatment', 'water treatment',
    
    # Tribal & Social Impact
    'onge tribe impact', 'tribal displacement', 'indigenous rights', 'tribal consultation',
    'rehabilitation', 'resettlement', 'social impact assessment', 'community participation',
    'stakeholder engagement', 'public consultation', 'consent process',
    'cultural preservation', 'traditional livelihood', 'tribal welfare',
    
    # Opposition & Concerns
    'conservationist concerns', 'environmental opposition', 'tribal rights activists',
    'project criticism', 'sustainability concerns', 'ecological concerns',
    'development vs conservation', 'protests', 'legal challenges', 'court cases',
    'ngo opposition', 'civil society concerns', 'expert opinions',
    
    # Economic Aspects
    'economic development', 'gdp contribution', 'employment generation', 'job creation',
    'tourism revenue', 'trade revenue', 'foreign investment', 'fdi',
    'public-private partnership', 'ppp model', 'investment opportunities',
    'economic growth', 'revenue generation', 'tax revenue', 'export earnings',
    
    # Comparative Statistics
    'statistics', 'data', 'figures', 'numbers', 'count', 'total number',
    'how many', 'quantity', 'availability', 'capacity', 'coverage',
    'percentage', 'ratio', 'rate', 'density', 'frequency', 'adequacy',
    'sufficiency', 'shortfall', 'surplus', 'deficit', 'utilization rate'
        ]

        # Cache settings
        self.CACHE_DIR = r"C:\Users\ACER\Documents\NIC_intern\Little Andaman\cache"
        self.CACHE_TTL = 3600  # Cache time-to-live in seconds
        self.ENABLE_CACHE = True
        self.MAX_MEMORY_CACHE = 1000  # Maximum items in memory cache

# Initialize configuration
config = GNIDPConfig()

print("✅ Configuration loaded successfully!")
print(f"📁 PDF Directory: {config.PDF_DIRECTORY}")
print(f"🤖 Ollama Model: {config.OLLAMA_MODEL}")
print(f"🧠 Embedding Model: {config.EMBEDDING_MODEL}")
print(f"📄 Chunk Size: {config.CHUNK_SIZE}")
print(f"🗃️ Vector Store: {config.VECTORSTORE_TYPE}")

# Verify PDF directory exists
if not os.path.exists(config.PDF_DIRECTORY):
    print(f"⚠️  WARNING: PDF directory '{config.PDF_DIRECTORY}' does not exist!")
    print(f"📝 Please create the directory and add your PDF files")
else:
    pdf_files = [f for f in os.listdir(config.PDF_DIRECTORY) if f.endswith('.pdf')]
    print(f"📚 Found {len(pdf_files)} PDF files in directory")

✅ Configuration loaded successfully!
📁 PDF Directory: C:\Users\ACER\Documents\NIC_intern\Little Andaman\D_set
🤖 Ollama Model: qwen3:0.6b
🧠 Embedding Model: sentence-transformers/all-MiniLM-L6-v2
📄 Chunk Size: 500
🗃️ Vector Store: faiss
📚 Found 6 PDF files in directory


In [51]:
# =====================================
# MODULE 3: EMBEDDING MODEL SETUP
# =====================================
# Initialize the embedding model for document vectorization

def setup_embeddings(config):
    """Initialize embedding model - optimized for speed"""
    logger.info("Loading embedding model...")
    start_time = time.time()
    
    try:
        embeddings = HuggingFaceEmbeddings(
            model_name=config.EMBEDDING_MODEL,
            model_kwargs={'device': 'cpu'},  # Use GPU if available
            encode_kwargs={
                'normalize_embeddings': True,
                'batch_size': 32  # Add batch processing
            }
        )
        
        # Test the embedding model
        test_text = "Great Nicobar Island Development Project"
        test_embedding = embeddings.embed_query(test_text)
        
        load_time = time.time() - start_time
        logger.info(f"✅ Quantized embeddings loaded in {load_time:.2f} seconds")
        logger.info(f"📐 Embedding dimension: {len(test_embedding)}")
        
        return embeddings
        
    except Exception as e:
        logger.error(f"❌ Error loading embeddings: {str(e)}")
        raise

# Initialize embeddings
print("🧠 Setting up embedding model...")
embeddings = setup_embeddings(config)
print("✅ Embedding model ready!")

2025-06-18 14:51:03,021 - INFO - Loading embedding model...
2025-06-18 14:51:03,032 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


🧠 Setting up embedding model...


2025-06-18 14:51:07,976 - INFO - ✅ Quantized embeddings loaded in 4.95 seconds
2025-06-18 14:51:07,979 - INFO - 📐 Embedding dimension: 384


✅ Embedding model ready!


In [52]:
# =====================================
# MODULE 4: DOCUMENT LOADING AND PROCESSING
# =====================================
# Load PDFs and create text chunks

from concurrent.futures import ThreadPoolExecutor

def load_and_process_documents(config, force_reload=False):
    """Load PDFs and create text chunks only if needed"""
    
    # Check if vector store exists - using simplified path checks
    faiss_path = os.path.join(config.VECTORSTORE_DIR, f"{config.VECTORSTORE_FILENAME}.faiss")
    pkl_path = os.path.join(config.VECTORSTORE_DIR, f"{config.VECTORSTORE_FILENAME}.pkl")
    
    # Skip document processing if vector store exists and no rebuild requested
    if os.path.exists(faiss_path) and os.path.exists(pkl_path) and not force_reload and not config.REBUILD_VECTORSTORE:
        logger.info("📝 Using existing vector store, skipping document processing...")
        return None
    
    logger.info("📖 Loading PDF documents...")
    start_time = time.time()
    
    try:
        # Check if directory exists and has PDFs
        if not os.path.exists(config.PDF_DIRECTORY):
            raise FileNotFoundError(f"PDF directory '{config.PDF_DIRECTORY}' not found!")
        
        pdf_files = [f for f in os.listdir(config.PDF_DIRECTORY) if f.endswith('.pdf')]
        if not pdf_files:
            raise FileNotFoundError(f"No PDF files found in '{config.PDF_DIRECTORY}'!")
        
        logger.info(f"📚 Found {len(pdf_files)} PDF files")
        
        # Load PDFs in parallel
        def process_pdf(pdf_path):
            loader = PyPDFLoader(pdf_path)
            return loader.load()
            
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_pdf, os.path.join(config.PDF_DIRECTORY, pdf)) 
                      for pdf in pdf_files]
            documents = [doc for future in futures for doc in future.result()]
            
        logger.info(f"📄 Loaded {len(documents)} document pages")
        
        # Split documents into chunks with optimized parameters
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", ". ", "? ", "! ", ": ", " ", ""],
            keep_separator=True,
            add_start_index=True,
            strip_whitespace=True
        )
        
        texts = text_splitter.split_documents(documents)
        
        # Display some statistics
        total_chars = sum(len(doc.page_content) for doc in texts)
        avg_chunk_size = total_chars // len(texts) if texts else 0
        
        load_time = time.time() - start_time
        logger.info(f"✂️  Created {len(texts)} text chunks")
        logger.info(f"📊 Average chunk size: {avg_chunk_size} characters")
        logger.info(f"⏱️  Document processing completed in {load_time:.2f} seconds")
        
        return texts
        
    except Exception as e:
        logger.error(f"❌ Error processing documents: {str(e)}")
        raise

# Process documents only if needed
print("📚 Checking document processing requirements...")
documents = load_and_process_documents(config)
if documents:
    print(f"✅ Successfully processed {len(documents)} text chunks!")
    # Display first chunk as sample
    print("\n📖 Sample chunk:")
    print("-" * 50)
    print(documents[0].page_content[:300] + "...")
    print("-" * 50)
else:
    print("✅ Using existing vector store, document processing skipped!")

2025-06-18 14:51:08,042 - INFO - 📝 Using existing vector store, skipping document processing...


📚 Checking document processing requirements...
✅ Using existing vector store, document processing skipped!


In [53]:
# =====================================
# MODULE 5: VECTOR STORE CREATION
# =====================================
# Create and setup the vector database

global vectorstore

def create_or_load_vectorstore(texts, embeddings, config):
    """Create or load vector store with persistence"""
    global vectorstore
    # Create directory if it doesn't exist
    os.makedirs(config.VECTORSTORE_DIR, exist_ok=True)
    
    logger.info(f"🗃️ {'Creating' if config.REBUILD_VECTORSTORE else 'Loading'} vector store...")
    start_time = time.time()
    
    faiss_path = os.path.join(config.VECTORSTORE_DIR, f"{config.VECTORSTORE_FILENAME}.faiss")
    pkl_path = os.path.join(config.VECTORSTORE_DIR, f"{config.VECTORSTORE_FILENAME}.pkl")
    
    # Try loading existing vector store first if not rebuilding
    if os.path.exists(faiss_path) and os.path.exists(pkl_path) and not config.REBUILD_VECTORSTORE:
        try:
            logger.info("🔄 Loading existing vector store...")
            vectorstore = FAISS.load_local(
                folder_path=config.VECTORSTORE_DIR,
                embeddings=embeddings,
                index_name=config.VECTORSTORE_FILENAME
            )
            
            load_time = time.time() - start_time
            logger.info(f"✅ Vector store loaded successfully in {load_time:.2f} seconds")
            return vectorstore
            
        except Exception as e:
            logger.error(f"❌ Error loading vector store: {str(e)}")
            logger.info("🔄 Falling back to creating new vector store")
            config.REBUILD_VECTORSTORE = True
    
    # Create new vector store if loading failed, rebuild requested, or files don't exist
    if texts and (config.REBUILD_VECTORSTORE or not (os.path.exists(faiss_path) and os.path.exists(pkl_path))):
        logger.info("🏗️ Creating new vector store...")
        vectorstore = FAISS.from_documents(
            texts, 
            embeddings,
            distance_strategy="COSINE"
        )
            
        # Save the vector store
        try:
            vectorstore.save_local(config.VECTORSTORE_DIR, config.VECTORSTORE_FILENAME)
            logger.info(f"💾 Vector store saved successfully")
        except Exception as e:
            logger.error(f"❌ Error saving vector store: {str(e)}")
            raise
        
        creation_time = time.time() - start_time
        logger.info(f"✅ Vector store creation completed in {creation_time:.2f} seconds")
        
        return vectorstore
    
    logger.error("❌ Vector store not found and no documents provided to create new one")
    raise FileNotFoundError("Vector store files not found and no documents available to create new one")

# Create or load vector store
print("🗃️ Creating/loading vector database...")

try:
    vectorstore = create_or_load_vectorstore(documents, embeddings, config)
    print("✅ Vector store ready for queries!")

    # Test the vector store
    print("\n🔍 Testing vector store with sample query...")
    test_query = "environmental impact of GNIDP"
    test_results = vectorstore.similarity_search(test_query, k=2)

    print(f"📊 Found {len(test_results)} relevant documents")
    if test_results:
        print("\n📄 Most relevant chunk:")
        print("-" * 50)
        print(test_results[0].page_content[:200] + "...")
        print("-" * 50)
except Exception as e:
    print(f"❌ Error with vector store: {str(e)}")
    print("🔧 Try setting config.REBUILD_VECTORSTORE = True to rebuild the vector store")

2025-06-18 14:51:08,073 - INFO - 🗃️ Loading vector store...
2025-06-18 14:51:08,076 - INFO - 🔄 Loading existing vector store...
2025-06-18 14:51:08,078 - ERROR - ❌ Error loading vector store: The de-serialization relies loading a pickle file. Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on the internet.).
2025-06-18 14:51:08,079 - INFO - 🔄 Falling back to creating new vector store
2025-06-18 14:51:08,081 - ERROR - ❌ Vector store not found and no documents provided to create new one


🗃️ Creating/loading vector database...
❌ Error with vector store: Vector store files not found and no documents available to create new one
🔧 Try setting config.REBUILD_VECTORSTORE = True to rebuild the vector store


In [54]:
# =====================================
# MODULE 6: LLM SETUP AND CONFIGURATION
# =====================================
# Initialize Ollama LLM and create custom prompt template

def clean_response(text: str) -> str:
    """Clean up model response by removing XML-like tags and extra whitespace"""
    import re
    
    # Remove XML-like tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Clean up extra whitespace
    text = text.strip()
    
    return text

def setup_llm(config):
    """Initialize Ollama LLM"""
    logger.info("🤖 Setting up Ollama LLM...")
    start_time = time.time()
    
    try:
        llm = Ollama(
            model=config.OLLAMA_MODEL,
            temperature=config.TEMPERATURE,
            num_ctx=config.NUM_CTX,
            num_predict=512,  # Limit response length
            top_k=config.TOP_K,
            top_p=config.TOP_P,
            repeat_penalty=1.1,  # Prevent repetitive responses
            format="json"  # Force structured output
        )

        # Test the LLM connection
        test_response = llm.invoke("Hello, are you working?")
        
        setup_time = time.time() - start_time
        logger.info(f"✅ LLM setup completed in {setup_time:.2f} seconds")
        logger.info(f"🎯 Model: {config.OLLAMA_MODEL} (GPU-accelerated)")
        logger.info(f"🌡️  Temperature: {config.TEMPERATURE}")
        
        return llm
        
    except Exception as e:
        logger.error(f"❌ Error setting up LLM: {str(e)}")
        logger.error("🔧 Make sure Ollama is running: ollama serve")
        raise

def create_prompt_template():
    """Create custom prompt template for GNIDP-focused responses"""
    
    template = """You are an expert assistant specialized in Little Andaman Island, its development projects, infrastructure, demographics, and all matters related to the Andaman & Nicobar Islands administration.

        Gather information only from the provided context and documents to give a proper structured answer to the queries. 
        Context from Knowledge Base: {context}

        User Question: {question}
        Answer:"""

    prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"]
    )
    
    logger.info("📝 Custom prompt template created")
    return prompt

# Setup LLM
print("🤖 Initializing Ollama LLM...")
llm = setup_llm(config)

# Create prompt template
print("📝 Creating custom prompt template...")
prompt_template = create_prompt_template()

print("✅ LLM and prompt template ready!")



2025-06-18 14:51:08,104 - INFO - 🤖 Setting up Ollama LLM...


🤖 Initializing Ollama LLM...


2025-06-18 14:51:17,486 - INFO - ✅ LLM setup completed in 9.38 seconds
2025-06-18 14:51:17,487 - INFO - 🎯 Model: qwen3:0.6b (GPU-accelerated)
2025-06-18 14:51:17,487 - INFO - 🌡️  Temperature: 0.2
2025-06-18 14:51:17,488 - INFO - 📝 Custom prompt template created


📝 Creating custom prompt template...
✅ LLM and prompt template ready!


In [55]:
# =====================================
# MODULE 7: QA CHAIN CREATION
# =====================================
# Create the Retrieval QA chain that combines everything

def create_qa_chain(llm, vectorstore, prompt_template, config):
    """Create the QA chain with custom prompt"""
    logger.info("🔗 Creating Retrieval QA chain...")
    start_time = time.time()
    
    try:
        # Create retrieval QA chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs={"k": config.RETRIEVAL_K}
            ),
            chain_type_kwargs={"prompt": prompt_template},
            return_source_documents=True
        )
        
        setup_time = time.time() - start_time
        logger.info(f"✅ QA chain created successfully in {setup_time:.2f} seconds")
        logger.info(f"🔍 Retrieval documents: {config.RETRIEVAL_K}")
        
        return qa_chain
        
    except Exception as e:
        logger.error(f"❌ Error creating QA chain: {str(e)}")
        raise

def is_gnidp_related(question, keywords):
    """Check if question is related to GNIDP topics"""
    question_lower = question.lower()
    return any(keyword in question_lower for keyword in keywords)

# Create QA Chain
print("🔗 Creating Retrieval QA chain...")
qa_chain = create_qa_chain(llm, vectorstore, prompt_template, config)
print("✅ QA chain is ready!")
# =====================================

2025-06-18 14:51:17,505 - INFO - 🔗 Creating Retrieval QA chain...
2025-06-18 14:51:17,506 - INFO - ✅ QA chain created successfully in 0.00 seconds
2025-06-18 14:51:17,507 - INFO - 🔍 Retrieval documents: 1


🔗 Creating Retrieval QA chain...
✅ QA chain is ready!


In [56]:
# MODULE 8: QUERY INTERFACE AND TESTING
# =====================================
# Interactive query system and comprehensive testing

import json
import os
import time
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path

class GNIDPQuerySystem:
    """Complete query system for GNIDP RAG chatbot"""
    
    def __init__(self, qa_chain, config):
        self.qa_chain = qa_chain
        self.config = config
        self.query_count = 0
        self.total_response_time = 0
        self.cache = QueryCache(
            cache_dir=config.CACHE_DIR,
            ttl=config.CACHE_TTL
        ) if config.ENABLE_CACHE else None
        self.cache_hits = 0
        self.cache_misses = 0

    def query(self, question: str, format_output: bool = True) -> Dict[str, Any]:
        """Process a query and return comprehensive response"""
        start_time = time.time()
        self.query_count += 1
        
        if format_output:
            print(f"\n{'='*60}")
            print(f"🔍 QUERY #{self.query_count}: {question}")
            print(f"{'='*60}")
        
        # Try cache first if enabled
        if self.config.ENABLE_CACHE:
            cached_result = self.cache.get(question)
            if cached_result:
                self.cache_hits += 1
                response_time = time.time() - start_time
                if format_output:
                    print(f"🚀 Cache hit! Response time: {response_time:.2f}s")
                return cached_result

        # Cache miss - process query normally
        if self.config.ENABLE_CACHE:
            self.cache_misses += 1
        
        # Pre-filter for GNIDP relevance
        if not is_gnidp_related(question, self.config.GNIDP_KEYWORDS):
            response_time = time.time() - start_time
            result = {
                "answer": "I can only answer questions related to Little Andaman only. Please ask a question about these subjects.",
                "response_time": response_time,
                "relevant": False,
                "query_number": self.query_count,
                "cached": False
            }
            
            if format_output:
                print(f"❌ Not GNIDP-related")
                print(f"🤖 Response: {result['answer']}")
                print(f"⏱️  Response time: {response_time:.2f}s")

            # Cache the result if enabled
            if self.config.ENABLE_CACHE:
                self.cache.set(question, result)
            
            return result
        
        try:
            # Get response from QA chain
            if format_output:
                print(f"🔍 Searching vector database...")
            qa_result = self.qa_chain.invoke({"query": question})
            
            response_time = time.time() - start_time
            self.total_response_time += response_time
            
            # Clean the response
            cleaned_answer = clean_response(qa_result["result"])
            
            result = {
                "answer": cleaned_answer,
                "response_time": response_time,
                "relevant": True,
                "query_number": self.query_count,
                "cached": False
            }
            
            # Cache the result if enabled
            if self.config.ENABLE_CACHE:
                self.cache.set(question, result)
            
            if format_output:
                # Display results
                print(f"✅ GNIDP-related query processed")
                print(f"\n🤖 ANSWER:")
                print("-" * 50)
                print(result["answer"])
                print("-" * 50)
                
                print(f"⏱️  Response time: {response_time:.2f}s")
                print(f"📊 Average response time: {self.total_response_time/self.query_count:.2f}s")
                
                if self.config.ENABLE_CACHE:
                    total_queries = self.cache_hits + self.cache_misses
                    hit_rate = (self.cache_hits / total_queries * 100) if total_queries > 0 else 0
                    print(f"💾 Cache Stats - Hit Rate: {hit_rate:.1f}% ({self.cache_hits}/{total_queries})")
            
            return result
            
        except Exception as e:
            response_time = time.time() - start_time
            error_result = {
                "answer": f"I encountered an error processing your question: {str(e)}",
                "response_time": response_time,
                "relevant": True,
                "error": str(e),
                "query_number": self.query_count,
                "cached": False
            }
            
            if format_output:
                print(f"❌ ERROR: {str(e)}")
                print(f"⏱️  Response time: {response_time:.2f}s")
            return error_result
    
    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache performance statistics"""
        if not self.config.ENABLE_CACHE:
            return {"cache_enabled": False}
            
        total_queries = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total_queries * 100) if total_queries > 0 else 0
        
        return {
            "cache_enabled": True,
            "cache_hits": self.cache_hits,
            "cache_misses": self.cache_misses,
            "hit_rate": f"{hit_rate:.2f}%",
            **self.cache.get_stats()
        }
        
    def batch_query(self, questions: List[str]) -> List[Dict[str, Any]]:
        """Process multiple queries efficiently"""
        results = []
        print(f"\n🚀 BATCH PROCESSING {len(questions)} QUERIES")
        print("="*70)
        
        for question in questions:
            result = self.query(question)
            results.append(result)
        
        # Summary statistics
        successful_queries = [r for r in results if 'error' not in r]
        avg_time = sum(r['response_time'] for r in results) / len(results)
        
        print(f"\n📊 BATCH SUMMARY:")
        print(f"Total queries: {len(questions)}")
        print(f"Successful: {len(successful_queries)}")
        print(f"Average response time: {avg_time:.2f}s")
        
        return results

In [71]:
# =====================================
# MODULE 8.5: CHAT HISTORY FUNCTIONALITY
# =====================================

@dataclass
class ChatMessage:
    """Represents a single message in the chat history"""
    role: str  # 'user' or 'assistant'
    content: str
    timestamp: datetime
    metadata: Dict[str, Any] = None
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization"""
        return {
            'role': self.role,
            'content': self.content,
            'timestamp': self.timestamp.isoformat(),
            'metadata': self.metadata or {}
        }
    
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ChatMessage':
        """Create from dictionary"""
        return cls(
            role=data['role'],
            content=data['content'],
            timestamp=datetime.fromisoformat(data['timestamp']),
            metadata=data.get('metadata', {})
        )

class ChatHistoryManager:
    """Manages chat history with context awareness for GNIDP RAG system"""
    
    def __init__(self, config, max_history: int = 50, context_window: int = 5):
        self.config = config
        self.max_history = max_history
        self.context_window = context_window  # Number of recent messages to include in context
        self.chat_history: List[ChatMessage] = []
        self.session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.history_file = os.path.join(config.CACHE_DIR, f"chat_history_{self.session_id}.json")
        
        # Create history directory if it doesn't exist
        os.makedirs(os.path.dirname(self.history_file), exist_ok=True)
        
        # Load existing history if available
        self._load_history()
    
    def add_message(self, role: str, content: str, metadata: Dict[str, Any] = None) -> None:
        """Add a new message to chat history"""
        message = ChatMessage(
            role=role,
            content=content,
            timestamp=datetime.now(),
            metadata=metadata or {}
        )
        
        self.chat_history.append(message)
        
        # Maintain max history limit
        if len(self.chat_history) > self.max_history:
            self.chat_history = self.chat_history[-self.max_history:]
        
        # Auto-save after each message
        self._save_history()
    
    def get_context_for_query(self, current_query: str) -> str:
        """Get relevant context from chat history for the current query"""
        if not self.chat_history:
            return ""
        
        # Get recent messages within context window
        recent_messages = self.chat_history[-self.context_window:]
        
        # Build context string
        context_parts = []
        for msg in recent_messages:
            if msg.role == 'user':
                context_parts.append(f"Previous Question: {msg.content}")
            elif msg.role == 'assistant':
                # Truncate long responses for context
                content = msg.content[:300] + "..." if len(msg.content) > 300 else msg.content
                context_parts.append(f"Previous Answer: {content}")
        
        if context_parts:
            context = "Recent conversation context:\n" + "\n".join(context_parts) + "\n\n"
            return context
        
        return ""
    
    def get_conversation_summary(self) -> Dict[str, Any]:
        """Get summary of current conversation"""
        if not self.chat_history:
            return {"total_messages": 0, "session_id": self.session_id}
        
        user_messages = [msg for msg in self.chat_history if msg.role == 'user']
        assistant_messages = [msg for msg in self.chat_history if msg.role == 'assistant']
        
        return {
            "session_id": self.session_id,
            "total_messages": len(self.chat_history),
            "user_messages": len(user_messages),
            "assistant_messages": len(assistant_messages),
            "start_time": self.chat_history[0].timestamp.isoformat() if self.chat_history else None,
            "last_activity": self.chat_history[-1].timestamp.isoformat() if self.chat_history else None,
            "conversation_duration": self._get_conversation_duration()
        }
    
    def _get_conversation_duration(self) -> str:
        """Calculate conversation duration"""
        if len(self.chat_history) < 2:
            return "0 minutes"
        
        start_time = self.chat_history[0].timestamp
        end_time = self.chat_history[-1].timestamp
        duration = end_time - start_time
        
        total_minutes = int(duration.total_seconds() / 60)
        hours = total_minutes // 60
        minutes = total_minutes % 60
        
        if hours > 0:
            return f"{hours}h {minutes}m"
        else:
            return f"{minutes}m"
    
    def display_history(self, limit: int = 10) -> None:
        """Display recent chat history"""
        if not self.chat_history:
            print("📝 No chat history available")
            return
        
        print(f"\n💬 CHAT HISTORY (Last {min(limit, len(self.chat_history))} messages)")
        print("=" * 60)
        
        recent_messages = self.chat_history[-limit:]
        
        for i, msg in enumerate(recent_messages, 1):
            role_emoji = "🧑" if msg.role == 'user' else "🤖"
            timestamp = msg.timestamp.strftime("%H:%M:%S")
            
            print(f"\n{role_emoji} {msg.role.upper()} [{timestamp}]:")
            print("-" * 40)
            
            # Truncate long messages for display
            content = msg.content[:500] + "..." if len(msg.content) > 500 else msg.content
            print(content)
            
            if msg.metadata:
                print(f"📊 Metadata: {msg.metadata}")
    
    def search_history(self, query: str, limit: int = 5) -> List[ChatMessage]:
        """Search chat history for relevant messages"""
        query_lower = query.lower()
        relevant_messages = []
        
        for msg in self.chat_history:
            if query_lower in msg.content.lower():
                relevant_messages.append(msg)
        
        return relevant_messages[-limit:]  # Return most recent matches
    
    def clear_history(self) -> None:
        """Clear all chat history"""
        self.chat_history.clear()
        self._save_history()
        print("🗑️ Chat history cleared")
    
    def export_history(self, filename: str = None) -> str:
        """Export chat history to a file"""
        if filename is None:
            filename = f"gnidp_chat_export_{self.session_id}.json"
        
        export_data = {
            "session_info": self.get_conversation_summary(),
            "messages": [msg.to_dict() for msg in self.chat_history]
        }
        
        filepath = os.path.join(self.config.CACHE_DIR, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)
        
        print(f"📄 Chat history exported to: {filepath}")
        return filepath
    
    def _save_history(self) -> None:
        """Save chat history to file"""
        try:
            export_data = {
                "session_info": self.get_conversation_summary(),
                "messages": [msg.to_dict() for msg in self.chat_history]
            }
            
            with open(self.history_file, 'w', encoding='utf-8') as f:
                json.dump(export_data, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"Warning: Failed to save chat history: {e}")
    
    def _load_history(self) -> None:
        """Load chat history from file"""
        try:
            if os.path.exists(self.history_file):
                with open(self.history_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                self.chat_history = [
                    ChatMessage.from_dict(msg_data) 
                    for msg_data in data.get('messages', [])
                ]
                
                print(f"Loaded {len(self.chat_history)} messages from history")
        except Exception as e:
            print(f"Warning: Failed to load chat history: {e}")
            self.chat_history = []

# Enhanced Query System with Chat History
class GNIDPQuerySystemWithHistory(GNIDPQuerySystem):
    """Enhanced query system with chat history support"""
    
    def __init__(self, qa_chain, config):
        super().__init__(qa_chain, config)
        self.chat_manager = ChatHistoryManager(config)
        
        # Enhanced prompt template with history context
        self.history_aware_template = """You are an expert assistant specialized in Little Andaman Island, its development projects, infrastructure, demographics, and all matters related to the Andaman & Nicobar Islands administration.

{chat_context}

Context from Knowledge Base: {context}

Current User Question: {question}

Instructions:
- Use the recent conversation context to better understand the current question
- If the question refers to previous topics, incorporate that context in your response
- Provide detailed, accurate information based on the knowledge base
- If the question is a follow-up or clarification, acknowledge the connection to previous discussion

Answer:"""
        
        # Create PromptTemplate (assuming it's imported from langchain)
        from langchain.prompts import PromptTemplate
        self.history_prompt = PromptTemplate(
            template=self.history_aware_template,
            input_variables=["chat_context", "context", "question"]
        )
    
    def query_with_history(self, question: str, format_output: bool = True) -> Dict[str, Any]:
        """Process query with chat history context"""
        start_time = time.time()
        self.query_count += 1
        
        # Add user message to history
        self.chat_manager.add_message("user", question)
        
        if format_output:
            print(f"\n{'='*60}")
            print(f"🔍 QUERY #{self.query_count}: {question}")
            print(f"{'='*60}")
        
        # Check cache first (if enabled)
        if self.config.ENABLE_CACHE:
            # Create cache key that includes recent context for better caching
            recent_context = self.chat_manager.get_context_for_query(question)
            cache_key = f"{recent_context}||{question}"
            cached_result = self.cache.get(cache_key)
            
            if cached_result:
                self.cache_hits += 1
                response_time = time.time() - start_time
                
                # Add cached response to history
                self.chat_manager.add_message("assistant", cached_result["answer"], {
                    "cached": True,
                    "response_time": response_time
                })
                
                if format_output:
                    print(f"🚀 Cache hit! Response time: {response_time:.2f}s")
                return cached_result
        
        # Cache miss - process with history context
        if self.config.ENABLE_CACHE:
            self.cache_misses += 1
        
        # Pre-filter for GNIDP relevance
        if not is_gnidp_related(question, self.config.GNIDP_KEYWORDS):
            response_time = time.time() - start_time
            answer = "I can only answer questions related to Little Andaman Island and GNIDP. Please ask a question about these subjects."
            
            result = {
                "answer": answer,
                "response_time": response_time,
                "relevant": False,
                "query_number": self.query_count,
                "cached": False
            }
            
            # Add to history
            self.chat_manager.add_message("assistant", answer, {
                "relevant": False,
                "response_time": response_time
            })
            
            if format_output:
                print(f"❌ Not GNIDP-related")
                print(f"🤖 Response: {answer}")
                print(f"⏱️ Response time: {response_time:.2f}s")
            
            return result
        
        try:
            # Get chat context
            chat_context = self.chat_manager.get_context_for_query(question)
            
            if format_output:
                print(f"🔍 Searching vector database with history context...")
            
            # Get relevant documents
            docs = self.qa_chain.retriever.get_relevant_documents(question)
            context = "\n".join([doc.page_content for doc in docs])
            
            # Create history-aware prompt
            formatted_prompt = self.history_prompt.format(
                chat_context=chat_context,
                context=context,
                question=question
            )
            
            # Get response from LLM
            response = self.qa_chain.llm.invoke(formatted_prompt)
            
            response_time = time.time() - start_time
            self.total_response_time += response_time
            
            # Clean the response
            cleaned_answer = clean_response(response)
            
            result = {
                "answer": cleaned_answer,
                "response_time": response_time,
                "relevant": True,
                "query_number": self.query_count,
                "cached": False,
                "context_used": bool(chat_context)
            }
            
            # Add to history
            self.chat_manager.add_message("assistant", cleaned_answer, {
                "relevant": True,
                "response_time": response_time,
                "context_used": bool(chat_context)
            })
            
            # Cache the result
            if self.config.ENABLE_CACHE:
                cache_key = f"{chat_context}||{question}"
                self.cache.set(cache_key, result)
            
            if format_output:
                print(f"✅ GNIDP-related query processed")
                if chat_context:
                    print(f"📚 Used conversation context from {self.chat_manager.context_window} recent messages")
                
                print(f"\n🤖 ANSWER:")
                print("-" * 50)
                print(result["answer"])
                print("-" * 50)
                
                print(f"⏱️ Response time: {response_time:.2f}s")
                print(f"📊 Average response time: {self.total_response_time/self.query_count:.2f}s")
                
                # Show conversation summary
                summary = self.chat_manager.get_conversation_summary()
                print(f"💬 Conversation: {summary['total_messages']} messages, {summary['conversation_duration']}")
            
            return result
            
        except Exception as e:
            response_time = time.time() - start_time
            error_msg = f"I encountered an error processing your question: {str(e)}"
            
            error_result = {
                "answer": error_msg,
                "response_time": response_time,
                "relevant": True,
                "error": str(e),
                "query_number": self.query_count,
                "cached": False
            }
            
            # Add error to history
            self.chat_manager.add_message("assistant", error_msg, {
                "error": True,
                "response_time": response_time
            })
            
            if format_output:
                print(f"❌ ERROR: {str(e)}")
                print(f"⏱️ Response time: {response_time:.2f}s")
            
            return error_result
    
    def show_conversation_stats(self):
        """Display conversation statistics"""
        summary = self.chat_manager.get_conversation_summary()
        
        print(f"\n📊 CONVERSATION STATISTICS")
        print("=" * 40)
        print(f"Session ID: {summary['session_id']}")
        print(f"Total Messages: {summary['total_messages']}")
        print(f"User Questions: {summary['user_messages']}")
        print(f"Assistant Responses: {summary['assistant_messages']}")
        print(f"Duration: {summary['conversation_duration']}")
        print(f"Average Response Time: {self.total_response_time/self.query_count:.2f}s" if self.query_count > 0 else "No queries yet")
        
        if self.config.ENABLE_CACHE:
            cache_stats = self.get_cache_stats()
            print(f"Cache Hit Rate: {cache_stats['hit_rate']}")

# Initialize both systems
print("🚀 Initializing GNIDP Query Systems...")
query_system = GNIDPQuerySystem(qa_chain, config)
print("✅ Basic query system ready!")

print("🔄 Upgrading Query System with Chat History...")
history_query_system = GNIDPQuerySystemWithHistory(qa_chain, config)
print("✅ Enhanced Query System with Chat History ready!")

print(f"\n🎯 TESTING COMPLETE!")
print(f"✅ System is fully operational and ready for use!")

🚀 Initializing GNIDP Query Systems...
✅ Basic query system ready!
🔄 Upgrading Query System with Chat History...
✅ Enhanced Query System with Chat History ready!

🎯 TESTING COMPLETE!
✅ System is fully operational and ready for use!


In [72]:
# =====================================
# MODULE 9: UTILITY FUNCTIONS
# ====================================

# Original quick query function (no history)
def quick_query(question: str):
    """Quick query function with simplified display formatting"""
    # Format_output=False to prevent duplicate processing output
    result = query_system.query(question, format_output=False)
    
    try:
        answer = result["answer"]
        if answer.startswith('{"') and answer.endswith('}'):
            import json
            parsed = json.loads(answer)
            answer = parsed.get("answer", answer)
    except:
        answer = result["answer"]
    
    # Single clean output
    print(f"\n{'='*60}")
    print(f"Question: {question}")
    print(f"{'='*60}")
    print("\nAnswer:")
    print("-"*50)
    print(answer)
    print("-"*50)
    
    return None

# Enhanced utility functions with history support
def quick_query_with_history(question: str):
    """Quick query function with chat history support"""
    result = history_query_system.query_with_history(question, format_output=False)
    
    try:
        answer = result["answer"]
        if answer.startswith('{"') and answer.endswith('}'):
            import json
            parsed = json.loads(answer)
            answer = parsed.get("answer", answer)
    except:
        answer = result["answer"]
    
    print(f"\n{'='*60}")
    print(f"Question: {question}")
    print(f"{'='*60}")
    print("\nAnswer:")
    print("-"*50)
    print(answer)
    print("-"*50)
    
    if result.get("context_used"):
        print("📚 Used previous conversation context")
    
    return None

def show_chat_history(limit: int = 10):
    """Show recent chat history"""
    history_query_system.chat_manager.display_history(limit)

def search_chat_history(query: str):
    """Search chat history for specific content"""
    results = history_query_system.chat_manager.search_history(query)
    
    if not results:
        print(f"🔍 No messages found containing '{query}'")
        return
    
    print(f"\n🔍 SEARCH RESULTS for '{query}' ({len(results)} matches)")
    print("=" * 60)
    
    for msg in results:
        role_emoji = "🧑" if msg.role == 'user' else "🤖"
        timestamp = msg.timestamp.strftime("%Y-%m-%d %H:%M:%S")
        
        print(f"\n{role_emoji} {msg.role.upper()} [{timestamp}]:")
        content = msg.content[:300] + "..." if len(msg.content) > 300 else msg.content
        print(content)
        print("-" * 30)

def export_chat_history(filename: str = None):
    """Export chat history to file"""
    return history_query_system.chat_manager.export_history(filename)

def clear_chat_history():
    """Clear all chat history"""
    history_query_system.chat_manager.clear_history()

def conversation_stats():
    """Show conversation statistics"""
    history_query_system.show_conversation_stats()
    
# Utility functions
def system_status():
    """Display current system status"""
    print("\n🔍 SYSTEM STATUS CHECK")
    print("="*40)
    
    # Check components
    components = {
        "📄 Documents": len(documents) if 'documents' in globals() and documents is not None else "Using existing vector store",
        "🧠 Embeddings": "✅ Loaded" if 'embeddings' in globals() else "❌ Not loaded",
        "🗃️  Vector Store": "✅ Ready" if 'vectorstore' in globals() else "❌ Not ready",
        "🤖 LLM": "✅ Connected" if 'llm' in globals() else "❌ Not connected",
        "🔗 QA Chain": "✅ Ready" if 'qa_chain' in globals() else "❌ Not ready"
    }
    
    for component, status in components.items():
        print(f"{component}: {status}")
    
    print(f"\n⚙️  Configuration:")
    print(f"   Model: {config.OLLAMA_MODEL}")
    print(f"   Vector Store: {config.VECTORSTORE_TYPE}")
    print(f"   Chunk Size: {config.CHUNK_SIZE}")
    print(f"   Retrieval K: {config.RETRIEVAL_K}")
    
    if query_system.query_count > 0:
        print(f"\n📊 Performance:")
        print(f"   Queries processed: {query_system.query_count}")
        print(f"   Average response time: {query_system.total_response_time/query_system.query_count:.2f}s")

def save_conversation(conversation_log: List[Dict], filename: str = None):
    """Save conversation history to file"""
    if filename is None:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"gnidp_conversation_{timestamp}.txt"
    
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("GNIDP RAG Chatbot Conversation Log\n")
        f.write("="*50 + "\n\n")
        
        for i, entry in enumerate(conversation_log, 1):
            f.write(f"Query {i}: {entry['question']}\n")
            f.write(f"Response: {entry['answer']}\n")
            f.write(f"Response Time: {entry['response_time']:.2f}s\n")
            f.write(f"Relevant: {entry['relevant']}\n")
            f.write("-" * 30 + "\n\n")
    
    print(f"💾 Conversation saved to {filename}")

# Display available functions
print("\n🛠️  AVAILABLE FUNCTIONS:")
print("="*50)
print("❓ quick_query(question)             - Ask without history")
print("❓ quick_query_with_history(question) - Ask with context")
print("💬 show_chat_history(limit=10)       - View recent messages")
print("🔍 search_chat_history(query)        - Search conversation")
print("📊 conversation_stats()              - Show chat statistics")
print("📄 export_chat_history(filename)     - Export conversation")
print("🗑️ clear_chat_history()              - Clear all history")
print("📊 system_status()                   - Check system status")
print("💾 save_conversation()               - Save chat history")

print("\n🎯 TESTING CHAT HISTORY FUNCTIONALITY:")
print("="*50)

# Test conversation with context
quick_query_with_history("What is the GNIDP project about?")
quick_query_with_history("What are the main environmental concerns with this project?")
quick_query_with_history("How will these concerns be addressed?")

# Show conversation stats
conversation_stats()



🛠️  AVAILABLE FUNCTIONS:
❓ quick_query(question)             - Ask without history
❓ quick_query_with_history(question) - Ask with context
💬 show_chat_history(limit=10)       - View recent messages
🔍 search_chat_history(query)        - Search conversation
📊 conversation_stats()              - Show chat statistics
📄 export_chat_history(filename)     - Export conversation
🗑️ clear_chat_history()              - Clear all history
📊 system_status()                   - Check system status
💾 save_conversation()               - Save chat history

🎯 TESTING CHAT HISTORY FUNCTIONALITY:

Question: What is the GNIDP project about?

Answer:
--------------------------------------------------
I can only answer questions related to Little Andaman Island and GNIDP. Please ask a question about these subjects.
--------------------------------------------------

Question: What are the main environmental concerns with this project?

Answer:
--------------------------------------------------
I can only ans

In [60]:
quick_query("Total Revenue land in little andaman")  # Another example query


Question: Total Revenue land in little andaman

Answer:
--------------------------------------------------
836.8144 hectares
--------------------------------------------------


In [61]:
quick_query("How many buses are there in little andaman?")  # Another example query


Question: How many buses are there in little andaman?

Answer:
--------------------------------------------------
15
--------------------------------------------------


In [62]:
quick_query("What is the vision of little andaman development project?")  # Another example query


Question: What is the vision of little andaman development project?

Answer:
--------------------------------------------------
{ "answer": "The vision of the Little Andaman Development Project is to provide full IT support and implement e-Gov Services in the A&N Islands, aiming to overcome digital divide with the rest of India." }
--------------------------------------------------


In [63]:
query_system.cache.clear()  # Clear cache if needed

In [64]:
quick_query("How much of the land in little andaman is reserved forest?")  # Another example query


Question: How much of the land in little andaman is reserved forest?

Answer:
--------------------------------------------------
The total area of the land in Little Andaman includes reserved forests, protected forests, and unclassified forests. The reserved forest area is 706.49 square kilometers.
--------------------------------------------------


In [65]:
quick_query_with_history("Little andaman total land?")  # Another example query


Question: Little andaman total land?

Answer:
--------------------------------------------------
I encountered an error processing your question: 'RetrievalQA' object has no attribute 'llm'
--------------------------------------------------


In [66]:
quick_query_with_history("Are there schools in little andaman? Id so, how many?")  # Another example query


Question: Are there schools in little andaman? Id so, how many?

Answer:
--------------------------------------------------
I encountered an error processing your question: 'RetrievalQA' object has no attribute 'llm'
--------------------------------------------------


In [67]:
quick_query_with_history("Distance form little andaman")  # Another example query


Question: Distance form little andaman

Answer:
--------------------------------------------------
I encountered an error processing your question: 'RetrievalQA' object has no attribute 'llm'
--------------------------------------------------


In [68]:
quick_query_with_history(" sea distance between port blair & little andaman")


Question:  sea distance between port blair & little andaman

Answer:
--------------------------------------------------
I encountered an error processing your question: 'RetrievalQA' object has no attribute 'llm'
--------------------------------------------------


In [69]:
quick_query_with_history("How many primary schools in little andaman?")


Question: How many primary schools in little andaman?

Answer:
--------------------------------------------------
I encountered an error processing your question: 'RetrievalQA' object has no attribute 'llm'
--------------------------------------------------
