In [2]:
import nest_asyncio
nest_asyncio.apply()

from playwright.async_api import async_playwright
import asyncio
from bs4 import BeautifulSoup
import json

In [3]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
nest_asyncio.apply()

BASE_URL = "https://www.drdo.gov.in/drdo/"

async def get_internal_links():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()

        await page.goto(BASE_URL)
        await page.wait_for_selector("body")
        html = await page.content()
        await browser.close()

        soup = BeautifulSoup(html, "html.parser")
        all_links = [urljoin(BASE_URL, a['href']) for a in soup.find_all("a", href=True)]
        
        # Keep only internal DRDO links
        internal_links = set()
        for link in all_links:
            if "drdo.gov.in" in urlparse(link).netloc:
                internal_links.add(link.split("#")[0])  # Remove anchor fragments

        return list(internal_links)


In [4]:
async def scrape_page(url, browser):
    page = await browser.new_page()
    try:
        await page.goto(url, timeout=60000)
        await page.wait_for_selector("body", timeout=10000)
        content = await page.content()
        soup = BeautifulSoup(content, "html.parser")

        page_data = {
            "url": url,
            "title": soup.title.string if soup.title else "No title",
            "headings": {
                "h1": [h.get_text(strip=True) for h in soup.find_all("h1")],
                "h2": [h.get_text(strip=True) for h in soup.find_all("h2")],
                "h3": [h.get_text(strip=True) for h in soup.find_all("h3")]
            },
            "links": [
                {"text": a.get_text(strip=True), "href": a['href']}
                for a in soup.find_all("a", href=True)
            ]
        }

        await page.close()
        return page_data

    except Exception as e:
        await page.close()
        print(f"Failed to scrape {url}: {e}")
        return {"url": url, "error": str(e)}


In [6]:
import json

# Load the scraped data
with open("drdo_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Total records: {len(data)}")
print(data[0])  # View one record

Total records: 96
{'url': 'https://www.drdo.gov.in/drdo/who-is-who', 'title': 'Who is Who | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India', 'content': "Feedback\nSitemap\nFAQs\nUser account menu\nLogin\nSkip to Main Content\nScreen Reader Access\nEnglish\nहिंदी\n\nHome\nDRDO\nOrganisation\nOutreach\nCareers\nPublications\nRTI\nContact Us\n\nWho is Who\nHome  DRDO  Who Is Who\n\nShri Rajnath Singh\n\nHon'ble Raksha Mantri\n\nView Profile\n\nShri Sanjay Seth\n\nHon'ble Raksha Rajya Mantri\n\nView Profile\n\nDr Samir V Kamat\n\nSecretary, Department of Defence R&D and Chairman, DRDO\n\nView Profile\nDirectors General (Technical)\nDirectors General (Corporate)\nAdditional Financial Adviser\nNodal Officer of DRDO\n\nDr B K Das\n\nDistinguished Scientist & Director General - Electronics & Communication Systems (ECS)\n\nView Profile\n\nShri Ummalaneni Raja Babu\n\nDistinguished Scientist & Director General - Missiles and Strategic Systems (MSS)

In [7]:
import re

def clean_text(text):
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    # Remove HTML tags (if any remain)
    text = re.sub(r"<.*?>", "", text)
    # Remove unwanted characters
    text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII
    return text.strip()

# Apply cleaning to all entries
for item in data:
    item["cleaned_content"] = clean_text(item["content"])

In [8]:
with open("drdo_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ Cleaned data saved to drdo_cleaned.json")

✅ Cleaned data saved to drdo_cleaned.json


clean data Done!

Now we will start embedding (vectorization)

In [9]:
import json

# Load the cleaned DRDO data
with open("drdo_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract texts
texts = [item["cleaned_content"] for item in data]


In [10]:
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings
embeddings = model.encode(texts, show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 3/3 [00:04<00:00,  1.64s/it]


In [11]:
import faiss
import numpy as np

embedding_array = np.array(embeddings).astype("float32")

index = faiss.IndexFlatL2(embedding_array.shape[1])
index.add(embedding_array)

print("Total vectors indexed:", index.ntotal)

Total vectors indexed: 96


In [12]:
faiss.write_index(index, "drdo_faiss.index")

metadata = [{"title": d["title"], "url": d["url"]} for d in data]
with open("drdo_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)


In [13]:
# Load saved data (if starting fresh)
# index = faiss.read_index("drdo_faiss.index")
# with open("drdo_metadata.json", "r", encoding="utf-8") as f:
#     metadata = json.load(f)

def search_faiss(query, k=5):
    query_vec = model.encode([query]).astype("float32")
    distances, indices = index.search(query_vec, k)
    
    for idx in indices[0]:
        print(f"🔎 Title: {metadata[idx]['title']}")
        print(f"🌐 URL: {metadata[idx]['url']}\n")

# Try it
search_faiss("missile research")


🔎 Title: Missiles and Strategic Systems | DRDO
🌐 URL: https://www.drdo.gov.in/drdo/technical-clusters/missiles-and-strategic-systems

🔎 Title: Press Release | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India
🌐 URL: https://www.drdo.gov.in/drdo/press-release

🔎 Title: Akash | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India
🌐 URL: https://www.drdo.gov.in/drdo/akash

🔎 Title: BrahMos | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India
🌐 URL: https://www.drdo.gov.in/drdo/brahmos-0

🔎 Title: DRDO & Indian Navy successfully flight-test indigenously-developed Vertically-Launched Short-Range Surface-to-Air Missile | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India
🌐 URL: https://www.drdo.gov.in/drdo/press-release/drdo-indian-navy-successfully-flight-test-indigenously-developed-vertically-launched



In [14]:
import numpy as np
import gc
import torch
import pickle
import json
from sentence_transformers import SentenceTransformer

# Free up memory (useful if running multiple times)
gc.collect()
torch.cuda.empty_cache()

In [19]:

import numpy as np
import gc
import torch
import pickle
import json
from sentence_transformers import SentenceTransformer

# 🔄 Clear memory
gc.collect()
torch.cuda.empty_cache()

# ✅ Step 1: Load JSON file
json_file_path = "C:\\Users\\hp123\\OneDrive\\Desktop\\DRDO1\\drdo_cleaned.json"

with open(json_file_path, "r", encoding="utf-8") as f:
    drdo_data = json.load(f)  # drdo_data is a list

# ✅ Step 2: Check structure and extract texts
print("Total entries in JSON:", len(drdo_data))
print("Sample entry:", drdo_data[0])

# 👇 Update key if needed (e.g., "text", "body")
texts = [entry["content"] for entry in drdo_data if "content" in entry]

print("Extracted texts:", len(texts))
print("First text:", texts[0][:200], "...")

# ✅ Step 3: Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Uses CPU by default

# ✅ Step 4: Embed in small batches
batch_size = 2
embeddings_list = []

for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    batch_embeddings = model.encode(batch, convert_to_numpy=True)
    embeddings_list.append(batch_embeddings)

    # Free memory
    gc.collect()
    torch.cuda.empty_cache()

# ✅ Step 5: Stack all batches into one final array
if embeddings_list:
    embeddings = np.vstack(embeddings_list)

    # ✅ Step 6: Save embeddings
    embedding_save_path = "C:\\Users\\hp123\\OneDrive\\Desktop\\DRDO1\\drdo_embeddings.pkl"

    with open(embedding_save_path, "wb") as f:
        pickle.dump(embeddings, f)

    print("✅ Embeddings saved to:", embedding_save_path)
else:
    print("⚠️ No embeddings created — check the 'texts' list.")


Total entries in JSON: 96
Sample entry: {'url': 'https://www.drdo.gov.in/drdo/who-is-who', 'title': 'Who is Who | Defence Research and Development Organisation - DRDO, Ministry of Defence, Government of India', 'content': "Feedback\nSitemap\nFAQs\nUser account menu\nLogin\nSkip to Main Content\nScreen Reader Access\nEnglish\nहिंदी\n\nHome\nDRDO\nOrganisation\nOutreach\nCareers\nPublications\nRTI\nContact Us\n\nWho is Who\nHome  DRDO  Who Is Who\n\nShri Rajnath Singh\n\nHon'ble Raksha Mantri\n\nView Profile\n\nShri Sanjay Seth\n\nHon'ble Raksha Rajya Mantri\n\nView Profile\n\nDr Samir V Kamat\n\nSecretary, Department of Defence R&D and Chairman, DRDO\n\nView Profile\nDirectors General (Technical)\nDirectors General (Corporate)\nAdditional Financial Adviser\nNodal Officer of DRDO\n\nDr B K Das\n\nDistinguished Scientist & Director General - Electronics & Communication Systems (ECS)\n\nView Profile\n\nShri Ummalaneni Raja Babu\n\nDistinguished Scientist & Director General - Missiles and S

In [20]:
import faiss
import numpy as np
import pickle
import os

# ✅ Load your saved embeddings
embedding_path = "C:\\Users\\hp123\\OneDrive\\Desktop\\DRDO1\\drdo_embeddings.pkl"
with open(embedding_path, "rb") as f:
    embeddings = pickle.load(f)

# ✅ Convert to float32 (FAISS requirement)
embeddings = embeddings.astype("float32")

# ✅ Create FAISS index (for cosine similarity use L2 with normalized vectors)
faiss.normalize_L2(embeddings)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # IP = Inner Product for cosine similarity

# ✅ Add embeddings to index
index.add(embeddings)

# ✅ Save FAISS index (optional but recommended)
faiss_index_path = "C:\\Users\\hp123\\OneDrive\\Desktop\\DRDO1\\drdo_faiss.index"
faiss.write_index(index, faiss_index_path)

print("✅ FAISS index created and saved to:", faiss_index_path)


✅ FAISS index created and saved to: C:\Users\hp123\OneDrive\Desktop\DRDO1\drdo_faiss.index


In [21]:
import faiss
import pickle
import json
import numpy as np
from sentence_transformers import SentenceTransformer

# ✅ Load FAISS index
faiss_index_path = "C:\\Users\\hp123\\OneDrive\\Desktop\\DRDO1\\drdo_faiss.index"
index = faiss.read_index(faiss_index_path)

# ✅ Load original cleaned data (again)
json_file_path = "C:\\Users\\hp123\\OneDrive\\Desktop\\DRDO1\\drdo_cleaned.json"
with open(json_file_path, "r", encoding="utf-8") as f:
    drdo_data = json.load(f)

# ✅ Extract texts
texts = [entry["cleaned_content"] for entry in drdo_data]

# ✅ Load the same model
model = SentenceTransformer("all-MiniLM-L6-v2")

# 🔍 Your search query
query = "who is the current chairman of DRDO?"

# ✅ Embed and normalize the query
query_embedding = model.encode([query], convert_to_numpy=True).astype("float32")
faiss.normalize_L2(query_embedding)

# ✅ Perform search
k = 3  # Number of top matches to return
distances, indices = index.search(query_embedding, k)

# ✅ Show results
print("🔎 Top results:\n")
for i, idx in enumerate(indices[0]):
    print(f"Rank {i+1} | Score: {distances[0][i]:.4f}")
    print(texts[idx][:500])  # Print first 500 characters of match
    print("-" * 80)


🔎 Top results:

Rank 1 | Score: 0.4198
Feedback Sitemap FAQs User account menu Login Skip to Main Content Screen Reader Access English  Home DRDO Organisation Outreach Careers Publications RTI Contact Us E-Journal Services Home E Journal Services Home E-Journals E-Library List of Publishers Training/Webinars Organized SOP for Fair Use of E-Journals Contact Us About E-Journal Services The DRDO E-Journal Services came into being w.e.f. 01 January 2009. DRDO e-journal consortium facilitates sharing of resources and improving access to i
--------------------------------------------------------------------------------
Rank 2 | Score: 0.3496
Feedback Sitemap FAQs User account menu Login Skip to Main Content Screen Reader Access English  Directorate Home About Directorate Director Roles & Responsibilities DIA-CoEs Formats Call for Proposal Governance Mechanism Information Industry Engagement Contact Us DRDO Industry Academia Centres of Excellence Home Corporate Directorates Directorate of Fut

responses

In [None]:
import requests

MISTRAL_API_KEY = "3Kid2WgrFVgojoiZd3n0iYUodfM93o8B"
MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {MISTRAL_API_KEY}"
}

payload = {
    "model": "mistral-tiny", 
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"}
    ]
}

try:
    response = requests.post(MISTRAL_ENDPOINT, headers=headers, json=payload, timeout=10)
    response.raise_for_status()
    data = response.json()
    print(data["choices"][0]["message"]["content"])
except requests.exceptions.Timeout:
    print("Request timed out.")
except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")


The capital of France is Paris. It is one of the most famous cities in the world, known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral. Paris is also a global center for art, fashion, gastronomy, and culture.


In [33]:
import requests

MISTRAL_API_KEY = "3Kid2WgrFVgojoiZd3n0iYUodfM93o8B"  # Replace with your real key
MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {MISTRAL_API_KEY}"
}

payload = {
    "model": "mistral-medium",  # You can also try "mistral-medium"
    "messages": [
        {"role": "user", "content": "The capital of France is Paris."}
    ]
}

response = requests.post(MISTRAL_ENDPOINT, headers=headers, json=payload)

data = response.json()

print("AI response:", data["choices"][0]["message"]["content"])


AI response: Correct! Paris is indeed the capital city of France. It is known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and Champs-Élysées. Paris is also renowned for its rich history, art, culture, fashion, and cuisine. As a major global center for art, commerce, fashion, gastronomy, and culture, Paris is considered one of the world's most important and influential global cities.


In [36]:
import requests

MISTRAL_API_KEY = "3Kid2WgrFVgojoiZd3n0iYUodfM93o8B"
MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"

headers = {
    "Authorization": f"Bearer {MISTRAL_API_KEY}",
    "Content-Type": "application/json"
}

payload = {
    "model": "mistral-small",
    "messages": [
        {"role": "user", "content": "what is the purpose of DRDO?"}
    ]
}

response = requests.post(MISTRAL_ENDPOINT, headers=headers, json=payload)

data = response.json()
print("AI:", data["choices"][0]["message"]["content"])

AI: DRDO (Defence Research and Development Organization) is the premier research and development organization of India, working under the administrative control of the Ministry of Defence. The purpose of DRDO is to develop advanced weapon systems and technologies for the Indian armed forces, in order to enhance their capabilities and ensure the nation's security. DRDO's mission is to empower India with cutting-edge defense technologies and ensure self-reliance in defense capabilities.

DRDO's activities cover a wide range of areas, including aeronautics, armaments, electronics, combat vehicles, engineering systems, missiles, materials, naval systems, and life sciences. DRDO's research and development efforts are focused on meeting the defense needs of the country by providing state-of-the-art solutions to the armed forces. In addition to its defense-related activities, DRDO also contributes to the civilian sector by developing and transferring technologies for societal benefits.

In su

In [None]:
import requests

MISTRAL_API_KEY = "3Kid2WgrFVgojoiZd3n0iYUodfM93o8B"
MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {MISTRAL_API_KEY}",
    "Content-Type": "application/json"
}

# Start with a system message
messages = [{"role": "system", "content": "You are a helpful assistant."}]

while True:
    user_input = input("You: ")
    
    if user_input.lower() in ["exit", "quit"]:
        print("Chat ended.")
        break

    messages.append({"role": "user", "content": user_input})

    payload = {
        "model": "mistral-medium", 
        "messages": messages
    }

    response = requests.post(MISTRAL_ENDPOINT, headers=headers, json=payload)
    data = response.json()

    ai_message = data["choices"][0]["message"]["content"]
    print("AI:", ai_message)

    # Add assistant's message to the history
    messages.append({"role": "assistant", "content": ai_message})


AI: DRDO stands for Defence Research and Development Organisation. It is an agency of the Government of India, charged with the military's research and development, headquartered in New Delhi, India. It was formed in 1958 by the merger of the Technical Development Establishment and the Directorate of Technical Development and Production with the Defence Science Organisation. It is under the administrative control of the Ministry of Defence, Government of India. Its vision is to empower India with cutting-edge defense technologies and a mission to achieve self-reliance in critical defense technologies and systems, while equipping the armed forces with state-of-the-art weaponry. The organization includes around 50 laboratories which are engaged in developing defense technologies covering various fields, such as aeronautics, armaments, electronics, land combat engineering, life sciences, materials, missiles, and naval systems. Some of the notable developments by DRDO include the Light Com

In [1]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")  # Default connection URL
db = client["chat_database"]  # Create or access the database (you can name it as you like)
collection = db["chat_messages"]  # Create or access the collection (you can name it as you like)

In [3]:
import requests
from pymongo import MongoClient

MISTRAL_API_KEY = "3Kid2WgrFVgojoiZd3n0iYUodfM93o8B"
MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"

# MongoDB setup
client = MongoClient("mongodb://localhost:27017/")  # Connect to MongoDB server
db = client["chat_database"]  # Create/access the database
collection = db["chat_messages"]  # Create/access the collection

headers = {
    "Authorization": f"Bearer {MISTRAL_API_KEY}",
    "Content-Type": "application/json"
}

# Start with a system message
messages = [{"role": "system", "content": "You are a helpful assistant."}]

while True:
    user_input = input("You: ")
    
    if user_input.lower() in ["exit", "quit"]:
        print("Chat ended.")
        break

    messages.append({"role": "user", "content": user_input})

    payload = {
        "model": "mistral-medium",  # or whichever model you're using
        "messages": messages
    }

    response = requests.post(MISTRAL_ENDPOINT, headers=headers, json=payload)
    data = response.json()

    ai_message = data["choices"][0]["message"]["content"]
    print("AI:", ai_message)

    # Add assistant's message to the history
    messages.append({"role": "assistant", "content": ai_message})

    # Store the conversation (both user and assistant messages) in MongoDB
    chat_data = {
        "user_message": user_input,
        "assistant_message": ai_message,
        "timestamp": data["created"]  # You can add a timestamp or any other info if you like
    }

    # Insert chat data into the MongoDB collection
    collection.insert_one(chat_data)


AI: DRDO stands for Defence Research and Development Organisation. It is an agency of the Government of India, charged with the military's research and development, headquartered in New Delhi, India. It was formed in 1958 by the merger of the Technical Development Establishment and the Directorate of Technical Development and Production with the Defence Science Organisation. It is under the administrative control of the Ministry of Defence, Government of India. DRDO has a network of 52 laboratories which are engaged in developing defence technologies covering various fields, such as aeronautics, armaments, electronics, land combat engineering, life sciences, materials, missiles, and naval systems. The organization includes around 5,000 scientists belonging to the Defence Research & Development Service (DRDS) and about 25,000 other scientific, technical and supporting personnel.
AI: DRDO has a network of 52 laboratories which are engaged in developing defence technologies covering vario