In [None]:
!pip install openai

In [None]:
!pip install sentence-transformers --upgrade


In [None]:
!pip install ipywidgets jupyter --upgrade

In [None]:
!pip install faiss-cpu sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import pickle

class VectorDBBuilder:
    """Embed scraped records and persist them as a FAISS index plus metadata.

    Each record with non-empty ``cleaned_content`` is encoded with the
    ``all-MiniLM-L6-v2`` SentenceTransformer model and added to a flat L2
    FAISS index. A parallel list of ``{"url", "title"}`` dicts is pickled so
    that position ``i`` in the metadata describes vector ``i`` in the index.
    """

    def __init__(self, json_file, faiss_index_file="vector.index", metadata_file="metadata.pkl"):
        """Store the input/output paths and load the embedding model.

        Args:
            json_file: Path to the JSON file holding the scraped records.
            faiss_index_file: Path the FAISS index is written to.
            metadata_file: Path the pickled metadata list is written to.
        """
        self.json_file = json_file
        self.index_file = faiss_index_file
        self.metadata_file = metadata_file
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Metadata of the most recent successful build_index() call.
        self.metadata = []

    def build_index(self):
        """Build the index from ``self.json_file`` and write both output files.

        The JSON document is assumed to be a sequence of dicts (each entry is
        accessed with ``.get``). Entries whose ``cleaned_content`` is missing
        or empty are skipped. A missing ``url`` or ``title`` is stored as
        ``None`` instead of aborting the build.

        Every call rebuilds from scratch: ``self.metadata`` is replaced, never
        appended to, so repeated calls cannot leave metadata out of step with
        the index rows.

        Raises:
            ValueError: If no entry provides non-empty ``cleaned_content``.
        """
        with open(self.json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Collect into locals so a failed or repeated build never leaves
        # self.metadata partially filled or duplicated.
        texts = []
        metadata = []
        for entry in data:
            content = entry.get("cleaned_content")
            if content:
                texts.append(content)
                metadata.append({
                    "url": entry.get("url"),
                    "title": entry.get("title")
                })

        if not texts:
            raise ValueError(
                f"No entries with non-empty 'cleaned_content' found in {self.json_file}"
            )

        embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        dimension = embeddings.shape[1]

        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

        faiss.write_index(index, self.index_file)
        with open(self.metadata_file, 'wb') as f:
            pickle.dump(metadata, f)

        # Publish only after both files were written successfully.
        self.metadata = metadata

        print(f"✅ FAISS index saved to {self.index_file}")
        print(f"✅ Metadata saved to {self.metadata_file}")

# Usage: build the index from the scraped JSON file and write it to the
# builder's default output paths ("vector.index" and "metadata.pkl").
# NOTE(review): inside a notebook kernel __name__ is presumably "__main__",
# so this runs every time the cell is executed — confirm that is intended.
if __name__ == "__main__":
    builder = VectorDBBuilder("drdo_scraped_with_pdfs2.json")
    builder.build_index()


In [None]:
# Loaded SentenceTransformer instances keyed by model name, so repeated
# search() calls do not re-instantiate the model on every query.
_SEARCH_MODEL_CACHE = {}


def search(query, top_k=5):
    """Return metadata for the records nearest to ``query``.

    Reads the FAISS index from "vector.index" and the pickled metadata list
    from "metadata.pkl" (both relative to the current working directory),
    embeds ``query`` with the ``all-MiniLM-L6-v2`` model and looks up the
    ``top_k`` nearest vectors.

    Args:
        query: Text to search for.
        top_k: Maximum number of results to return.

    Returns:
        A list of metadata entries, nearest first. It holds fewer than
        ``top_k`` items when the index contains fewer vectors than requested.
    """
    import pickle
    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    index = faiss.read_index("vector.index")
    # SECURITY NOTE: pickle.load can execute arbitrary code; only use a
    # metadata.pkl that was produced by a trusted build step.
    with open("metadata.pkl", "rb") as f:
        metadata = pickle.load(f)

    model_name = 'all-MiniLM-L6-v2'
    model = _SEARCH_MODEL_CACHE.get(model_name)
    if model is None:
        model = _SEARCH_MODEL_CACHE[model_name] = SentenceTransformer(model_name)

    # FAISS expects float32 query vectors of shape (n_queries, dimension).
    query_vector = np.asarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vector, top_k)

    # FAISS pads missing neighbours with label -1; indexing metadata with -1
    # would silently return the last record, so those slots are skipped.
    return [metadata[int(i)] for i in indices[0] if i >= 0]

# Example usage: run a sample query with the default top_k=5 and print the
# list of metadata entries that search() returns.
print(search("missile development heads"))