In [None]:
# -------------------------------
# 1. Install dependencies
# -------------------------------
# Installs everything listed in requirements.txt. %pip (rather than !pip) is
# used so the packages land in the environment of the running kernel.
# Comment this line out if the dependencies are already installed.
%pip install -r requirements.txt


In [None]:
# -------------------------------
# 2. Environment Setup
# -------------------------------
import os
import json
import tempfile
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader

# Merge the key/value pairs of a local .env file into the process environment.
load_dotenv()

# --- Azure OpenAI Settings ---
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
azure_openai_api_key = os.environ.get("AZURE_OPENAI_KEY")
azure_openai_api_version = os.environ.get("AZURE_OPENAI_VERSION")
azure_deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT")

# --- Azure Search Settings ---
search_endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT")
search_key = os.environ.get("AZURE_SEARCH_KEY")
index_name = os.environ.get("AZURE_INDEX_NAME")

# --- Azure Storage Settings ---
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
container_name = os.environ.get("AZURE_STORAGE_CONTAINER")

# Report, one line per variable, whether each setting is present.
# Only FOUND/MISSING is printed, never the value itself; an unset or empty
# variable is reported as MISSING but does not stop the notebook.
REQUIRED_ENV_VARS = (
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_KEY",
    "AZURE_OPENAI_VERSION",
    "AZURE_OPENAI_DEPLOYMENT",
    "AZURE_SEARCH_ENDPOINT",
    "AZURE_SEARCH_KEY",
    "AZURE_INDEX_NAME",
    "AZURE_STORAGE_CONNECTION_STRING",
    "AZURE_STORAGE_CONTAINER",
)
for env_name in REQUIRED_ENV_VARS:
    status = "FOUND" if os.environ.get(env_name) else "MISSING"
    print(f"{env_name:40}: {status}")


In [None]:
# -------------------------------
# 3. Connect to Azure Blob Storage
# -------------------------------
# Build the service- and container-level clients from the configured settings.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

# List the blobs *before* announcing success: the listing is the first call
# here that needs a response from the storage account (client construction is
# expected to be lazy - confirm against the SDK docs), so a bad connection
# string or container name fails here instead of after a misleading
# "Connected" message.
blob_names = [b.name for b in container_client.list_blobs()]
print(f"✅ Connected to container: {container_name}")
print("Files in container:", blob_names)

In [None]:
# -------------------------------
# 4. Clear existing index
# -------------------------------
# Client bound to the single index that this notebook clears and refills.
search_credential = AzureKeyCredential(search_key)
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

def clear_index(client=None, key_field="id", batch_size=1000, pause_seconds=1.0):
    """Delete every document from the search index, one batch at a time.

    Each round prints the current document count, fetches up to
    ``batch_size`` document keys with a wildcard search and submits them for
    deletion. The loop ends when the count reaches zero or the search returns
    no keys.

    Args:
        client: Object exposing ``get_document_count``, ``search`` and
            ``delete_documents``. Defaults to the notebook-level
            ``search_client``.
        key_field: Name of the index's key field.
        batch_size: Maximum number of keys fetched and deleted per round.
        pause_seconds: Delay after each delete batch so the loop does not
            re-query the service in a tight spin; ``0`` disables the delay.

    Raises:
        RuntimeError: If every deletion in a batch is reported as failed.
            Retrying would return the same keys again, so the loop would
            otherwise never terminate.
    """
    import time  # local import keeps this cell independent of the import cell

    if client is None:
        client = search_client

    while True:
        count = client.get_document_count()
        print(f"Current count: {count}")
        if count == 0:
            print("All docs deleted ✅")
            break

        hits = client.search("*", select=[key_field], top=batch_size)
        ids = [doc[key_field] for doc in hits]

        if not ids:
            break

        outcome = client.delete_documents([{key_field: doc_id} for doc_id in ids])

        # Per-document failures are reported in the returned results rather
        # than raised, so they have to be inspected explicitly.
        failed = [res for res in (outcome or []) if not getattr(res, "succeeded", True)]
        print(f"Deleted {len(ids) - len(failed)} docs")
        if failed:
            print(f"⚠️ {len(failed)} deletions were rejected")

        # No progress at all in this round: stop instead of looping forever.
        if len(failed) == len(ids):
            raise RuntimeError(
                f"Could not delete any of the {len(ids)} remaining documents; aborting."
            )

        if pause_seconds > 0:
            time.sleep(pause_seconds)

# Run once.
# Destructive: this removes every document from the index that search_client
# is bound to (index_name). Re-running this cell wipes whatever has been
# indexed since.
clear_index()

In [None]:
# -------------------------------
# 5. Index documents from Blob Storage
# -------------------------------
# Embedding model: the Azure OpenAI deployment configured in the setup cell.
embedding_config = {
    "azure_endpoint": azure_endpoint,
    "azure_deployment": azure_deployment,
    "openai_api_version": azure_openai_api_version,
    "api_key": azure_openai_api_key,
}
embeddings = AzureOpenAIEmbeddings(**embedding_config)

# Vector store over the search index; each text is embedded through
# embeddings.embed_query.
vector_store_config = {
    "azure_search_endpoint": search_endpoint,
    "azure_search_key": search_key,
    "index_name": index_name,
    "embedding_function": embeddings.embed_query,
}
vector_store = AzureSearch(**vector_store_config)

# Splitter configured for 1000-character chunks with a 100-character overlap.
text_splitter = CharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)

# Walk every blob in the container: download it to a scratch file, load it
# with the loader matching its extension, split it into chunks and push the
# chunks into the vector store. A failure on one blob is reported and the
# loop moves on to the next one.
for blob in container_client.list_blobs():
    # Path of the local scratch copy; stays None until one is actually created.
    tmp_path = None
    try:
        print(f"📂 Processing {blob.name}")

        # Pick the loader from the extension *before* downloading, so
        # unsupported blobs are skipped without transferring their contents.
        blob_name_lower = blob.name.lower()
        if blob_name_lower.endswith(".pdf"):
            loader_cls = PyPDFLoader
        elif blob_name_lower.endswith((".docx", ".doc")):
            loader_cls = UnstructuredWordDocumentLoader
        else:
            print(f"⚠️ Skipping unsupported file: {blob.name}")
            continue

        # delete=False keeps the file on disk after the `with` block closes
        # it, so the loader can reopen it by path. The original extension is
        # kept as the suffix.
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(blob.name)[-1]) as tmp_file:
            tmp_path = tmp_file.name
            downloader = container_client.download_blob(blob.name)
            tmp_file.write(downloader.readall())

        docs = loader_cls(tmp_path).load()
        # Record which blob each document came from so query results can cite it.
        for d in docs:
            d.metadata["source_file"] = blob.name

        split_docs = text_splitter.split_documents(docs)
        vector_store.add_documents(split_docs)

        print(f"✅ Indexed {len(split_docs)} chunks from {blob.name}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next blob.
        print(f"❌ Error processing {blob.name}: {e}")

    finally:
        # Files created with delete=False are never removed automatically, so
        # clean up here on every path (success, skip or error).
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove temp file {tmp_path}: {cleanup_error}")


In [None]:
# -------------------------------
# 6. Test query
# -------------------------------
# Smoke test: run one hybrid query and show the top three hits.
query = "Genetically Modified and Novel Foods (Labelling) (England) Regulations 2000"
results = vector_store.similarity_search(query=query, k=3, search_type="hybrid")

for rank, hit in enumerate(results, start=1):
    meta = hit.metadata
    # Fall back through the metadata keys that may carry the chunk identifier.
    chunk_id = meta.get("chunk_id") or meta.get("id") or "unknown"
    # First 400 characters, flattened onto one line for display.
    snippet = hit.page_content[:400].replace("\n", " ")

    print(f"\n--- Result {rank} ---")
    print("Source file :", meta.get("source_file", "unknown"))
    print("Chunk id    :", chunk_id)
    print("Snippet     :", snippet, "...")