In [1]:
# -----------------------------------------------------------------
# Part 1: Installing and upgrading libraries
# -----------------------------------------------------------------
# This first cell installs and upgrades all the Python packages we need.
# The first line is crucial to fix binary incompatibility errors with numpy.
%pip uninstall -y numpy pandas
%pip install --upgrade numpy pandas --no-cache-dir
%pip install azure-ai-ml
%pip install langchain
%pip install "langchain-community[pdf]"
%pip install faiss-cpu
%pip install sentence-transformers
%pip install langchain-openai

print("-----------")
print("All libraries re-installed successfully!")
print("-----------")
print("-----------")
print("All libraries installed and upgraded successfully!")
print("-----------")



Found existing installation: numpy 2.2.6
Uninstalling numpy-2.2.6:
  Successfully uninstalled numpy-2.2.6
Found existing installation: pandas 2.3.1
Uninstalling pandas-2.3.1:
  Successfully uninstalled pandas-2.3.1
Note: you may need to restart the kernel to use updated packages.
Collecting numpy
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pandas
  Downloading pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pandas-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalli

In [2]:
from langchain_community.document_loaders import PyPDFLoader
import os

# --- IMPORTANT ---
# Folder that holds the source PDFs. The path is relative, so it is resolved
# against the kernel's current working directory; the existence checks
# printed below show whether it was found.
docs_path = "Gate_DSAI/"


def is_pdf_file(filename):
    """Return True when `filename` has a .pdf extension, ignoring letter case.

    A plain `endswith(".pdf")` would silently skip files such as "SCAN.PDF".
    """
    return filename.lower().endswith(".pdf")


print(f"Attempting to access local path: {docs_path}")
print(f"Checking if path exists: {os.path.exists(docs_path)}")
print(f"Checking if path is a directory: {os.path.isdir(docs_path)}")

# Load all PDF documents from the folder
all_docs = []       # chunks produced by load_and_split(), across all PDFs
failed_files = []   # paths of PDFs that raised while loading
print("\nStarting to walk through the directory to find PDF files...")

# Walk through all subdirectories and files
found_pdfs = False
for root, dirs, files in os.walk(docs_path):
    # Sorting in place makes os.walk descend into subfolders in a stable
    # order, so the resulting document order does not depend on the filesystem.
    dirs.sort()
    print(f"\n--- Entering directory: {root} ---")
    for file in sorted(files):
        if not is_pdf_file(file):
            continue
        found_pdfs = True  # Mark that we found at least one PDF
        file_path = os.path.join(root, file)
        try:
            print(f"-> Attempting to load: {file_path}")
            loader = PyPDFLoader(file_path)
            loaded_docs = loader.load_and_split()
            all_docs.extend(loaded_docs)
            print(f"--> Successfully loaded {file}")
        except Exception as e:
            # Deliberately best-effort: one unreadable PDF must not abort the
            # whole run. The path is remembered for the summary below.
            failed_files.append(file_path)
            print(f"!! Could not load or process {file}. Error: {e}")

if not found_pdfs:
    print("\nWARNING: The script walked the directory but did not find any files ending with .pdf")

# Repeat the failures in one place; the inline messages are easy to miss
# in a long log.
if failed_files:
    print(f"\nWARNING: {len(failed_files)} PDF file(s) could not be loaded:")
    for failed_path in failed_files:
        print(f"   - {failed_path}")

print("-----------")
print(f"Total documents loaded and split: {len(all_docs)}")
print("-----------")


Attempting to access local path: Gate_DSAI/
Checking if path exists: True
Checking if path is a directory: True

Starting to walk through the directory to find PDF files...

--- Entering directory: Gate_DSAI/ ---
-> Attempting to load: Gate_DSAI/GATE2024_DA_Sample_Paper.pdf
--> Successfully loaded GATE2024_DA_Sample_Paper.pdf
-> Attempting to load: Gate_DSAI/GATE_DA_2025_Question_Paper.pdf
--> Successfully loaded GATE_DA_2025_Question_Paper.pdf
-> Attempting to load: Gate_DSAI/GATE_DA_2025_Syllabus.pdf
--> Successfully loaded GATE_DA_2025_Syllabus.pdf

--- Entering directory: Gate_DSAI/Artificial-Intelligence ---
-> Attempting to load: Gate_DSAI/Artificial-Intelligence/AI_All_cheat_sheet.pdf
--> Successfully loaded AI_All_cheat_sheet.pdf
-> Attempting to load: Gate_DSAI/Artificial-Intelligence/handout1.pdf
--> Successfully loaded handout1.pdf
-> Attempting to load: Gate_DSAI/Artificial-Intelligence/handout2.pdf
--> Successfully loaded handout2.pdf
-> Attempting to load: Gate_DSAI/Artif

PdfReadError("Invalid Elementary Object starting with b'P' @4812141: b'9 0 obj<</Universal PDF(The process that creates this PDF constitutes a trade se'")
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 208 0 (offset 0)
Ignoring wrong pointing object 210 0 (offset 0)
Ignoring wrong pointing object 282 0 (offset 0)
Ignoring wrong pointing object 299 0 (offset 0)
Ignoring wrong pointing object 303 0 (offset 0)
Ignoring wrong pointing object 478 0 (offset 0)
Ignoring wrong pointing object 520 0 (offset 0)
Ignoring wrong pointing object 600 0 (offset 0)
Ignoring wrong pointing object 616 0 (offset 0)
Ignoring wrong pointing object 618 0 (offset 0)
Ignoring wrong pointing object 620 0 (offset 0)
Ignoring wrong pointing object 706 0 (offset 0)
Ignoring wrong pointing object 779 0 (offset 0)
Ignoring wrong pointing object 789 0 (offset 0)
Ignoring wrong pointing object 793 0 (offset 0)
Ignoring wrong pointing object 

In [3]:
# -----------------------------------------------------------------
# Part 3: Creating the Vector Index (The Smart Index)
# -----------------------------------------------------------------
# Embeds every chunk in `all_docs` and saves the resulting FAISS index to a
# local folder. `all_docs` and `os` are expected to already be defined by the
# PDF-loading cell, which must be run first.
if not all_docs:
    print("ERROR: No documents were loaded. The 'all_docs' list is empty.")
else:
    # Imported only on this branch, so nothing is loaded when there is
    # nothing to index.
    from langchain_community.vectorstores import FAISS
    from langchain_community.embeddings import HuggingFaceEmbeddings

    print("Initializing the embedding model...")
    # The same model name must be used again when the index is loaded.
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    model_kwargs = {'device': 'cpu'}
    embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
    print("Embedding model loaded.")

    print("Creating the vector index. This may take a while...")
    vector_store = FAISS.from_documents(all_docs, embeddings)
    print("Vector index created successfully!")

    output_folder_name = "my_vector_index"
    print(f"Saving index to folder: {output_folder_name}")
    vector_store.save_local(output_folder_name)

    # --- VERIFICATION STEP ---
    # save_local writes two files: the FAISS index itself (index.faiss) and
    # the pickled docstore (index.pkl). Loading the index later needs both,
    # so both are checked instead of index.faiss alone.
    expected_files = [
        os.path.join(output_folder_name, file_name)
        for file_name in ("index.faiss", "index.pkl")
    ]
    missing_files = [path for path in expected_files if not os.path.isfile(path)]

    if not missing_files:
        found_list = ", ".join(f"'{path}'" for path in expected_files)
        print(f"\nVerification successful: Found {found_list}.")
        print("-----------")
        print("SUCCESS! Your vector index has been created and saved.")
        print(f"You can find it in your Notebooks section in a folder named '{output_folder_name}'.")
        print("-----------")
    else:
        missing_list = ", ".join(f"'{path}'" for path in missing_files)
        print("\n---!!! ERROR !!!---")
        print(f"Verification failed: Not found after saving: {missing_list}.")
        print("The script may have completed without successfully writing the file.")


  embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
  from .autonotebook import tqdm as notebook_tqdm
2025-08-01 08:35:32.043932: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754037332.126015   30477 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754037332.150102   30477 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754037332.210152   30477 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754037332.210202   30477 computation_placer.cc:177] computation placer already registered. Please check linkage 

Embedding model loaded.
Creating the vector index. This may take a while...
Vector index created successfully!
Saving index to folder: my_vector_index

Verification successful: Found 'my_vector_index/index.faiss'.
-----------
SUCCESS! Your vector index has been created and saved.
You can find it in your Notebooks section in a folder named 'my_vector_index'.
-----------


In [4]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain.chains import RetrievalQA
import os

print("Loading the saved vector index...")

# The name of the folder where you saved the index
index_folder_name = "my_vector_index"

# --- ERROR HANDLING ---
# Check that the index folder exists (and really is a folder) before trying
# to load it, so the user gets actionable guidance instead of a raw loader error.
if not os.path.isdir(index_folder_name):
    print("---!!! ERROR !!!---")
    print(f"The vector index folder '{index_folder_name}' was not found.")
    print("Please make sure you have successfully run the 'build_index.ipynb' notebook first.")
    print("You should see a folder named 'my_vector_index' in your file explorer on the left.")
    # Stop the script if the index doesn't exist.
    raise FileNotFoundError(f"Vector index folder not found: {index_folder_name}")

# FAISS.load_local reads both index.faiss and index.pkl; a folder left behind
# by an interrupted save can exist while one of them is missing.
missing_index_files = [
    file_name
    for file_name in ("index.faiss", "index.pkl")
    if not os.path.isfile(os.path.join(index_folder_name, file_name))
]
if missing_index_files:
    missing_list = ", ".join(missing_index_files)
    print("---!!! ERROR !!!---")
    print(f"The vector index folder '{index_folder_name}' is incomplete. Missing: {missing_list}")
    print("Please re-run the 'build_index.ipynb' notebook to rebuild the index.")
    raise FileNotFoundError(
        f"Vector index files not found in {index_folder_name}: {missing_list}"
    )


# The same embedding model used to create the index
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Load the vector store from the local folder.
# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
# index.pkl, and unpickling can execute arbitrary code. Only load an index
# folder that you built yourself - never one obtained from an untrusted source.
vector_store = FAISS.load_local(index_folder_name, embeddings, allow_dangerous_deserialization=True)

print("Vector index loaded successfully!")

Loading the saved vector index...
Vector index loaded successfully!
