In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
input_files = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gate-dsai-llm/build-vector-index.py
/kaggle/input/gate-dsai-llm/GATE2024_DA_Sample_Paper.pdf
/kaggle/input/gate-dsai-llm/README.md
/kaggle/input/gate-dsai-llm/Chat_bot.ipynb
/kaggle/input/gate-dsai-llm/GATE_DA_2025_Question_Paper.pdf
/kaggle/input/gate-dsai-llm/GATE_DA_2025_Syllabus.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/ML-A-Probabilistic-Perspective-Murphy.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/UPenn-CIS520-Midterm2019-Solutions.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/CMU-ML-Notes.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/CS229-Stanford-Lecture-Notes-Repo-Copy.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/ML_super_cheatsheet.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/UPenn-CIS520-Final2017-Solutions.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/UPenn-CIS520-Final2018-Solutions.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/UPenn-CIS520-Midterm2022-Solutions.pdf
/kaggle/input/gate-dsai-llm/Machine-Learning/UPenn-CIS520-Mi

In [2]:
# Cell 1: Core dependencies
!pip install -q transformers accelerate torch torchvision langchain 

In [3]:
# Cell 2: Additional packages
!pip install -q langchain-huggingface langchain-community faiss-cpu pypdf pymupdf sentence-transformers 

In [4]:
import torch
# Report whether CUDA is usable and, if so, the name of device index 0.
cuda_ok = torch.cuda.is_available()
print(f"GPU Available: {cuda_ok}")
print(f"GPU Device: {torch.cuda.get_device_name(0) if cuda_ok else 'None'}")

GPU Available: True
GPU Device: Tesla T4


In [8]:
# STEP 2: DATA LOADING CONFIGURATION
from pathlib import Path

# Source directory that is searched for PDFs, and the directory the finished
# vector index is written to (edit both to suit your environment).
docs_path, output_path = (
    "/kaggle/input/gate-dsai-llm",
    "/kaggle/working/my_vector_index",
)

# Echo the configuration so the cell output records what this run used.
print(
    f"Will process PDFs from: {docs_path}",
    f"Will save index to: {output_path}\n",
    sep="\n",
)

Will process PDFs from: /kaggle/input/gate-dsai-llm
Will save index to: /kaggle/working/my_vector_index



In [9]:
from tqdm import tqdm
from langchain_community.document_loaders import PyMuPDFLoader

print("STEP 2: Loading documents...")

# Sort the discovered files so the document order (and therefore the order of
# entries in any index built from `all_docs`) does not depend on the
# filesystem's directory-listing order.
pdf_files = sorted(Path(docs_path).rglob("*.pdf"))

if not pdf_files:
    raise ValueError("No PDF files found in the specified directory")

all_docs = []      # every Document returned by the loader, across all PDFs
failed_pdfs = []   # PDFs whose loader raised; reported in the summary below

for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        all_docs.extend(PyMuPDFLoader(str(pdf_file)).load())
    except Exception as e:
        # Deliberate best effort: one unreadable PDF must not abort the whole
        # run, but it is recorded so the summary does not over-report.
        print(f"\n⚠️ Error loading {pdf_file.name}: {str(e)}")
        failed_pdfs.append(pdf_file)

# Fail here with a clear message instead of letting an empty list reach the
# embedding / indexing step.
if not all_docs:
    raise ValueError("No documents could be loaded from the PDF files found")

# Count only the PDFs that actually loaded, not every file that was discovered.
loaded_pdf_count = len(pdf_files) - len(failed_pdfs)
print(f"\n✓ Loaded {len(all_docs)} document chunks from {loaded_pdf_count} PDFs\n")
if failed_pdfs:
    print(f"⚠️ Skipped {len(failed_pdfs)} PDF(s) that failed to load")

STEP 2: Loading documents...


Processing PDFs: 100%|██████████| 48/48 [00:38<00:00,  1.26it/s]


✓ Loaded 11696 document chunks from 48 PDFs






In [10]:
# STEP 3: EMBEDDING SETUP
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face model id of the sentence-embedding model.
embedding_model_name = "BAAI/bge-m3"
print(f"Initializing embeddings with {embedding_model_name} on GPU...")

# Load the model onto CUDA and ask for normalized output vectors.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(device='cuda'),
    model_name=embedding_model_name,
)

print("✓ Embeddings ready\n")

Initializing embeddings with BAAI/bge-m3 on GPU...


2025-08-02 19:16:38.716855: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754162198.735008     801 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754162198.740382     801 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


✓ Embeddings ready



In [22]:
import torch
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

class DualGPUEmbeddings:
    """Embed texts with one ``SentenceTransformer`` replica per device.

    A batch passed to :meth:`embed_documents` is split into contiguous slices,
    one per replica; the slices are encoded concurrently and the results are
    concatenated in the original order.

    Args:
        model_name: Model id handed to ``SentenceTransformer``.
        devices: Iterable of device strings (e.g. ``["cuda:0", "cuda:1"]``).
            ``None`` selects the first two visible CUDA devices, or ``"cpu"``
            when no GPU is visible.
        normalize: Forwarded to ``encode`` as ``normalize_embeddings`` so the
            vectors match a ``normalize_embeddings=True`` configuration used
            elsewhere for the same model.

    Attributes:
        models: The replicas, in ``devices`` order.
        model0, model1: First and last replica, kept for code that used the
            original two-attribute layout (they are the same object when only
            one device is in use).
    """

    def __init__(self, model_name="BAAI/bge-m3", devices=None, normalize=True):
        if devices is None:
            # Up to two GPUs (hence "dual"); degrade to a single GPU or the
            # CPU instead of failing on a missing cuda:1.
            gpu_count = min(2, torch.cuda.device_count())
            devices = [f"cuda:{i}" for i in range(gpu_count)] or ["cpu"]
        devices = list(devices)
        if not devices:
            raise ValueError("devices must contain at least one device")

        self.normalize = normalize
        self.models = []
        for device in devices:
            model = SentenceTransformer(model_name, device=device)
            # Half precision is applied on CUDA devices only.
            if str(device).startswith("cuda"):
                model = model.half()
            self.models.append(model)

        self.model0 = self.models[0]
        self.model1 = self.models[-1]

    def _encode_on(self, model, texts):
        """Encode ``texts`` with ``model``; return a float32 CPU tensor."""
        # Grad mode is per-thread, so it is disabled here (inside the worker)
        # rather than around the thread pool.
        with torch.no_grad():
            emb = model.encode(
                texts,
                convert_to_tensor=True,
                normalize_embeddings=self.normalize,
            )
        # Bring every slice to CPU/float32 so slices from different devices
        # (and precisions) can be concatenated.
        return emb.cpu().float()

    def embed_documents(self, texts):
        """Embed a batch of texts.

        Args:
            texts: Iterable of strings.

        Returns:
            A float32 ``numpy`` array with one row per text, in input order.
            An empty input yields an array with zero rows.
        """
        from concurrent.futures import ThreadPoolExecutor

        texts = list(texts)
        if not texts:
            return torch.empty((0, 0), dtype=torch.float32).numpy()

        # Contiguous, order-preserving slices; for two replicas this is the
        # split [:len // 2] / [len // 2:].
        n_models = len(self.models)
        bounds = [i * len(texts) // n_models for i in range(n_models + 1)]
        jobs = [
            (model, texts[start:stop])
            for model, start, stop in zip(self.models, bounds, bounds[1:])
            if stop > start  # never hand an empty slice to encode()
        ]

        if len(jobs) == 1:
            parts = [self._encode_on(*jobs[0])]
        else:
            # One thread per replica so the devices work at the same time;
            # map() returns results in submission order.
            with ThreadPoolExecutor(max_workers=len(jobs)) as pool:
                parts = list(pool.map(lambda job: self._encode_on(*job), jobs))

        return torch.cat(parts).numpy()

    def embed_query(self, text):
        """Embed a single query string; returns a list of floats."""
        return self._encode_on(self.models[0], [text])[0].tolist()

    def __call__(self, text):
        """Alias for :meth:`embed_query`.

        NOTE(review): LangChain vector stores appear to call the embedding
        object directly when it is not an ``Embeddings`` subclass — confirm
        against the installed langchain version.
        """
        return self.embed_query(text)

# 1. Clear GPU memory
# empty_cache() only returns cached blocks that no live tensor occupies, so
# first drop whatever embedding model is still bound to `embeddings` from an
# earlier cell and collect it; otherwise that model stays resident on the GPU.
import gc

embeddings = None
gc.collect()
torch.cuda.empty_cache()

# 2. Initialize embeddings
embeddings = DualGPUEmbeddings()

# 3. Create index from the loaded documents; every text is embedded through
#    `embeddings.embed_documents`.
vector_store = FAISS.from_documents(all_docs, embeddings)

print("✓ Successfully created index using both GPUs")

Batches:   0%|          | 0/183 [00:00<?, ?it/s]

Batches:   0%|          | 0/183 [00:00<?, ?it/s]

✓ Successfully created index using both GPUs


In [24]:
# STEP 4: SAVE INDEX
import os

print(f"Saving index to {output_path}...")
vector_store.save_local(output_path)

# Verification: both files named below must exist in the output directory.
index_files = ['index.faiss', 'index.pkl']
missing_files = [
    f for f in index_files
    if not os.path.exists(os.path.join(output_path, f))
]
all_saved = not missing_files

if not all_saved:
    print("⚠️ Warning: Some index files are missing")
else:
    print("✓ Index files verified:")
    for f in index_files:
        # Size on disk, converted from bytes to mebibytes.
        size = os.path.getsize(os.path.join(output_path, f)) / (1024 * 1024)
        print(f"  - {f}: {size:.2f} MB")

Saving index to /kaggle/working/my_vector_index...
✓ Index files verified:
  - index.faiss: 45.69 MB
  - index.pkl: 23.12 MB
