In [1]:
#!pip install langchain faiss-cpu transformers sentence-transformers pypdf


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-win_amd64.whl.metadata (4.5 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting numpy<2,>=1 (from langchain)
  Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 61.0/61.0 kB 1.6 MB/s eta 0:00:00
Downloading faiss_cpu-1.10.0-cp310-cp310-win_amd64.whl (13.7 MB)
   ---------------------------------------- 13.7/13.7 MB 3.7 MB/s eta 0:00:00
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
   ---------------------------------------- 345.7/345.7 kB 2.7 MB/s eta 0:00:00
Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
   ---------------------------------------- 15.8/15.8 MB 3.7 MB/s eta 0:00:00
Installing collected packages: numpy, faiss-cpu, sentence-transformers
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.3
    Uninstalling numpy-1.24.3:


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\Pranita\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import AutoTokenizer
from IPython.display import display
import ipywidgets as widgets

import warnings


In [3]:
# Set local cache folder for HuggingFace model
CACHE_DIR = os.path.normpath(os.path.join(os.getcwd(), "models"))

class Encoder:
    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L12-v2", device="cpu"):
        self.embedding_function = HuggingFaceEmbeddings(
            model_name=model_name,
            cache_folder=CACHE_DIR,
            model_kwargs={"device": device},
        )

class FaissDb:
    def __init__(self, docs, embedding_function):
        self.db = FAISS.from_documents(
            docs, embedding_function, distance_strategy=DistanceStrategy.COSINE
        )

    def similarity_search(self, question: str, k: int = 3):
        return self.db.similarity_search(question, k=k)

def load_and_split_pdfs(file_paths: list, chunk_size: int = 512):
    loaders = [PyPDFLoader(file_path) for file_path in file_paths]
    pages = []
    for loader in loaders:
        pages.extend(loader.load())

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2"),
        chunk_size=chunk_size,
        chunk_overlap=64,
        strip_whitespace=True,
    )
    docs = text_splitter.split_documents(pages)
    return docs


In [5]:
upload = widgets.FileUpload(accept=r"C:\Users\Pranita\OneDrive\Desktop\Data Analysis\SQL\Theory.pdf", multiple=False)
display(upload)

FileUpload(value={}, accept='C:\\Users\\Pranita\\OneDrive\\Desktop\\Pranita\\Pranita-cv\\Pranita_Dhole_CV_Recr…

In [12]:
import io

# Save the uploaded PDF
file_path = r"C:\Users\Pranita\OneDrive\Desktop\Data Analysis\SQL\Theory.pdf"
for filename, fileinfo in upload.value.items():
    file_path = filename
    with open(file_path, 'wb') as f:
        f.write(fileinfo['content'])

# Load and split
docs = load_and_split_pdfs([file_path])

# Check if any text was extracted
if not docs:
    raise ValueError("No text could be extracted from the uploaded PDF. Please try with a different file.")

print(f"Loaded {len(docs)} document chunks.")

encoder = Encoder()

# Try generating embeddings
texts = [doc.page_content for doc in docs]
embeddings = encoder.embedding_function.embed_documents(texts)

# Check if embeddings were created
if not embeddings:
    raise ValueError("Embeddings generation failed. Please check your embedding function or input texts.")

# Proceed only if everything is valid
faiss_db = FaissDb(docs, encoder.embedding_function)

print(f"PDF loaded and indexed with {len(docs)} chunks.")


Loaded 68 document chunks.
PDF loaded and indexed with 68 chunks.


In [13]:
while True:
    query = input("\nAsk a question (type 'exit' to stop): ")
    if query.lower() == 'exit':
        print("Chat ended.")
        break

    results = faiss_db.similarity_search(query)
    print("\n🔍 Top Relevant Answers:\n")
    for i, doc in enumerate(results, 1):
        print(f"{i}. {doc.page_content.strip()[:500]}\n")



Ask a question (type 'exit' to stop): what is this file about?

🔍 Top Relevant Answers:

1. Created by: Vinay Kumar Panika
Key Points:
Used to process row-by-row results.
Slower than set-based operations.
Helps in complex data manipulation.
Not recommended for large datasets.Example:

2. DELETE TRUNCATE
Removes specific rows based on a
condition using the WHERE clause.Removes all rows from the table without
any condition.
Can be rolled back using ROLLBACK if
inside a transaction.Cannot be rolled back once executed.
Slower because it logs each row deletion.Faster because it does not log individual
row deletions.
Maintains table structure and identity
column values.Resets identity column values to the initial
seed.
Index Fragmentation occurs when the logical order of index pages i

3. Created by: Vinay Kumar Panika2. RANK()
Assigns a rank to each row with the same values having the same rank, but skips ranks for
duplicate values.
Syntax:
3. DENSE_RANK()
Similar to RANK(), but does not s