In [3]:
!pip install langchain openai wikipedia-api chromadb fastapi uvicorn

!pip install langchain wikipedia-api faiss-cpu sentence-transformers gradio llama-cpp-python
!pip install huggingface_hub
!pip install -U langchain-community
!pip install faiss-cpu
!pip install wikipedia-api

Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting chromadb
  Downloading chromadb-1.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting fastapi
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.32.1-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from c

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.8.tar.gz (67.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0

In [5]:
!pip install fitz

!pip uninstall pymupdf --yes
!pip install pymupdf
!pip install langchain_huggingface
!pip install pypdf
!pip install faiss-cpu pypdf langchain sentence-transformers
!pip install pypdf
!pip install faiss-cpu pypdf langchain sentence-transformers

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Collecting traits>=6.2 (from nipype->fitz)
  Downloading traits-7.0.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Collecting acres (from nipype->fitz)
  Downloading acres-0.3.0-py3-none-any.whl.metadata (5.5 kB)
Collecting etelemetry>=0.3.1

In [12]:
# Step 0: Install packages
!pip install -q transformers accelerate sentence-transformers faiss-cpu langchain pymupdf

# Imports
import fitz  # PyMuPDF for PDF text extraction
import faiss
import numpy as np
from google.colab import files
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline

# Step 1: Upload PDF
uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]  # Get uploaded file name

# Step 2: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text

pdf_text = extract_text_from_pdf(pdf_path)

# Step 3: Split Text into Chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
text_chunks = text_splitter.split_text(pdf_text)

# Step 4: Convert Text to Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding_model)
retriever = vectorstore.as_retriever()

# Step 5: Load the Flan-T5 model locally
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    max_length=512
)

llm = HuggingFacePipeline(pipeline=qa_pipeline)

# Step 6: Setup Retrieval-Augmented QA Chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Step 7: Ask Questions
while True:
    query = input("💬 Ask a question (or type 'exit' to quit): ")
    if query.lower() == "exit":
        break
    response = qa_chain.run(query)
    print(f"🤖 Answer: {response}")


Saving Top 50 OS Interview Questions.pdf to Top 50 OS Interview Questions (2).pdf


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=qa_pipeline)


💬 Ask a question (or type 'exit' to quit): What is the use of a namespace in Web Development? 
🤖 Answer: Named Pipes (Different Processes)
💬 Ask a question (or type 'exit' to quit): What are the newly introduced input types in HTML5?
🤖 Answer:  Pipes (Same Process)
💬 Ask a question (or type 'exit' to quit): What are the five elements that support media content in  HTML5?
🤖 Answer: HTML5 is a web browser that allows users to interact with HTML5 content.
💬 Ask a question (or type 'exit' to quit): 
🤖 Answer: a).
💬 Ask a question (or type 'exit' to quit): . What is the difference between Canvas and SVG? 
🤖 Answer: Ans : S.No
💬 Ask a question (or type 'exit' to quit): Explain the main purpose of an operating system? 
🤖 Answer: Ans : An operating system is to provide an environment in which a memory management, disk management, process management and task management.
💬 Ask a question (or type 'exit' to quit): What is demand paging? 
🤖 Answer: a memory management technique that is used in ope

KeyboardInterrupt: Interrupted by user