1. Install the Dependencies

In [1]:
!pip install openai langchain pymupdf tqdm pinecone sentence-transformers faiss-cpu transformers accelerate

Collecting openai
  Downloading openai-1.93.0-py3-none-any.whl.metadata (29 kB)
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (4.8 kB)
Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp311-cp3

2. Import the required Packages

In [3]:
import os
import fitz
import faiss
import numpy as np
from tqdm import tqdm
# from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from openai import OpenAI
from pinecone import Pinecone
from IPython.display import FileLink, FileLinks


  from .autonotebook import tqdm as notebook_tqdm


3. Set the OpenAI API Key, Pinecone API Key and Pinecone Index

In [None]:
import getpass

# Secrets are read with getpass so they are never echoed into the notebook
# output. Pasted values often carry stray whitespace/newlines, so strip them
# before handing them to the clients.
openai_api_key = getpass.getpass("Enter your OpenAI API Key: ").strip()
client = OpenAI(api_key=openai_api_key)

pinecone_api_key = getpass.getpass("Enter your Pinecone API Key: ").strip()
# Also exported through the environment for any code that reads the key from there.
os.environ["PINECONE_API_KEY"] = pinecone_api_key
pc = Pinecone(api_key=pinecone_api_key)

index_name = input("Enter Index Name").strip()
if not index_name:
    # Fail here with a clear message instead of passing an empty name to Pinecone.
    raise ValueError("Pinecone index name must not be empty")
index = pc.Index(index_name)

4. Upload the required files (I am loading my resume)

In [None]:
# `files` lives in google.colab, which only exists inside Colab, so import it
# here under a guard instead of assuming an earlier cell provided it.
try:
    from google.colab import files as _colab_files
except ImportError:
    _colab_files = None

if _colab_files is not None:
    # Colab upload widget: returns {filename: file bytes}.
    uploaded_files = _colab_files.upload()
else:
    # Outside Colab: ask for local paths and build the same {name: bytes} mapping,
    # so the cells below can keep iterating over the keys as file paths.
    _raw_paths = input("Enter path(s) to the .pdf/.txt file(s), comma-separated: ")
    uploaded_files = {}
    for _path in (part.strip() for part in _raw_paths.split(",")):
        if not _path:
            continue
        if not os.path.isfile(_path):
            raise FileNotFoundError(f"File not found: {_path}")
        with open(_path, "rb") as _fh:
            uploaded_files[_path] = _fh.read()

5. Extract and split the text from the uploaded file(s)

In [None]:
def extract_text_from_file(file_path):
    """Return the full text content of a PDF or plain-text file.

    Args:
        file_path: Path to the file. The type is chosen from the file
            extension, compared case-insensitively (".pdf" or ".txt").

    Returns:
        str: For PDFs, the text of every page joined with newlines; for
        text files, the file content decoded as UTF-8.

    Raises:
        ValueError: If the extension is neither ".pdf" nor ".txt".
    """
    # Lower-case only the copy used for matching so "CV.PDF" is accepted while
    # the original path is still the one that gets opened.
    lowered = os.fspath(file_path).lower()
    if lowered.endswith(".pdf"):
        # Context manager closes the document handle once the text is read.
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    if lowered.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    raise ValueError("Unsupported file type")

# Splitter configured with chunk_size=500 and chunk_overlap=50, so neighbouring
# chunks share some text across the cut.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# One record per chunk: a "<filename>-<position>" id, the chunk text, and the
# source file kept as metadata.
documents = []
for filename in uploaded_files:
    raw_text = extract_text_from_file(filename)
    chunks = text_splitter.split_text(raw_text)
    documents.extend(
        {"id": f"{filename}-{i}", "text": chunk, "metadata": {"source": filename}}
        for i, chunk in enumerate(chunks)
    )
print(f"Chunks Created: {len(documents)}")


6. Generate embeddings and create the FAISS index

In [None]:
# Without any chunks the embedding matrix has no second dimension and the
# index cannot be sized; stop here with a clear message instead.
if not documents:
    raise ValueError("No text chunks to embed; upload a file with extractable text first")

embed_model = SentenceTransformer('all-MiniLM-L6-v2')
texts = [doc["text"] for doc in documents]
metas = [doc["metadata"] for doc in documents]
ids = [doc["id"] for doc in documents]

# FAISS works on C-contiguous float32 matrices; make that explicit rather than
# relying on the default dtype returned by encode().
embeddings = np.ascontiguousarray(
    embed_model.encode(texts, show_progress_bar=True), dtype="float32"
)
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

# Row position in the index -> chunk text and metadata, used to turn search
# results back into text.
doc_lookup = {
    i: {"text": text, "metadata": meta}
    for i, (text, meta) in enumerate(zip(texts, metas))
}

7. Function for retrieving the top-k chunks for a query (change the value of k as required)

In [None]:
def retrieve_top_k(query, k=3):
    """Return the text of the chunks closest to ``query`` in the FAISS index.

    Args:
        query: Question or search string to embed.
        k: Maximum number of chunks to return (default 3).

    Returns:
        list[str]: Chunk texts in the order returned by the index search.
        Holds fewer than ``k`` items when the index has fewer vectors, and is
        empty when ``k`` is not positive or the index is empty.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which is not a key of doc_lookup.
    k = min(int(k), faiss_index.ntotal)
    if k <= 0:
        return []
    query_vec = np.asarray(embed_model.encode([query]), dtype="float32")
    _, indices = faiss_index.search(query_vec, k)
    # Still filter negative ids defensively; int() turns numpy integers into plain keys.
    return [doc_lookup[int(idx)]["text"] for idx in indices[0] if idx >= 0]

8. Set up the function for answering questions using FLAN-T5 (offline QA) [I am on the free usage tier for both OpenAI and Pinecone]

In [None]:
# Cache of already-built generation pipelines, keyed by model name, so the
# model is loaded once instead of on every question.
_QA_PIPELINES = {}


def _get_qa_pipeline(model_name="google/flan-t5-base"):
    """Return the text2text-generation pipeline for ``model_name``, building it on first use."""
    if model_name not in _QA_PIPELINES:
        _QA_PIPELINES[model_name] = pipeline(
            "text2text-generation", model=model_name, max_length=512
        )
    return _QA_PIPELINES[model_name]


def answer_query_offline(query, k=3):
    """Answer ``query`` with the local FLAN-T5 model, using retrieved chunks as context.

    Args:
        query: The question to answer.
        k: Number of chunks to retrieve as context (default 3).

    Returns:
        str: The text generated by the model for the assembled prompt.
    """
    top_chunks = retrieve_top_k(query, k)
    context = "\n\n".join(top_chunks)
    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {query}"
    # Reuse the cached pipeline; building it is the expensive part of a call.
    model = _get_qa_pipeline()
    response = model(prompt)
    return response[0]['generated_text']

9. Querying the model and getting the answers

In [None]:
# Edit this question to ask something else about the uploaded document(s).
query = "What is the total experience mentioned in the document?"

# Retrieve the closest chunks and let the offline model answer from them.
answer = answer_query_offline(query)
print("Answer:\n", answer)