In [None]:
!pip install langchain_community
!pip install pypdf
!pip install langchain_google_genai
!pip install chromadb
!pip install langchain
!pip install -U langchain-huggingface

In [32]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings

In [33]:
# Initialize embedding function using HuggingFace
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=model_name)

In [34]:
# Initialize Flan-T5 pipeline for text generation
def initialize_flan_t5():
    try:
        generator = pipeline("text2text-generation", model="google/flan-t5-large")
        print("Flan-T5 model initialized successfully!")
        return generator
    except Exception as e:
        print(f"Error initializing Flan-T5: {str(e)}")
        return None

In [35]:
# 1. Load PDF and extract text (This will only be called once)
def load_pdf(file_path):
    try:
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()
        documents = pages[0].page_content + pages[1].page_content
        print("PDF loaded successfully!")
        return documents
    except Exception as e:
        print(f"Error loading PDF: {str(e)}")
        return None

# 2. Split text into smaller chunks (This will only be called once)
def split_text(documents, chunk_size=100, chunk_overlap=20):
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        texts = text_splitter.create_documents([documents])
        print("Text split successfully!")
        return texts
    except Exception as e:
        print(f"Error splitting text: {str(e)}")
        return None

# 3. Create a vector store for documents
def create_vectorstore(texts):
    try:
        vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=embedding_function
        )
        print("Vectorstore created successfully!")
        return vectorstore
    except Exception as e:
        print(f"Error creating vectorstore: {str(e)}")
        return None

In [36]:
# 4. Retrieve relevant documents for a query
def retrieve_documents(vectorstore, query, k=1):
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
        retrieved_docs = retriever.get_relevant_documents(query)
        print("Documents retrieved successfully!")
        return retrieved_docs
    except Exception as e:
        print(f"Error retrieving documents: {str(e)}")
        return []

In [37]:
# 5. Generate a response using Flan-T5
def generate_response_flan(retrieved_docs, query, generator):
    try:
        context = " ".join([doc.page_content for doc in retrieved_docs])
        prompt = f"Based on the following context:\n{context}\nAnswer the query: {query}"
        answer = generator(prompt, max_length=500)
        return answer[0]["generated_text"]
    except Exception as e:
        print(f"Error generating response with Flan-T5: {str(e)}")
        return "Error generating response"

In [38]:
# 6. Initialize the environment
def initialize_environment(file_path, chunk_size=100, chunk_overlap=20):
    documents = load_pdf(file_path)
    if not documents:
        return None, None

    texts = split_text(documents, chunk_size, chunk_overlap)
    if not texts:
        return None, None

    vectorstore = create_vectorstore(texts)
    if not vectorstore:
        return None, None

    return vectorstore, texts

In [39]:
# 7. Query function
def run_query_flan(vectorstore, query, generator, k=1):
    retrieved_docs = retrieve_documents(vectorstore, query, k)
    if not retrieved_docs:
        print("No documents retrieved.")
        return

    answer = generate_response_flan(retrieved_docs, query, generator)
    print("Answer:")
    print(answer)

In [40]:
# Example usage:
file_path = "/content/Resume.pdf"
vectorstore, texts = initialize_environment(file_path)
generator = initialize_flan_t5()

PDF loaded successfully!
Text split successfully!
Vectorstore created successfully!


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Flan-T5 model initialized successfully!


In [41]:
if vectorstore and generator:
    query_1 = "Tell me about Shahnoor's experience mentioned in the resume"
    run_query_flan(vectorstore, query_1, generator)

    query_2 = "What are Shahnoor's skills, Mention all Tech Skills?"
    run_query_flan(vectorstore, query_2, generator)

    query_3 = "What is the relationship between Shahnoor and Toufiq"
    run_query_flan(vectorstore, query_3, generator)

else:
    print("Failed to initialize the environment or model.")


Documents retrieved successfully!
Answer:
Jr. Software Engineer
Documents retrieved successfully!
Answer:
Jr. Software Engineer
Documents retrieved successfully!
Answer:
brother
