In [None]:
!pip install langchain langchain_community langchain_core langchain_openai langchain_text_splitters
!pip install chromadb
!pip install langchainhub
!pip install chromadb openai langchain

In [None]:
pip install PyPDF2 langchain sentence-transformers faiss-cpu openai

In [None]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.schema.document import Document

In [None]:
import PyPDF2

# Path to the source PDF. The active value points into Google Drive, so the drive
# presumably has to be mounted at /content/drive before this cell runs — TODO confirm.
# Point this at a local copy of the file when running outside Colab.
pdf_path = "/content/drive/My Drive/GEP-Jan-2025.pdf"

def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, each page followed by a newline.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        str: The page texts concatenated in page order, one trailing "\n" per
        page; an empty string when the PDF has no pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Pages are consumed while the file is still open. A page that yields
        # no text is treated as an empty string (defensive guard) so it cannot
        # break the concatenation, and the single join avoids rebuilding the
        # accumulated string once per page.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf_reader.pages]
    return "".join(page_texts)

# Run the extraction, then show only the first 500 characters as a sanity check.
pdf_text = extract_text_from_pdf(pdf_path)
preview = pdf_text[:500]
print(preview)

In [None]:
# Display the complete extracted text as the cell output (can be very long).
pdf_text

In [None]:
# Number of characters in the extracted text.
len(pdf_text)

In [None]:
# Split the extracted text into chunks of at most 500 characters, with 50
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_text(pdf_text)

print(f"Total Chunks: {len(chunks)}")
# Clamp the sample index: a fixed chunks[100] raises IndexError whenever the
# document produces 100 chunks or fewer.
if chunks:
    sample_index = min(100, len(chunks) - 1)
    print(f"Sample Chunk: {chunks[sample_index]}")


In [None]:
# Second sample, clamped so the lookup cannot raise IndexError on a short document.
if chunks:
    print(f"Sample Chunk: {chunks[min(120, len(chunks) - 1)]}")

In [None]:
# Wrap every text chunk in a Document, preserving chunk order.
splits = list(Document(page_content=chunk_text) for chunk_text in chunks)

In [None]:
# Import from langchain_community: the `langchain.vectorstores` and
# `langchain.embeddings` paths are deprecated re-exports, and re-importing Chroma
# from there would shadow the langchain_community class imported earlier.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Embedding model used to turn each chunk into a vector.
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#embedding_function = HuggingFaceEmbeddings(model_name="multilingual-e5-large-instruct")

# Embed the document chunks and store them in ChromaDB
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_function)

print("ChromaDB initialized with PDF embeddings!")


In [None]:
#from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Prompt template for the question-answering system.
# {context} is filled with the retrieved document text and {question} with the
# user's question; the model's answer follows "Helpful Answer:".
# These notes must stay outside the triple-quoted string: anything inside it,
# including "#" remarks, is sent to the model verbatim as part of the prompt.
prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Always provide the citation for your answer.
Always say "Let me know if you need further help" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""

# Bind the template text to its two placeholders, {context} and {question}.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [None]:
# Colab's userdata module gives access to the secrets stored with the notebook.
from google.colab import userdata
import os
import openai

# Fetch the OpenAI API key from Colab Secrets.
api_key = userdata.get('OPEN_API_KEY')  # <-- change this as per your secret's name
if not api_key:
    # Fail here with a clear message instead of a TypeError from os.environ below.
    raise ValueError("Colab secret 'OPEN_API_KEY' is empty; store your OpenAI API key in it.")

# The OpenAI SDK and langchain_openai look the key up under OPENAI_API_KEY, so export
# it under that name. The original OPEN_API_KEY variable is still set so that anything
# already reading it keeps working.
os.environ['OPENAI_API_KEY'] = api_key
os.environ['OPEN_API_KEY'] = api_key

# Also set the key directly on the openai module.
openai.api_key = api_key


In [None]:
# Retriever over the existing `vectorstore`, using its default search settings.
# (An OpenAIEmbeddings-backed Chroma store could be built instead of the current one.)
retriever = vectorstore.as_retriever()

# Chat model that writes the final answer; temperature=0 keeps sampling randomness at its minimum.
llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
    api_key=api_key,
)
def format_docs(pdf_text):
    """Join the text of retrieved documents into one context string.

    Args:
        pdf_text: Iterable of document objects exposing a ``page_content``
            string (despite the name, this is not the raw PDF text).

    Returns:
        str: The ``page_content`` values separated by a blank line; an empty
        string when no documents are given.
    """
    # Use a real blank line as separator. The previous "\\n\\n" literal put the
    # four characters backslash-n-backslash-n between documents instead.
    return "\n\n".join(doc.page_content for doc in pdf_text)


In [None]:
# First stage: map the incoming question to the prompt's two inputs. The retrieved
# documents are formatted into "context"; the question itself passes through unchanged.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Full pipeline: build inputs -> fill the prompt -> call the model -> plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [None]:
# Smoke-test the chain with one sample question and print the answer.
sample_question = "what are Outlook and risks of Regional perspectives"
result = rag_chain.invoke(sample_question)
print(result)

In [None]:
pip install gradio

In [None]:
import gradio as gr

# 4️⃣ Define the function for Gradio UI
def answer_question(question):
    """Answer a user question with the RAG chain.

    Args:
        question: Text typed by the user; surrounding whitespace is ignored.

    Returns:
        str: The value returned by ``rag_chain.invoke``, or a short prompt to
        enter a question when the input is empty or whitespace only.
    """
    cleaned = (question or "").strip()
    if not cleaned:
        # Skip the retrieval and model call entirely for a blank submission.
        return "Please enter a question."
    return rag_chain.invoke(cleaned)

# Build the Gradio UI: one text box for the question, one for the answer.
question_box = gr.Textbox(label="Enter your question")
answer_box = gr.Textbox(label="AI Answer")

iface = gr.Interface(
    fn=answer_question,
    inputs=question_box,
    outputs=answer_box,
    title="Gradio RAG Chatbot with ChromaDB & OpenAI",
    description="Ask any question, and the AI will retrieve relevant documents and generate an answer.",
)

# Start the Gradio app.
iface.launch()