In [None]:
pip install langchain-experimental langchain-openai langchain-huggingface langchain-community PyPDF2 ragas transformers chromadb langchain-chroma pypdf==5.0 streamlit typing

In [1]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores import Chroma # Importing Chroma vector store from Langchain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
import os # Importing os module for operating system functionalities
from chromadb import Documents,EmbeddingFunction,Embeddings
import chromadb
from chromadb.config import Settings
from langchain_openai import ChatOpenAI # the LLM we are going to use today
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import SentenceTransformerEmbeddings

In [4]:
# Ensure the local folder that will hold the source PDFs exists.
import os

DATA_PATH = './data/'

# exist_ok=True makes re-running this cell harmless when the folder is already there.
os.makedirs(name=DATA_PATH, exist_ok=True)

In [None]:
#UPLOAD RELEVANT FILES
import shutil

try:
    from google.colab import files
except ImportError:
    # Outside Colab there is no upload widget; keep the cell runnable so that
    # "Run All" still works when the PDFs were placed in the data folder by hand.
    uploaded = {}
    print("google.colab is not available; copy your PDF files into the data folder manually.")
else:
    uploaded = files.upload()

# Move every uploaded file from the working directory into the data folder.
# shutil.move is used instead of os.rename because it also works when the
# destination lives on a different filesystem (e.g. a mounted drive).
for filename in uploaded:
    shutil.move(filename, os.path.join(DATA_PATH, filename))

In [2]:
DATA_PATH="./data/"
def load_documents(data_path=None):
    """Load every PDF found in a directory.

    Args:
        data_path: Directory to read PDFs from. Defaults to the module-level
            DATA_PATH, looked up at call time.

    Returns:
        Whatever ``PyPDFDirectoryLoader(...).load()`` returns for that
        directory (a list of langchain Document objects).
    """
    directory = DATA_PATH if data_path is None else data_path
    document_loader = PyPDFDirectoryLoader(directory)
    return document_loader.load()
documents = load_documents()

# Preview the first loaded page. Indexing an empty list would raise IndexError,
# so report the empty case explicitly instead of crashing the cell.
if documents:
    print(documents[0])
else:
    print(f"No PDF pages were loaded from {DATA_PATH}; add files there and re-run this cell.")

page_content='Karim Alizadeh
Borderland Projects of Sasanian Empire:
Intersection of Domestic and Foreign Policies
Abstract: The landscapes of the Sasanian Empire have been studied in detail and
have been remarkably well interpreted during past few decades. Recent research
in borderland areas has also increased our knowledge of Sasanian policies in
borderlands. The Sasanian Empire is well known for massive construction work.Projects such as construction of fortifications and defensive walls, irrigation
systems, fortified towns and cities in the Sasanian period usually are attributed to
the reigns of Kawad I and his son Husraw I Anushirwan in the sixth century. This
attribution mostly derives from historical documents in which Husraw is seen as
primarily responsible for these massive projects. Recent archaeological re-
searches in the Gorg ān plain in the northeast of Iran and in Mughan Steppe in
Iranian Azerbaijan have demonstrated the possibility of dating these projects
earlier in th

In [4]:
# Chunking process
def split_text(documents: list[Document], chunk_size: int = 300, chunk_overlap: int = 100):
    """Split loaded documents into overlapping chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
        ``add_start_index=True`` is passed to the splitter so each chunk's
        metadata records where it started in its source text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} pages into {len(chunks)} chunks")
    return chunks

In [5]:
# Load the BAAI/bge-m3 sentence-embedding model once; the embedding function
# defined below reuses this single instance.
embedding_model = SentenceTransformer("BAAI/bge-m3")


class SentenceTransformerEmbeddingFunction(EmbeddingFunction):
    """Adapter that lets Chroma embed text with a SentenceTransformer model."""

    def __init__(self, model):
        # Keep a reference to the already-loaded encoder.
        self.model = model

    def __call__(self, texts: Documents) -> Embeddings:
        """Encode ``texts`` with the wrapped model and return the result of ``.tolist()``."""
        return self.model.encode(texts).tolist()


# Shared embedding function handed to the Chroma collection.
ef_function = SentenceTransformerEmbeddingFunction(embedding_model)


In [9]:
# Recursively delete ./chroma_db; -f keeps the command quiet if the path does not exist.
# NOTE(review): the PersistentClient created below stores its data in "./chroma_db_fresh",
# which this command leaves in place — confirm which directory is meant to be cleared.
! rm -rf ./chroma_db

In [11]:
from chromadb import PersistentClient

# On-disk location of the Chroma database used by this notebook.
CHROMA_DB_DIR = "./chroma_db_fresh"

# Open the persistent store, then fetch the "documents" collection (creating it
# if it does not exist yet) with the notebook's embedding function attached.
client = PersistentClient(path=CHROMA_DB_DIR)
collection = client.get_or_create_collection("documents", embedding_function=ef_function)


In [12]:
def save_to_chroma(chunks: list[Document], batch_size: int = 1000):
    """Write chunks into the module-level Chroma ``collection``.

    Each chunk is stored under the id ``doc_<index>`` together with its text
    and metadata. ``upsert`` is used rather than ``add`` so that re-running the
    notebook replaces the content stored under an existing id instead of
    leaving the old text in place.

    Args:
        chunks: Chunks to store; an empty list is a no-op.
        batch_size: Maximum number of chunks sent to Chroma per call, so large
            corpora do not exceed the store's per-request limit.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Chroma rejects empty id/document lists, so bail out early.
    if not chunks:
        print("No chunks to save; the collection was left unchanged")
        return

    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        collection.upsert(
            # Ids stay global across batches: doc_0, doc_1, ...
            ids=[f"doc_{start + offset}" for offset in range(len(batch))],
            documents=[chunk.page_content for chunk in batch],
            metadatas=[chunk.metadata for chunk in batch],
        )

    print(f"Saved {len(chunks)} chunks to a chroma instance")

In [13]:
def generate_data_store():
    """Run the ingestion pipeline: load the PDFs, chunk them, store the chunks in Chroma."""
    pages = load_documents()
    pieces = split_text(pages)
    save_to_chroma(pieces)


# Read variables from a local .env file into the environment, then build the store.
load_dotenv()
generate_data_store()

Split 23 pages into 291 chunks
Saved 291 chunks to a chroma instance


In [14]:
# Sample question; it is passed to query_rag() further down to try out the pipeline.
query_text="What were the impacts of irrigated agriculture in Sasanian Iran"

In [15]:
# Prompt sent to the chat model. query_rag() fills {context} with the retrieved
# chunks and {question} with the user's query via ChatPromptTemplate.
PROMPT_TEMPLATE = """
Answer the question based only on the following context: If you don't know the answer, please say I dont have enough information to answer that.
{context}
 - -
Answer the question based on the above context: {question}
"""

In [16]:
def query_rag(query_text):
    """Answer a question with the chat model, using retrieved chunks as context.

    The three nearest chunks to ``query_text`` are fetched from the "documents"
    Chroma collection, inserted into PROMPT_TEMPLATE and sent to the chat model
    behind the OpenRouter endpoint.

    Args:
        query_text: The user's question.

    Returns:
        The model's answer as a string, or None when the collection returns
        no matching chunks.

    Raises:
        RuntimeError: If neither OPENROUTER_API_KEY nor OPENAI_API_KEY is set
            in the environment.
    """
    # Pass the embedding function by keyword: not every chromadb release has it
    # as the second positional parameter of get_collection.
    collection = client.get_collection("documents", embedding_function=ef_function)
    results = collection.query(
        query_texts=[query_text],
        n_results=3,
        include=['distances', 'documents', 'metadatas'],
    )
    matches = results['documents'][0]
    if len(matches) == 0:
        print("Unable to find any matching results.")
        return None

    # SECURITY: the API key is read from the environment (e.g. loaded from a
    # .env file) instead of being embedded in the notebook. Any key that was
    # ever committed in source must be treated as leaked and revoked.
    api_key = os.environ.get("OPENROUTER_API_KEY") or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No API key found: set OPENROUTER_API_KEY (or OPENAI_API_KEY) in the "
            "environment or in your .env file."
        )

    context_text = "\n\n -- \n\n".join(matches)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    model = ChatOpenAI(
        openai_api_base="https://openrouter.ai/api/v1",
        openai_api_key=api_key,
        temperature=0.1,
    )
    response = model.invoke(prompt)
    response_text = response.content
    print(f"Response: {response_text}")
    return response_text
# Run the sample question through the RAG pipeline; the returned answer is shown as the cell output.
query_rag(query_text)


Response: The impacts of irrigated agriculture in Sasanian Iran included playing profound roles in politics, contributing to state formation, transforming landscapes in Mesopotamia and Khuzistan, and leading to an intensification of investment in cultivation.


'The impacts of irrigated agriculture in Sasanian Iran included playing profound roles in politics, contributing to state formation, transforming landscapes in Mesopotamia and Khuzistan, and leading to an intensification of investment in cultivation.'

In [None]:
import gradio as gr

# Inline CSS for styling.
# The string is a complete <style> element that is injected into the page with
# gr.HTML(custom_css) when the UI is built below. Its rules target the classes
# .main-header, .chat-container, .user-message, .bot-message and .info-box, plus
# every <button> element.
custom_css = """
<style>
    .main-header {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .chat-container {
        background-color: #f8f9fa;
        border-radius: 10px;
        padding: 20px;
        margin-bottom: 20px;
        max-height: 500px;
        overflow-y: auto;
    }
    .user-message {
        background-color: #5ba2ab;
        padding: 10px;
        border-radius: 10px;
        margin: 5px 0;
        text-align: right;
    }
    .bot-message {
        background-color: #5bab70;
        padding: 10px;
        border-radius: 10px;
        margin: 5px 0;
        text-align: left;
    }
    button {
        width: 100%;
    }
    .info-box {
        background-color: #e7f3ff;
        padding: 15px;
        border-radius: 10px;
        margin: 10px 0;
    }
</style>
"""

# Chat history: reuse the list left by an earlier run of this cell in the same
# kernel, otherwise start with the assistant's greeting.
messages = globals().get(
    "messages",
    [{"role": "assistant", "content": "Hello! I'm a demo RAG chatbot. Try asking me anything!"}],
)

def ask_assistant(user_input: str, history):
    """Record one chat turn: the user's message followed by the bot's reply.

    Args:
        user_input: Text typed by the user.
        history: List of ``{"role": ..., "content": ...}`` dicts; it is
            extended in place.

    Returns:
        A ``(history, "")`` tuple; the empty string is meant for clearing the
        input box.
    """
    history.append({"role": "user", "content": user_input})

    bot_response = query_rag(user_input)
    if bot_response is None:
        # query_rag returns None when retrieval finds nothing; show a readable
        # message instead of storing None (which would be rendered as "None").
        bot_response = "I don't have enough information to answer that."

    history.append({"role": "assistant", "content": bot_response})
    return history, ""

def format_chat(messages):
    """Render the chat history as an HTML string.

    Args:
        messages: Iterable of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        One ``<div>`` per message, concatenated: class ``user-message`` for
        role ``"user"``, class ``bot-message`` for every other role.
    """
    import html  # local import keeps this cell self-contained

    parts = []
    for msg in messages:
        # Escape the text so that user input or model output containing
        # markup is displayed literally instead of being injected into the page.
        content = html.escape(str(msg["content"]))
        if msg["role"] == "user":
            parts.append(f'<div class="user-message"><strong>You:</strong> {content}</div>')
        else:
            parts.append(f'<div class="bot-message"><strong>Bot:</strong> {content}</div>')
    return "".join(parts)

with gr.Blocks() as demo:
    gr.HTML(custom_css)
    gr.HTML('<h1 class="main-header">🤖 RAG Chatbot Demo</h1>')

    # custom_css styles the chat area through the `.chat-container` *class*
    # selector, so the component must carry that class for the rule to apply.
    # The element id is kept as well in case anything refers to it.
    chat_display = gr.HTML(
        format_chat(messages),
        elem_id="chat-container",
        elem_classes="chat-container",
    )

    user_input = gr.Textbox(
        placeholder="Type your message here...",
        label="",
        lines=1,
        interactive=True
    )

    # Store chat messages in state
    state = gr.State(messages)

    def submit(user_message, history):
        """Handle a click on "Send".

        Returns the re-rendered chat HTML, an empty string that clears the
        textbox, and the updated history for the state component.
        """
        # Ignore blank submissions instead of sending an empty query to the pipeline.
        if not user_message or not user_message.strip():
            return format_chat(history), "", history
        history, _ = ask_assistant(user_message, history)
        chat_html = format_chat(history)
        return chat_html, "", history

    submit_btn = gr.Button("Send")
    submit_btn.click(submit, inputs=[user_input, state], outputs=[chat_display, user_input, state])

if __name__ == "__main__":
    demo.launch()
