In [None]:
!apt install tesseract-ocr -y
!apt install tesseract-ocr-ben -y
!apt install poppler-utils -y
!pip install pytesseract pdf2image

In [None]:
# Installation of packages
!pip install langchain
#framework that use LLM and tools to build agent
!pip install langchain_community
#supportive tools(PDF loader,Web loader) for langchain
!pip install langchain_core
#for minimal and low level control
!pip install langchain_openai
#bridge between langchain and openai
!pip install transformers
#pretrained brain(LLM,encode,tokenize)for chatbot
!pip install sentence_transformers
#use for converting text into vectors
!pip install faiss-cpu
#for semantic search in vector database
!pip install pypdf
#for reading PDF and text extraction
!pip install fastapi
#building a backend service that responds to HTTP requests like GET, POST, etc
!pip install uvicorn
#server that runs your API so it can receive and respond to requests.
!pip install gdown
#for PDF dowload from drivet
!pip install google-generativeai

In [None]:
#Import Libraries
import os
import re
from getpass import getpass

import gdown
import pytesseract
from fastapi import FastAPI, Request
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from pdf2image import convert_from_path
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Download the source PDF from Google Drive.
pdf_url = "https://drive.google.com/uc?id=19h7t3xVNBS5KR9o2sLMh3i2uaje7ww7n"
output = "hsc26_bangla_1st_paper.pdf"

# Skip the network round-trip when the file is already present, so that
# "Restart & Run All" stays cheap and works offline after the first run.
if not os.path.exists(output):
    downloaded_path = gdown.download(pdf_url, output, quiet=False)
    # gdown signals some failures by returning None; fail here with a clear
    # message rather than later inside the OCR cell.
    if downloaded_path is None:
        raise RuntimeError(f"Could not download {pdf_url} to {output}")

In [None]:
# Document loading: render the PDF pages to images and OCR each one with
# Tesseract using the Bengali ("ben") language data.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
custom_config = r'--oem 3 --psm 6 -l ben'
pages = convert_from_path("hsc26_bangla_1st_paper.pdf", dpi=300)

# Gather the text of every page (each followed by a blank line) and join once.
page_texts = []
for page_number, page_image in enumerate(pages, start=1):
    print(f"Processing page {page_number}/{len(pages)}...")
    page_text = pytesseract.image_to_string(page_image, config=custom_config)
    page_texts.append(page_text + "\n\n")
text = "".join(page_texts)

# Keep the raw OCR output on disk for later inspection.
with open("ocr_output.txt", mode="w", encoding="utf-8") as raw_file:
    raw_file.write(text)

# Raw OCR text wrapped as a single LangChain Document.
ocr_doc = [Document(page_content=text)]

In [None]:
# Show the first 1000 characters of the raw OCR text as a sanity check.
ocr_preview = text[:1000]
print("✅ Sample Bangla OCR Output:\n", ocr_preview, sep="\n")

In [None]:
# Text-cleaning helper for the OCR output
def clean_ocr_text(text):
  """Normalise raw OCR text into a single whitespace-collapsed string.

  Steps, in order:
    1. Collapse runs of newlines into one newline.
    2. Replace every newline that sits between two characters with a space.
    3. Replace runs of characters that are neither ASCII, Bengali
       (U+0980-U+09FF), the danda / double danda (U+0964, U+0965) nor
       ZWNJ / ZWJ (U+200C, U+200D) with a single space.
    4. Collapse runs of whitespace into one space and strip both ends.

  Args:
    text: Raw OCR text (str).

  Returns:
    The cleaned text (str).
  """
  text = re.sub(r'\n+', '\n', text)
  # Lookarounds do not consume the neighbouring characters, so consecutive
  # one-character lines ("a\nb\nc") are all joined rather than every other one.
  text = re.sub(r'(?<=[^\n])\n(?=[^\n])', ' ', text)
  # The Bangla sentence terminator "।" is U+0964, which lies OUTSIDE the Bengali
  # block; it must be whitelisted explicitly or every sentence boundary is
  # erased. ZWNJ/ZWJ occur inside Bangla words, and replacing them with a space
  # would split those words in two.
  text = re.sub(r'[^\x00-\x7F\u0980-\u09FF\u0964\u0965\u200C\u200D]+', ' ', text)
  text = re.sub(r'\s{2,}', ' ', text)
  return text.strip()

# Run the cleaner over the raw OCR text; later cells work from `cleaned_text`.
cleaned_text = clean_ocr_text(text)

In [None]:
# Wrap the cleaned text in a one-element list of LangChain Documents.
cleaned_page = Document(page_content=cleaned_text)
ocr_cleaned_document = [cleaned_page]

In [None]:
# Show the first 1000 characters of the cleaned text as a sanity check.
cleaned_preview = cleaned_text[:1000]
print("Cleaned OCR text sample:\n", cleaned_preview, sep="\n")

In [None]:
# Persist the cleaned text to disk as UTF-8.
with open("ocr_cleaned_output.txt", mode="w", encoding="utf-8") as cleaned_file:
    cleaned_file.write(cleaned_text)


In [None]:
# Create chunks from the cleaned text.
# Separator candidates: blank line, Bangla danda, newline, space, and finally
# the empty string (character-level fallback).
chunk_separators = ["\n\n", "।", "\n", " ", ""]
splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    chunk_size=1000,
    chunk_overlap=150,
)

chunk_document = splitter.split_documents(ocr_cleaned_document)

In [None]:
# Number of chunks produced by the splitter (displayed as the cell output).
len(chunk_document)

In [None]:
# Print the first three chunks for a quick visual check.
for chunk_number, chunk_doc in enumerate(chunk_document[:3], start=1):
    print(f"--- Chunk {chunk_number} ---\n{chunk_doc.page_content}\n")


In [None]:
# Display the second chunk object (index 1) as the cell output.
chunk_document[1]

In [None]:
# Set up the embedding model (multilingual sentence-transformers checkpoint).
embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [None]:
# Embed every chunk and build a FAISS vector store from the result.
vector_store = FAISS.from_documents(
    documents=chunk_document,
    embedding=embedding_model,
)

In [None]:
# Persist the vector store to a local folder.
vector_store.save_local(folder_path="FAISS_index_vector_store_hsc_bangla_pdf")

In [None]:
# Reload the vector store that was just saved.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data; only use it on an index folder you created yourself.
load_document = FAISS.load_local(
    folder_path="FAISS_index_vector_store_hsc_bangla_pdf",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

In [None]:
# Create the retriever: similarity search with k=5.
top_k_search = {"k": 5}
retriever = load_document.as_retriever(
    search_kwargs=top_k_search,
    search_type="similarity",
)

In [None]:
# Create the LLM.
# Reuse a key already present in the environment so "Run All" works unattended;
# otherwise prompt with getpass, which does not echo the key into the cell
# output (input() would store it in the saved notebook).
token = (os.environ.get("OPENAI_API_KEY") or getpass("Enter your API Key")).strip()
if not token:
    raise ValueError("An OpenAI API key is required to create the LLM.")
os.environ["OPENAI_API_KEY"] = token

# NOTE(review): temperature=0.7 makes answers non-deterministic; consider a
# lower value for document-grounded question answering.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    base_url="https://api.openai.com/v1",
    api_key=token,
    temperature=0.7
)




In [None]:
# Create the LLM-based document compressor.
compressor = LLMChainExtractor.from_llm(llm=llm)
# Create the compressed retriever: the base retriever combined with the compressor.
compressed_retriver = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

In [None]:
# Prompt template (Bangla) with two placeholders: {context} and {question}.
qa_prompt_text = """
তুমি একজন সহায়ক শিক্ষক। নিচের ডকুমেন্ট অংশের ভিত্তিতে প্রশ্নের উত্তর দাও।
শুধু ডকুমেন্টের তথ্য ব্যবহার করো, নিজে কিছু যোগ করো না।

ডকুমেন্ট:
{context}

প্রশ্ন:
{question}

উত্তর বাংলায় সংক্ষেপে দাও:
"""

prompt = PromptTemplate(
    template=qa_prompt_text,
    input_variables=["context", "question"],
)

In [None]:
# Memory integration: conversation buffer stored under "chat_history",
# recording the chain's "answer" output.
memory_options = dict(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)
memory = ConversationBufferMemory(**memory_options)

In [None]:
# Create the chain that connects the LLM, compressed retriever, memory and prompt.
qa_prompt_kwargs = {"prompt": prompt}
conversational_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=compressed_retriver,
    combine_docs_chain_kwargs=qa_prompt_kwargs,
    memory=memory,
    return_source_documents=True,
)

In [None]:
# Test questions.
# The vowel signs in the name were in the wrong order in the original literals;
# mis-ordered code points change how the query is tokenised and embedded.
questions = [
    "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?",
    "কে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?",
    "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
]

In [None]:
# Test 1: ask a single question through the conversational chain.
# The query literal previously had mis-ordered Bangla vowel signs
# (e.g. "কােক" for "কাকে"); corrected so the embedded query matches the text.
question = "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?"
result = conversational_chain.invoke({"question": question})
print("Answer:", result["answer"])