In [None]:
!pip install marker-pdf
!pip install fpdf

In [None]:
!pip install -q -U  llama-index==0.11.3 llama-index-llms-groq==0.2.0 llama-index-readers-smart-pdf-loader

In [None]:
!pip install -q -U llama-index-vector-stores-chroma llama-index-embeddings-huggingface

In [None]:
import os
import re
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from fpdf import FPDF  # Use fpdf2 (install with `pip install fpdf2`)

# Directories for input PDFs and output cleaned PDFs
# NOTE(review): the absolute '/content/...' paths look like a Google Colab layout — adjust when running elsewhere.
input_directory = '/content/'  # Input folder where your PDF files are located
output_directory = '/content/output_pdfs/'  # Output folder for cleaned PDFs

# Create output directory if it doesn't exist (exist_ok=True keeps re-runs of this cell from failing)
os.makedirs(output_directory, exist_ok=True)

# Initialize the PDF converter from the marker library
# NOTE(review): create_model_dict() presumably loads marker's models and may be slow on first use — confirm.
converter = PdfConverter(artifact_dict=create_model_dict())

# Function to clean the extracted text using regex
def clean_text(text):
    """Normalize text extracted from a PDF before it is re-rendered.

    The following steps are applied in order:
      * every pipe character ``|`` is removed;
      * every run of two or more newlines is collapsed into one newline;
      * en-dashes (U+2013) and em-dashes (U+2014) become ``-``;
      * bullet glyphs (U+2022 and U+F0B7) become ``-``.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text as a new string.
    """
    text = text.replace('|', '')
    # A single `\n{2,}` pass collapses blank-line runs of any length; chaining
    # a 3-newline and a 2-newline substitution leaves blank lines behind for
    # runs of five or more newlines.
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'[\u2013\u2014]', '-', text)  # Replace en-dash and em-dash with hyphen
    text = re.sub(r'[\u2022\uf0b7]', '-', text)  # Replace bullet points with hyphen
    return text

# Function to save the cleaned text as a new PDF using `fpdf2`
def save_text_as_pdf(text, output_pdf_path):
    """Write ``text`` to a new PDF file at ``output_pdf_path``.

    The text is rendered with the built-in "Arial" font at size 12, with
    automatic page breaks and a 15-unit bottom margin.

    Args:
        text: The text to render. Characters that cannot be encoded as
            Latin-1 are replaced with ``?``.
        output_pdf_path: Path of the PDF file to create.

    Returns:
        None. The PDF is written to ``output_pdf_path``.
    """
    # The built-in (core) fonts can only encode Latin-1. Extracted text often
    # still contains other characters (curly quotes, ellipses, ...), which
    # would make fpdf raise an encoding error; substitute '?' for those.
    printable_text = text.encode("latin-1", errors="replace").decode("latin-1")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Write the text to the PDF
    pdf.multi_cell(0, 10, txt=printable_text)  # Automatically handles newlines

    # Output the PDF to the file
    pdf.output(output_pdf_path)

# Convert every PDF found directly inside the input directory
for filename in os.listdir(input_directory):
    # Anything that is not a PDF is skipped
    if not filename.endswith('.pdf'):
        continue

    file_path = os.path.join(input_directory, filename)

    # Run marker on the file and pull the text out of the rendered result
    rendered = converter(file_path)
    text, _, _ = text_from_rendered(rendered)

    # Normalize the extracted text with the regex-based cleaner
    cleaned_text = clean_text(text)

    # The cleaned copy keeps the original file name, with a "cleaned_" prefix
    output_pdf_path = os.path.join(output_directory, f"cleaned_{filename}")
    save_text_as_pdf(cleaned_text, output_pdf_path)

    print(f"Processed and saved cleaned PDF: {output_pdf_path}")

In [None]:
import os

from llama_index.core.llms import ChatMessage
from llama_index.llms.groq import Groq

# The Groq API key is read from the GROQ_API_KEY environment variable so that
# no credential is stored in the notebook. Never paste a key into a cell: any
# key that has been saved or shared in plain text should be revoked.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this cell."
    )

llm = Groq(model="llama3-70b-8192", api_key=groq_api_key)

In [None]:
import os
from llama_index.readers.file import PDFReader

pdf_folder = '/content/output_pdfs'  # Path to your PDF folder
pdf_reader_obj = PDFReader(return_full_document=True)

# Load every PDF in the folder into one flat list of documents.
# os.listdir() returns names in arbitrary order, so the names are sorted to
# keep the document order (and the text and chunks built from it) the same
# from run to run.
documents = []
for filename in sorted(os.listdir(pdf_folder)):
    if filename.endswith('.pdf'):  # Process only PDF files
        file_path = os.path.join(pdf_folder, filename)
        documents.extend(pdf_reader_obj.load_data(file_path))  # Use extend to add documents to the list

In [None]:
# Show how many documents were loaded, then the metadata of each one
print(f"{len(documents) = }\n")
for doc in documents:
    print(doc.metadata)

In [None]:
# Inspect the class of the loaded document objects, using the first one as a sample
type(documents[0])

In [None]:
# Concatenate the text of all loaded documents into a single string,
# ending each document's text with a newline
full_text = "".join(doc.text + "\n" for doc in documents)

# Preview the first 500 characters
print(full_text[:500])

In [None]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import TextNode

# Splitter settings: chunk_size and chunk_overlap of the token-based splitter
CHUNK_SIZE = 128
CHUNK_OVERLAP = 8
text_parser = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split the concatenated text into a list of chunk strings
chunks = text_parser.split_text(text=full_text)

# Number of chunks produced (displayed as the cell output)
len(chunks)

In [None]:
# Wrap every chunk string in a llama-index TextNode
nodes = [TextNode(text=chunk_text) for chunk_text in chunks]

In [None]:
# Embedding model, loaded through llama-index's Hugging Face integration
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

In [None]:
from tqdm import tqdm

# Compute an embedding for every node and store it on the node itself
for node in tqdm(nodes):
    node_text = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(node_text)

In [None]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Create (or reuse) a collection called "MSISBDA" in an ephemeral chromadb
# client; our chunks are stored there through the llama-index vector store wrapper
db = chromadb.EphemeralClient()
chroma_collection = db.get_or_create_collection("MSISBDA")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the vector index over the nodes, backed by the Chroma storage context
# NOTE(review): get_or_create_collection reuses an existing collection, so
# re-running this cell in the same kernel may insert the nodes again — confirm.
index = VectorStoreIndex(
    nodes=nodes, storage_context=storage_context, embed_model=embed_model
)

In [None]:
from llama_index.core.retrievers import VectorIndexRetriever
# Create a retriever object that returns up to 10 chunks per query.
# (`index.as_retriever(similarity_top_k=10)` is the shorthand alternative;
# only one retriever needs to be built.)
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

# Retrieve the chunks for a sample question and report how many came back
top_chunks = retriever.retrieve("What is the courses offered at BDA?")
print(len(top_chunks))

In [None]:
# Print the text of the retrieved chunks (at most the first 10).
# Iterating instead of indexing 0..9 avoids an IndexError when the retriever
# returns fewer than 10 chunks.
for retrieved_chunk in top_chunks[:10]:
    print(retrieved_chunk.text)

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import get_response_synthesizer

from llama_index.core import PromptTemplate

# Question-answering prompt: the retrieved context is substituted for
# {context_str} and the user's question for {query_str}
template = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_template = PromptTemplate(template)

# Response synthesizer that combines the LLM with the custom QA prompt
response_synthesizer = get_response_synthesizer(
    llm=llm,
    text_qa_template=qa_template,
)

In [None]:
# Assemble the query engine from the retriever, a similarity-cutoff
# postprocessor and the response synthesizer
SIMILARITY_CUTOFF = 0.4
similarity_filter = SimilarityPostprocessor(similarity_cutoff=SIMILARITY_CUTOFF)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)

# Run a sample query and print the answer
response = query_engine.query("What are the courses names at BDA")
print(response)