In [None]:
!pip install marker-pdf
!pip install fpdf

Collecting fpdf
  Using cached fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=8b28d09c20e7599a2a844b7abe221f013242e0143652283fe2e5d2c3996381cd
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [None]:
import os
import re
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from fpdf import FPDF  # Use fpdf2 (install with `pip install fpdf2`)

# Input and output locations. The output folder lives inside the input folder;
# that is safe here because the input folder is only listed one level deep.
input_directory = '/content/'  # Folder that holds the source PDF files
output_directory = '/content/output_pdfs/'  # Folder that receives the cleaned PDFs

# Create the output directory up front; exist_ok makes re-running the cell harmless
os.makedirs(output_directory, exist_ok=True)

# Build the marker converter once and reuse it for every file.
# create_model_dict() supplies the models the converter needs.
converter = PdfConverter(artifact_dict=create_model_dict())

# Function to clean the extracted text using regex
def clean_text(text):
    """Normalize text extracted from a PDF before it is written back out.

    Removes every pipe character, collapses each run of two or more
    consecutive newlines into a single newline, and replaces en dashes,
    em dashes and the bullet glyphs U+2022 / U+F0B7 with a plain hyphen.

    Args:
        text: The extracted text.

    Returns:
        The cleaned text.
    """
    text = text.replace('|', '')
    # Collapse the whole run in one pass: fixed-width passes (three newlines,
    # then two) would leave a blank line behind for runs of five or more.
    text = re.sub(r'\n{2,}', '\n', text)
    # Dashes and bullets all map to the same ASCII hyphen
    text = re.sub(r'[\u2013\u2014\u2022\uf0b7]', '-', text)
    return text

# Function to save the cleaned text as a new PDF using `fpdf2`
def save_text_as_pdf(text, output_pdf_path):
    """Write ``text`` to a new PDF file at ``output_pdf_path``.

    Characters that cannot be encoded as Latin-1 are replaced with "?"
    before rendering.

    Args:
        text: Text to render; embedded newlines are passed through to
            ``multi_cell``.
        output_pdf_path: Destination path of the PDF file.
    """
    # The built-in "Arial" core font only covers Latin-1. Substitute anything
    # outside that range up front so one stray glyph cannot abort the export.
    printable_text = text.encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Width 0 lets the cell span up to the right margin; line height is 10
    pdf.multi_cell(0, 10, txt=printable_text)

    pdf.output(output_pdf_path)

# Convert every PDF found directly inside the input directory.
# Sorting makes the processing order reproducible, and the extension check is
# case-insensitive so that files such as "NOTES.PDF" are not silently skipped.
for filename in sorted(os.listdir(input_directory)):
    if not filename.lower().endswith('.pdf'):
        continue

    file_path = os.path.join(input_directory, filename)

    # Extract text from the PDF using the marker library
    rendered = converter(file_path)
    text, _, _ = text_from_rendered(rendered)

    # Clean the extracted text using regular expressions
    cleaned_text = clean_text(text)

    # The cleaned copy keeps the original file name with a "cleaned_" prefix
    output_pdf_path = os.path.join(output_directory, f"cleaned_{filename}")
    save_text_as_pdf(cleaned_text, output_pdf_path)

    print(f"Processed and saved cleaned PDF: {output_pdf_path}")


Loaded layout model datalab-to/surya_layout on device cpu with dtype torch.float32
Loaded texify model to cpu with torch.float32 dtype
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32
Loaded table recognition model vikp/surya_tablerec on device cpu with dtype torch.float32
Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32


Recognizing layout: 100%|██████████| 3/3 [01:49<00:00, 36.58s/it]
100%|██████████| 3/3 [00:03<00:00,  1.05s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 3/3 [01:45<00:00, 35.17s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA 5153_FML lab_2024-25 Course Plan.pdf


Recognizing layout: 100%|██████████| 3/3 [02:00<00:00, 40.04s/it]
100%|██████████| 3/3 [00:11<00:00,  3.70s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 3/3 [01:46<00:00, 35.48s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA_5182_PDV_2024-25_Course_File.pdf


Recognizing layout: 100%|██████████| 3/3 [02:10<00:00, 43.65s/it]
100%|██████████| 3/3 [00:11<00:00,  3.84s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 3/3 [02:07<00:00, 42.50s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA_5132_PDV_2024-25_Course_File.pdf


Recognizing layout: 100%|██████████| 3/3 [01:44<00:00, 34.68s/it]
100%|██████████| 3/3 [00:08<00:00,  2.67s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 4/4 [02:28<00:00, 37.21s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA 5101_Algorithms and Data Structures for Big Data_2024-25-courseFile.pdf


Recognizing layout: 100%|██████████| 2/2 [01:36<00:00, 48.11s/it]
100%|██████████| 2/2 [00:05<00:00,  2.56s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 4/4 [02:27<00:00, 36.82s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA 5151_ADS Lab_2024-2025-courseFile.pdf


Recognizing layout: 100%|██████████| 3/3 [01:58<00:00, 39.56s/it]
100%|██████████| 3/3 [00:08<00:00,  2.77s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 4/4 [02:49<00:00, 42.26s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA 5102_Architecture of Big Data Syatems_2023-24-courseFile V1.pdf


Recognizing layout: 100%|██████████| 2/2 [01:35<00:00, 47.56s/it]
100%|██████████| 2/2 [00:06<00:00,  3.19s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 3/3 [02:19<00:00, 46.51s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA 5152_ABD Lab_2024-2025-courseFile.pdf


Recognizing layout: 100%|██████████| 3/3 [01:51<00:00, 37.07s/it]
100%|██████████| 3/3 [00:03<00:00,  1.12s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 3/3 [01:57<00:00, 39.30s/it]


Processed and saved cleaned PDF: /content/output_pdfs/cleaned_BDA 5103_FML_2024-25 Course Plan.pdf


In [None]:
# Install pinned versions of LlamaIndex and its Groq LLM integration, plus the
# (unpinned) smart-pdf-loader reader package
!pip install -q -U  llama-index==0.11.3 llama-index-llms-groq==0.2.0 llama-index-readers-smart-pdf-loader

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.1/247.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Install the Chroma vector-store and Hugging Face embedding integrations for LlamaIndex
!pip install -q -U llama-index-vector-stores-chroma llama-index-embeddings-huggingface

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m606.2/606.2 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00

In [None]:
from llama_index.core.llms import ChatMessage
from llama_index.llms.groq import Groq

# Keep the API key out of the notebook: take it from the GROQ_API_KEY
# environment variable, or ask for it with hidden input when it is not set.
import os
from getpass import getpass

groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
# Publish the key in this process's environment so that processes started
# from the notebook can read it as well.
os.environ.setdefault("GROQ_API_KEY", groq_api_key)

llm = Groq(model="llama3-70b-8192", api_key=groq_api_key)

In [None]:
import os
from llama_index.readers.file import PDFReader

# Folder that holds the PDFs to index
pdf_folder = '/content/output_pdfs'  # Path to your PDF folder
pdf_reader_obj = PDFReader(return_full_document=True)

# Collect the paths of all PDF files in the folder ...
pdf_paths = [
    os.path.join(pdf_folder, filename)
    for filename in os.listdir(pdf_folder)
    if filename.endswith('.pdf')
]

# ... and gather everything the reader returns for them into one flat list
documents = [
    document
    for pdf_path in pdf_paths
    for document in pdf_reader_obj.load_data(pdf_path)
]

In [None]:
# Report how many documents were loaded, then the metadata of each one
print(f"{len(documents) = }\n")
for doc in documents:
    print(doc.metadata)

len(documents) = 8

{'file_name': 'cleaned_BDA 5103_FML_2024-25 Course Plan.pdf'}
{'file_name': 'cleaned_BDA 5152_ABD Lab_2024-2025-courseFile.pdf'}
{'file_name': 'cleaned_BDA_5132_PDV_2024-25_Course_File.pdf'}
{'file_name': 'cleaned_BDA 5102_Architecture of Big Data Syatems_2023-24-courseFile V1.pdf'}
{'file_name': 'cleaned_BDA 5101_Algorithms and Data Structures for Big Data_2024-25-courseFile.pdf'}
{'file_name': 'cleaned_BDA 5153_FML lab_2024-25 Course Plan.pdf'}
{'file_name': 'cleaned_BDA_5182_PDV_2024-25_Course_File.pdf'}
{'file_name': 'cleaned_BDA 5151_ADS Lab_2024-2025-courseFile.pdf'}


In [None]:
# Inspect the type of the objects returned by the PDF reader
type(documents[0])

In [None]:
# Concatenate the text of all loaded documents into a single string,
# ending each document's text with a newline
full_text = "".join(doc.text + "\n" for doc in documents)

# Preview the first 500 characters
print(full_text[:500])

# **Master of Engineering - ME (Big Data Analytics)**
 Course Name  :  Fundamentals of Machine Learning 
 ---  ---  --- 
 Course Code  :  BDA 5103 
 Academic Year  :  2024 - 25 
 Semester  : I   
 Name of the Course Coordinator  :  Dr. Arockiaraj S 
 Name of the Program Coordinator  :  Dr. Prathviraj N 
#### **Course File**
 Signature of Program Coordinator  Signature of Course Coordinator 
 ---  --- 
 with Date  with Date 
 1.  Course Plan 5 
 ---  --- 
 1.1  Primary Information 5 
 1.2  Course


In [None]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import TextNode

# Token-based splitter configured with a chunk size of 128 and an overlap of 8
text_parser = TokenTextSplitter(chunk_size=128, chunk_overlap=8)

# Split the concatenated text into chunks
chunks = text_parser.split_text(text=full_text)

# Display the number of chunks produced
len(chunks)

202

In [None]:
# Wrap every text chunk in a LlamaIndex TextNode
nodes = [TextNode(text=chunk_text) for chunk_text in chunks]

In [None]:
# Load the BAAI/bge-base-en-v1.5 embedding model through the Hugging Face
# integration; it is used below to embed the chunks and by the index
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm

# Compute an embedding for each node's content and store it on the node
for node in tqdm(nodes):
    node_content = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(node_content)

100%|██████████| 202/202 [01:42<00:00,  1.98it/s]


In [None]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Create (or reuse) a collection called "MSISBDA" in chromadb where our chunks
# can be stored.
# NOTE(review): EphemeralClient presumably keeps the data in memory only, so the
# index has to be rebuilt in every new session - confirm against the Chroma docs.
db = chromadb.EphemeralClient()
chroma_collection = db.get_or_create_collection("MSISBDA")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the vector index over the nodes, backed by the Chroma collection
index = VectorStoreIndex(
    nodes=nodes, storage_context=storage_context, embed_model=embed_model
)

In [None]:
from llama_index.core.retrievers import VectorIndexRetriever
# Create a retriever that returns the 10 chunks most similar to the query.
# (`index.as_retriever(similarity_top_k=10)` is a shorthand for the same thing;
# only one of the two forms is needed.)
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

# Try the retriever on a sample question and report how many chunks came back
top_chunks = retriever.retrieve("What is the courses offered at BDA?")
print(len(top_chunks))

10


In [None]:
# Print the text of every retrieved chunk. Iterating over the result, rather
# than indexing 0..9, keeps working when fewer than ten chunks are returned.
for retrieved_chunk in top_chunks:
    print(retrieved_chunk.text)

--- 
 Course Code  :  BDA 5102 
 Academic Year  :  2024 - 2025 
 Semester  : I   
 Name of the Course Coordinator  :  Mr. DEEPAK RAO B 
 Name of the Program Coordinator  :  Dr. PRATHVIRAJ N 
 Signature of Program Coordinator  Signature of Course Coordinator 
 ---  --- 
 with Date  with Date 
 1.  Course Plan 5 
 ---  --- 
 1.1  Primary Information 5 
 1.2  Course Outcomes (COs) Error! Bookmark not
Principles of Data Visualization 
 ---  ---  --- 
 Course Code  :  BDA 5132 
 Academic Year  :  2024 - 25 
 Semester  :  I 
 Name of the Course Coordinator  :  SATYANARAYAN SHENOY 
 Name of the Program Coordinator  :  Dr. PRATHVIRAJ N 
### Course File
 Signature of Program Coordinator  Signature of Course Coordinator 
 ---  --- 
 with Date  with Date 
   Table of Contents 
 ---  --- 
 1.  Course Plan 6 
 1.1  Primary Information 6
:  BDA 5101 
 Academic Year  :  2024 - 2025 
 Semester  :  I 
 Name of the Course Coordinator  :  Mr. DEEPAK RAO B 
 Name of the Program Coordinator  :  Dr. PRATHVI

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import get_response_synthesizer

from llama_index.core import PromptTemplate

# Prompt given to the LLM: the retrieved context followed by the user's query
template = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_template = PromptTemplate(template)

# Response synthesizer that answers with `llm` using the prompt above
response_synthesizer = get_response_synthesizer(llm, text_qa_template=qa_template)

In [None]:
# Post-processor configured with a similarity cutoff of 0.4 for retrieved chunks
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.4)

# Query engine = retriever + similarity filter + response synthesizer
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)

# Run a first query through the full pipeline
response = query_engine.query("What are the courses names at BDA")
print(response)

Based on the provided context information, the course names at BDA (Big Data Analytics) are:

1. Fundamentals of Machine Learning (BDA 5103)
2. Algorithm and Data Structures for Big Data Lab (BDA 5151)
3. Fundamentals of Machine Learning Lab (BDA 5153)
4. Principles of Data Visualization (BDA 5132)
5. Principles of Data Visualization Lab (BDA 5182)
6. Architecture of Big Data Systems Lab (BDA 5152)

Let me know if you have any further queries!


In [None]:
# Ask who coordinates the courses (the query text is passed to the engine verbatim)
response = query_engine.query("who is the course coodinatre of BDA")
print(response)

Based on the context information, the course coordinators for each BDA course are:

* BDA 5101: Mr. DEEPAK RAO B
* BDA 5102: Mr. DEEPAK RAO B
* BDA 5103: Dr. Arockiaraj S
* BDA 5132: SATYANARAYAN SHENOY
* BDA 5151: Mr. DEEPAK RAO B
* BDA 5153: Dr. Arockiaraj S
* BDA 5182: SATYANARAYAN SHENOY

So, the answer is: it depends on the specific BDA course.


In [None]:
# Ask who coordinates the program (the query text is passed to the engine verbatim)
response = query_engine.query("who is the program coodinatre of BDA")
print(response)

Based on the context information, the answer is: Dr. PRATHVIRAJ N.


In [33]:
# Wrong response: asking with the abbreviation "FML" does not return the
# course topics, whereas the full course name (queried below) does.
response = query_engine.query("what are topic are covered in FML?")
print(response)

Based on the provided context information, the topics covered in FML (Fundamentals of Machine Learning) are not explicitly mentioned. However, we can infer that FML is a lab session, and it is part of the Big Data Analytics course.

Looking at the other courses mentioned, we can see that they cover topics such as HDFS, SQOOP, HIVE, Map-Reduce, Spark, Data Frames, Data Streaming, Linked List, Stack, Queue, Trees, Searching & Sorting, Hash tables, Graphs, Web scraping, Data Analysis, and Data Visualization.

Although we can't pinpoint the exact topics covered in FML, it is likely that they are related to Machine Learning fundamentals, given the course name.


In [34]:
# Same question, this time using the full course name instead of the abbreviation
response = query_engine.query("what are topic are covered in Fundamentals of Machine Learning?")
print(response)

Based on the provided context information, the topics covered in Fundamentals of Machine Learning are:

1. Overview of Supervised (regression and classification), unsupervised (clustering and dimensionality reduction), semi-supervised, and reinforcement learning with practical examples.
2. Machine learning nomenclature: raw data, types of features and outputs, feature vector.
3. Decision tree model of learning. Classification and regression using decision trees.
4. Splitting criteria: entropy, information gain, Gini impurity.
5. Overfitting in decision trees.
6. Pruning in decision trees.
7. Linear regression: model, estimation, and interpretation of coefficients.
8. Introduction to bias/variance trade-off.
9. Regularized linear regression.
10. K-nearest neighbours algorithm.
11. Cross-validation.
12. Dimension reduction using principal component analysis (PCA).

These topics are covered in the 15 lessons (L0 to L15) outlined in the course plan.


In [35]:
# Ask for an explanation of one of the listed course topics
response = query_engine.query("explain Overfitting in decision trees.")
print(response)

Based on the context information, Overfitting in decision trees is a topic covered in Lesson 7 (L7) and Lesson 15 (L15) of the course. 

Overfitting in decision trees occurs when a decision tree model is too complex and learns the noise in the training data rather than the underlying patterns. This results in the model performing well on the training data but poorly on new, unseen data. 

In decision trees, overfitting can happen when the tree is allowed to grow too deep, resulting in nodes that contain very few instances. This can lead to the model memorizing the training data rather than generalizing to new data. 

Some common symptoms of overfitting in decision trees include:

* High accuracy on the training data but low accuracy on test data
* A complex tree with many nodes and branches
* The model is sensitive to small changes in the training data

To avoid overfitting in decision trees, techniques such as pruning, regularization, and cross-validation can be used. Pruning involves

In [36]:
# Ask for the reference materials of a course
response = query_engine.query("what the are References materials for Fundamentals of Machine Learning")
print(response)

Based on the context information, the reference materials for Fundamentals of Machine Learning are:

1. Module: Introduction to Machine Learning (https://www.intel.com/content/www/us/en/developer/tools/oneapi/training/academicprogram/educators/intro-machine-learning-training-kit.html)
2. Module: Get started with AI on Azure (https://learn.microsoft.com/en-us/training/modules/get-started-ai-fundamentals/)
3. Module: Microsoft Azure AI Fundamentals: Get started with artificial intelligence (https://learn.microsoft.com/en-us/training/paths/getstarted-with-artificial-intelligence-on-azure/)
4. Learning path: Understand data science for machine learning (https://learn.microsoft.com/en-us/training/paths/understand-machinelearning/)
5. Module: Generative AI with Large Language Models (https://www.coursera.org/learn/generative-ai-with-llms)
6. Grokking Machine Learning, Luis G. Serrano, Manning Publications; 1st Edition, 2019 (https://www.manning.com/books/grokking-machine-learning)
7. A Cours

In [None]:
# Install Streamlit, which the APP.py script imports for its user interface
!pip install -q streamlit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%writefile APP.py
import os
import streamlit as st
from llama_index.core.llms import ChatMessage
from llama_index.llms.groq import Groq
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from tqdm import tqdm
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import get_response_synthesizer
from llama_index.core import PromptTemplate

# The Groq API key is read from the environment so that no secret is ever
# written into this file. Set GROQ_API_KEY before launching the app.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY is not set. Set it in the environment and restart the app.")
    st.stop()

# Folder that holds the PDFs to index
pdf_folder = '/content/output_pdfs'  # Path to your PDF folder

# Prompt template for the chatbot: retrieved context followed by the user's query
template = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)


@st.cache_resource
def build_query_engine():
    """Load the PDFs, index them and return the assembled query engine.

    Streamlit re-executes this script on every user interaction. Caching the
    result means the PDFs are read, embedded and indexed once per server
    process instead of once per query, and the vector collection is not
    filled again on each rerun.

    Returns:
        A RetrieverQueryEngine that retrieves the 10 most similar chunks,
        applies a 0.4 similarity cutoff and answers with the Groq LLM.
    """
    llm = Groq(model="llama3-70b-8192", api_key=GROQ_API_KEY)

    # Load every PDF in the folder
    pdf_reader_obj = PDFReader(return_full_document=True)
    documents = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):  # Process only PDF files
            file_path = os.path.join(pdf_folder, filename)
            documents.extend(pdf_reader_obj.load_data(file_path))

    # Concatenate the text of all documents into a single string
    full_text = "".join(doc.text + "\n" for doc in documents)

    # Split the text into smaller chunks and wrap them in Llama nodes
    text_parser = TokenTextSplitter(chunk_size=128, chunk_overlap=8)
    chunks = text_parser.split_text(text=full_text)
    nodes = [TextNode(text=chunk_text) for chunk_text in chunks]

    # Load the embedding model from Hugging Face and embed every chunk
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
    for node in tqdm(nodes):
        node.embedding = embed_model.get_text_embedding(node.get_content(metadata_mode="all"))

    # Store the chunks in a ChromaDB collection and index them
    db = chromadb.EphemeralClient()
    chroma_collection = db.get_or_create_collection("MSISBDA")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex(nodes=nodes, storage_context=storage_context, embed_model=embed_model)

    # Retriever, prompt and response synthesizer
    retriever = VectorIndexRetriever(index=index, similarity_top_k=10)
    qa_template = PromptTemplate(template)
    response_synthesizer = get_response_synthesizer(llm, text_qa_template=qa_template)

    # Assemble query engine
    return RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.4)]
    )


# Built on first use, then served from Streamlit's resource cache on reruns
query_engine = build_query_engine()

# Streamlit UI
def main():
    """Render the chatbot page: a query box and, once a query is entered, its answer."""
    st.title("Welcome to MSIS BDA Chatbot")
    st.write("Please enter your query:")

    query = st.text_input("Query")

    # Skip empty and whitespace-only input instead of sending it to the engine
    if query and query.strip():
        # Process the query and get the response
        response = query_engine.query(query)
        st.write("Answer: ")
        # Hand Streamlit the answer text rather than the response object, so
        # it is rendered as regular text and not as an object dump.
        st.write(str(response))

if __name__ == "__main__":
    main()

Writing APP.py


In [30]:
# Launch the Streamlit app from the APP.py script; the cell keeps running
# until the server is stopped
!streamlit run "/content/APP.py"


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.44.124.7:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
