Install Required Libraries

In [1]:
%pip install -q groq qdrant-client langchain fastembed pymupdf pymupdf4llm pathlib ipywidgets langchain_qdrant langchain-groq

Note: you may need to restart the kernel to use updated packages.


Import the Libraries

In [2]:
import os
import pymupdf4llm
import pathlib
from langchain.text_splitter import MarkdownTextSplitter

Use PyMuPDF to convert PDFs to markdown and output them into the /parsed_texts folder

In [3]:
def convert_pdf_to_text(pdf_path, output_folder):
    doc = pymupdf4llm.to_markdown(pdf_path)
    output_file = os.path.join(output_folder, os.path.basename(pdf_path).replace('.pdf', '.md'))
    pathlib.Path(output_file).write_bytes(doc.encode())

# Convert all PDFs in the /documents folder
for pdf_file in os.listdir('docs'):
    if pdf_file.endswith('.pdf'):
        convert_pdf_to_text(os.path.join('docs', pdf_file), 'parsed_docs')

Select Documents for Processing

In [4]:
# Path to the directory containing parsed Markdown files
notebook_path = os.getcwd()
docs_path = os.path.join(notebook_path, 'parsed_docs')

# List all Markdown files in the directory
documents = [os.path.join(docs_path, file) for file in os.listdir(docs_path) if file.endswith('.md')]
print(documents)

['c:\\Users\\Andrew\\Documents\\github\\SIT374\\redback-chatbot\\RAG\\parsed_docs\\fed_gov_guide.md']


Break Down the Text into Manageable Chunks

In [5]:
# Initialise the text splitter
text_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)

# Split all documents into chunks
chunks = []
for doc in documents:
    with open(doc, 'r', encoding='utf-8') as file:
        text = file.read()
        chunk = text_splitter.split_text(text)
        chunks.extend(chunk)


Check the Chunks Created

In [6]:
print(len(chunks))
print(chunks[:1])

91
['# Choose Health:\n Be Active\n\n\n##### A physical activity guide for older Australians\n\n**An initiative of the Australian Government in**\n\n**association with Sports Medicine Australia**\n\n\n-----\n\n**Choose Health: Be Active**\n\nFirst printed April 2005\nRevised and reprinted April 2008\nRevised and reprinted June 2008\nISBN 978-1-920720-2856']


Load in Sentence Transformer for Embeddings

In [7]:
from sentence_transformers import SentenceTransformer
encoder = SentenceTransformer("all-MiniLM-L6-v2")


  from tqdm.autonotebook import tqdm, trange


Generate Vector Embeddings for Each Text Chunk

In [8]:
embeddings = encoder.encode(chunks)

Create Points for Qdrant

In [9]:
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid

# Generate unique IDs
ids = [str(uuid.uuid4()) for _ in range(len(chunks))]

# Prepare data for upload
points = [
    PointStruct(
        id=id,
        vector=embedding,
        payload={"text": chunk}  # Optional payload with original text
    )
    for id, embedding, chunk in zip(ids, embeddings, chunks)
]

Check Points

In [10]:
print(points)

[PointStruct(id='9944ed34-5440-4f4f-844a-c5486633c9b9', vector=[0.09337233006954193, 0.0006403689039871097, 0.004278379958122969, 0.09347981214523315, -0.014129129238426685, 0.10976804047822952, 0.06773710995912552, -0.01342514157295227, -0.06488966941833496, 0.12243499606847763, 0.021542487666010857, -0.005429570097476244, -0.037264928221702576, 0.029789436608552933, 0.1242106482386589, 0.04338301718235016, -0.016861775889992714, -0.04878482222557068, 0.0014991781208664179, 0.04467671737074852, -0.04922344908118248, 0.11343425512313843, 0.036400824785232544, 0.05083188787102699, -0.03451304882764816, 0.018106553703546524, -0.020337358117103577, 0.004486601334065199, -0.04106245934963226, 0.026180604472756386, 0.009721888229250908, 0.04482461139559746, 0.0691717341542244, -0.014640933834016323, 0.009073800407350063, -0.051670510321855545, -0.0249942634254694, -0.04249238595366478, -0.12342030555009842, -0.011570905335247517, 0.04192206636071205, -0.07384293526411057, 0.0082094511017203

Set up Qdrant Vector Database

In [11]:
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")

collection_name = 'my_text_chunks'

# Create a collection with specific configuration
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

True

Upsert the Points into the Vector Database

In [12]:
client.upsert(
    collection_name=collection_name,
    wait=True,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

Test Vector Database Search (Retrieval)

In [13]:
def query_qdrant(query_embedding, collection_name=collection_name, top_k=1):
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,
        with_payload=True
    )
    return [hit.payload for hit in search_result]

# Search for a vector
query_vector = encoder.encode(["Sport and Recreation Tasmania"])[0]
print(query_qdrant(query_vector, collection_name))

[{'text': 'TAS\nSport and Recreation\nTasmania\nPh: 1800 252 476\nSA\nActive Ageing\nPh: 08 8232 9077\nWA\nWA Department of Sport and\nRecreation\nPh: 08 9492 9700\n\n\nNT\nSport and Recreation NT\nPh: 1800 045 678\nQLD\nSport and Recreation QLD\nPh: 07 3237 9832\n\n\nWant to know more about physical activity? Contact:\n\n\n**Heart Health – the National Heart Foundation**\nPh: 1300 362 787\nWebsite: www.heartfoundation.com.au\n\n**Diabetes – Diabetes Australia**\nPh: 1300 136 588\nWebsite: www.diabetesaustralia.com.au'}]


Import Groq API for Langchain

In [None]:
%env GROQ_API_KEY=

In [22]:
from langchain_groq import ChatGroq

# Initialise the ChatGroq model with streaming enabled
chat_model = ChatGroq(model_name='llama3-8b-8192', api_key=os.getenv("GROQ_API_KEY"), streaming=True)

Implement Context Buffer in LangChain

In [24]:
from langchain_core.prompts import PromptTemplate
from langchain.memory.buffer import ConversationBufferMemory

# Initialise the ConversationBufferMemory
memory = ConversationBufferMemory(return_messages=True)

# Define the ChatPromptTemplate for user interaction
template = """Answer the following question from the context

context = {context}

question = {question}

"""
prompt_template = PromptTemplate(input_variables=["context", "question"], template=template)


def generate_response(user_input: str) -> str:
    try:
        query_embedding = encoder.encode(user_input)
        context = query_qdrant(query_embedding)
        #context = "Ignore"
        full_response = chat_model.predict(prompt_template.format(question=user_input, context=context))
        return full_response.strip()
    except Exception as e:
        print(f"An error occurred in generate_response: {str(e)}")
        return f"Error: {str(e)}"

Test without Context

In [20]:
# Example usage
user_query = "What is the phone number of Sport and Recreation NT?"
response = generate_response(user_query)
print(response)

I'm unable to answer that question as I don't have any context or information about Sport and Recreation NT or its phone number.


Test with Context

In [26]:
# Example usage
user_query = "What is the phone number of Sport and Recreation NT?"
response = generate_response(user_query)
print(response)

According to the context, the phone number of Sport and Recreation NT is 1800 045 678.
