In [17]:
import fitz 
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
import os
import uuid   #To generate UUIDs for the documents
import numpy as np

In [18]:
#Function to extract text from PDF Files
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, with no separator added between pages.
    """
    # Open the document in a ``with`` block so its file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with ``+=`` page by page.
        return "".join(page.get_text() for page in doc)

In [21]:
# Qdrant client backed by in-process memory; a storage path can be given instead.
qdrant = QdrantClient(":memory:")

# Name of the collection that holds the document vectors.
collection_name = "document_embeddings"

# Size of each stored vector.
# NOTE(review): presumably this has to match the output size of the embedding
# model loaded in a later cell - confirm.
embedding_dim = 384

# Create the collection, comparing vectors by cosine distance.
qdrant.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=embedding_dim,
        distance=Distance.COSINE,
    ),
)

True

In [22]:
#Load a model for generating embeddings
embedder=SentenceTransformer('all-MiniLM-L6-v2')

#Path of the documents. The RAG_PDF_DIR environment variable overrides the default location.
#The default is a raw string, so backslashes are written once (doubling them inside r"..." would
#put two literal backslashes into the path).
pdf_dir=os.environ.get("RAG_PDF_DIR", r"D:\intership\Training\assigment\rag\brochures")    #the path where the documents are stored

#Iteration over PDF files and extract text. Sorted so the processing order is the same on every run.
for pdf_file in sorted(os.listdir(pdf_dir)):
    #Match the extension case-insensitively so files such as "BROCHURE.PDF" are not skipped
    if pdf_file.lower().endswith(".pdf"):
        pdf_path=os.path.join(pdf_dir,pdf_file)
        document_text=extract_text_from_pdf(pdf_path)

        #Create embedding for the document text (encode receives a one-element batch)
        #NOTE(review): the whole document is encoded as a single vector; the model presumably
        #truncates long inputs, so only the start of a long PDF may be represented - confirm.
        document_embedding=embedder.encode([document_text])

        #Generate a UUID for the file
        doc_id=str(uuid.uuid5(uuid.NAMESPACE_DNS,pdf_file))  #Deterministic UUID derived from the file name, so re-running upserts the same id

        #Defining a PointStruct object: one point in the vector space that can be stored in Qdrant.
        #id = document id, vector = the embedding of the document, payload = the document text and its source file name
        point = PointStruct(
            id=doc_id,
            vector=document_embedding[0].tolist(),
            payload={"document":document_text,"source":pdf_file}
        )

        #Insert the document into Qdrant
        qdrant.upsert(
            collection_name=collection_name, points=[point],
        )



In [23]:
#Query expansion: GPT-2 text-generation pipeline used to expand queries
query_expansion_pipeline = pipeline(
    'text-generation',
    model='gpt2',
)

def expand_query(query, max_length=50):
    """Expand a query by letting the text-generation pipeline continue it.

    Args:
        query: Text handed to ``query_expansion_pipeline`` as the prompt.
        max_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The ``'generated_text'`` field of the first sequence the pipeline
        returns.
    """
    generations = query_expansion_pipeline(
        query,
        max_length=max_length,
        num_return_sequences=1,
        # 50256 is supplied as the pad token id
        # (presumably GPT-2's end-of-text token id - confirm).
        pad_token_id=50256,
    )
    first_generation = generations[0]
    return first_generation['generated_text']

In [27]:
#Retrieval of the Documents

def retrieve_relevant_document(query, qdrant, collection_name, embedder, top_k=3):
    """Find the stored documents closest to an expanded version of the query.

    Args:
        query: The user's query text; it is expanded with ``expand_query``
            before being embedded.
        qdrant: Client object whose ``search`` method is queried.
        collection_name: Name of the collection to search.
        embedder: Object whose ``encode`` method turns a list of texts into
            embeddings.
        top_k: Maximum number of results requested from the search.

    Returns:
        A list of ``(document, source)`` tuples taken from each result's
        payload, in the order the search returned them.
    """
    # encode() receives a one-element batch, so the query vector is row 0.
    query_vector = embedder.encode([expand_query(query)])[0].tolist()

    hits = qdrant.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )

    # Keep only the stored text and the file it came from for each hit.
    matches = []
    for hit in hits:
        matches.append((hit.payload['document'], hit.payload['source']))
    return matches

In [30]:
#Function to Generate a Response
#GPT-2 tokenizer and language model that are passed to generate_response

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def generate_response(query,qdrant,collection_name,embedder, model, tokenizer):
    """Generate a response to a query, using retrieved documents as context.

    Args:
        query: The user's question.
        qdrant: Client object forwarded to ``retrieve_relevant_document``.
        collection_name: Name of the collection to search.
        embedder: Embedding model forwarded to ``retrieve_relevant_document``.
        model: Language model whose ``generate`` method produces the output.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        A ``(response, sources)`` tuple: the decoded first generated sequence
        and the list of source names of the retrieved documents.
    """
    retrieved_docs=retrieve_relevant_document(query,qdrant, collection_name,embedder)

    #Concatenate the retrieved documents to form the context
    context=" ".join(doc_text for doc_text, _ in retrieved_docs)
    sources=[source for _, source in retrieved_docs]

    #Prepare the input for the language model.
    #The prompt must end with the user's query; the original interpolated the
    #Qdrant client object here, so the question never reached the model.
    input_text= f"{context} [SEP] {query}"
    #NOTE(review): truncation presumably drops tokens from the end of the prompt,
    #which would cut the query off when the context is long - confirm, and
    #consider truncating the context instead.
    inputs=tokenizer(input_text,return_tensors="pt", max_length=512, truncation=True)

    #Generate Response (the end-of-sequence token id doubles as the pad token id)
    output=model.generate(**inputs,max_length=600, pad_token_id=tokenizer.eos_token_id)

    #Decode the first generated sequence
    response=tokenizer.decode(output[0],skip_special_tokens=True)

    return response, sources


In [31]:
# Read the question interactively, then run retrieval and generation for it.
query = input("How may I help you?")
response, sources = generate_response(
    query,
    qdrant,
    collection_name,
    embedder,
    model,
    tokenizer,
)

# Show the generated text, followed by the files the context was drawn from.
print(f"Generated Response:: {response}")
print(f"Sources:{sources}")

Generated Response:: Margie’s Travel Presents… 
Dubai 
 
 
Dubai is the largest and most populous city in the United 
Arab Emirates. It is located on the southeast coast of the 
Persian Gulf and is the capital of the Emirate of Dubai, 
one of the seven emirates that make up the country. Abu 
Dhabi and Dubai are the only two emirates to have veto 
power over critical matters of national importance in the 
country's Federal Supreme Council. The city of Dubai is 
located on the emirate's northern coastline and heads the 
Dubai-Sharjah-Ajman metropolitan area. 
 
 
 
 
Dubai Hotels 
Margie’s Travel offers the following accommodation 
options in Dubai: 
The Creek Hotel 
Friendly boutique hotel within the heart of the bustling 
Dubai Creek area. 
The Deira Hotel 
Family-run hotel in Dubai’s traditional commercial 
center. 
The Lost City Hotel 
Luxurious accommodation in Dubai, with onsite 
waterpark and aquarium. 
 
 
 
To book your trip to Dubai, visit www.margiestravel.com 
 
  
 
 
 
 
 


Advanced RAG Implementation Using Query Expansion with the Qdrant Vector Database