In [2]:
import os
import PyPDF2

#Function to Load pdf documents
def load_pdf_documents(directory):
    documents =[]
    file_names =[]
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            file_path=os.path.join(directory,filename)
            with open(file_path,'rb') as file:
                reader=PyPDF2.PdfReader(file)
                text=''
                for page in range(len(reader.pages)):
                    text += reader.pages[page].extract_text()
                documents.append(text)
                file_names.append(filename)
    return documents, file_names

pdf_directory = r"D:\brochures" 
documents, file_names= load_pdf_documents(pdf_directory)


In [3]:
#Intialize the Vectorization and fit the documents

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer= TfidfVectorizer()
doc_vectors =vectorizer.fit_transform(documents)

In [5]:
# Vectorize the Query, Compute Cosine Similarity between the query vector and document vector, Fetch the index of most relevant document vector

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def retrieve_relevant_document(query,doc_vectors,vectorizer,documents,file_names):
    #Vectorizing the query
    query_vector = vectorizer.transform([query])
    #Compute Cosine Similarities 
    similarities =cosine_similarity(query_vector,doc_vectors).flatten()
    #Get the index of the most similar document
    most_similar_doc_index =np.argmax(similarities)
    return documents[most_similar_doc_index], file_names[most_similar_doc_index]

In [7]:
#Load Pre trained model and tokenizer

from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer =GPT2Tokenizer.from_pretrained('gpt2')
model=GPT2LMHeadModel.from_pretrained('gpt2')

def generate_response(query,context):
#Combine the context and query into a single input for the model
    input_text = f"{context} [SEP] {query}"
    input_ids =tokenizer.encode(input_text, return_tensors='pt')

#Generate the output
    output=model.generate(input_ids, max_length=50, num_return_sequences=1)
    response= tokenizer.decode(output[0], skip_special_tokens=True)
    return response

#RAG Pipeline

def rag_pipeline(query):
    #Step 1: Retrieving the most relevant document
    context, file_name = retrieve_relevant_document(query, doc_vectors,vectorizer, documents, file_names)
    print(f"Retrieved Document: {context[:200]}....\n")  # print first 200 characters of the document
    print(f"Document Source:{file_name}\n")

    # Step 2: Generate a response using the retrieved document as context
    response=generate_response(query,context)
    return response

query =input("Hi. I am your Travel assistant. How can I help you?")
response= rag_pipeline(query)
print(f"Generated Response:{response}")




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Retrieved Document:  
  
 
 
 
 
 
 
Wherever you want to go, Margie’s Travel can get you there ! 
Margie’s Travel is a world -leading travel agency, combining international reach with local 
expertise. Just tell us wher....

Document Source:Margies Travel Company Info.pdf

Generated Response: 
  
 
 
 
 
 
 
Wherever you want to go, Margie’s Travel can get you there! 
Margie’s Travel is a world -leading travel agency, combining international reach with local 
expertise. Just tell us where you want to go, and we can arrange:  
• Flights  
• Accommodation  
• Transfers  • Visas  
• Currency Exchange  
• Excursions  
 
Where We Go  
While we can arrange travel to anywhere  
worldwide, we specialize in trips to:  
• Dubai  
• Las Vegas  
• London  
• New York  
• San Francisco  Who We Are  
Margie’s Travel employs some of the 
best travel experts in the world. Our 
leadership team consists of:  
• Marjorie  Long  (CEO)  
• Logan  Reid  (CFO)  
• Emma  Luffman  (CTO)  
• Deepak  Nadar  (St