In [2]:
pip install faiss-cpu transformers scikit-learn PyPDF2 numpy

Note: you may need to restart the kernel to use updated packages.




In [3]:
import os 
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def load_pdf_documents(directory):
    documents = []
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ''
                for page in range(len(reader.pages)):
                    text += reader.pages[page].extract_text()
                documents.append(text)
                file_names.append(filename)
    return documents, file_names

# Example usage: Load all PDFs from a directory
pdf_directory = r"Numerology"  # Replace with your directory path
documents, file_names = load_pdf_documents(pdf_directory)

In [5]:
#Vectorize Documents and Create FAISS Index

vectorizer=TfidfVectorizer()
doc_vectors=vectorizer.fit_transform(documents).toarray()

#Create a FAISS Index
dimension=doc_vectors.shape[1]
index= faiss.IndexFlatL2(dimension)   #L2 distance for similarity search
index.add(doc_vectors)    #Add vectors to the index
print(f"Indexed {len(doc_vectors)} document Vectors.")

Indexed 8 document Vectors.


In [6]:
#Implement Retrieval using FAISS

def retrieve_relevant_document_faiss(query,index,vectorizer,file_names):
    query_vector=vectorizer.transform([query]).toarray()
    distances, indices =index.search(query_vector, k=1)   # It will retrieve the most similar document
    most_similar_doc_index =indices[0][0]
    return documents[most_similar_doc_index], file_names[most_similar_doc_index]

#Sample test
query="Tell me about Ayurveda"
context, file_name=retrieve_relevant_document_faiss(query, index, vectorizer, file_names)
print(f"Retrieved Document from:{file_name}")
print(f"Document Content: {context[:200]}...")

Retrieved Document from:Numerology the Complete Guide, Volume 2_ Advanced Personality Analysis and Reading the Past, Present and Future ( PDFDrive ).pdf
Document Content: ...


In [7]:
#Load Pre-Trained GPT Models

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def generate_response(query, context):
    # Combine the context and query into a single input for the model
    input_text = f"{context} [SEP] {query}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # Generate the output
    output = model.generate(input_ids, max_length=50, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response



In [9]:
#Implement The RAG Pipeline

def rag_pipeline(query):
    # Step 1: Retrieve the most relevant document
    context, file_name = retrieve_relevant_document_faiss(query, index, vectorizer, file_names)
    print(f"Retrieved Document: {context[:200]}...\n")  # Print the first 200 characters for brevity
    print(f"Document Source: {file_name}\n")
    
    # Step 2: Generate a response using the retrieved document as context
    response = generate_response(query, context)
    return response

# Example usage
query = input("What do you want to know?")
response = rag_pipeline(query)
print(f"Generated Response: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Retrieved Document: ...

Document Source: Numerology the Complete Guide, Volume 2_ Advanced Personality Analysis and Reading the Past, Present and Future ( PDFDrive ).pdf

Generated Response:  [SEP] Tell me about Ayurveda.

Ayurveda is a Sanskrit word meaning "to be born" or "to be born". It is a Sanskrit word meaning "to be born" or "to be born
