In [None]:
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np # We'll use numpy to inspect the shape of our embeddings
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

In [None]:

def extract_text_from_pdf(pdf_path):
    """
    Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The text of all pages joined in page order. If anything goes
        wrong while opening or reading the file, the function does NOT raise;
        it returns a message of the form
        ``"An error occurred while reading the file: <error>"`` instead, so
        callers must check for that prefix before treating the result as
        document text.
    """
    try:
        # The context manager closes the document even if reading a page
        # fails, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as document:
            # Join once instead of growing a string with += for every page.
            return "".join(page.get_text() for page in document)

    except Exception as e:
        # Kept for backward compatibility: failures are reported as a string.
        return f"An error occurred while reading the file: {e}"


# Ask the user which PDF to process; surrounding whitespace from a pasted
# path is dropped before the path is used.
file_name = input("enter the path to the PDF file: ").strip()

print(f"Extracting text from file: {file_name}...")

content = extract_text_from_pdf(file_name)

# extract_text_from_pdf reports failures by returning a message with this
# prefix instead of raising. Stop here so the error text is not chunked,
# embedded and indexed as if it were the document.
if content.startswith("An error occurred while reading the file:"):
    raise RuntimeError(content)

# A PDF with no extractable text would leave the later cells with nothing
# to split or embed, so fail early with a clear message.
if not content.strip():
    raise ValueError(f"No text could be extracted from '{file_name}'.")

print("---------- Start of Content ----------")
print(content)
print("---------- End of Content ----------")
print("\nProcess completed successfully!")

In [None]:
# Split the extracted text into chunks of at most 1000 units with an overlap
# of 100 between consecutive chunks. Lengths are measured with `len`, i.e. in
# characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
text_chunks = text_splitter.split_text(content)

# Without this guard an empty result surfaces as a bare IndexError on the
# preview below; raise a descriptive error instead.
if not text_chunks:
    raise ValueError(
        "The text splitter produced no chunks; the extracted content appears to be empty."
    )

print(f"The text has been split into {len(text_chunks)} chunks.")
print("\n--- Example of the first chunk: ---\n")
print(text_chunks[0])

In [None]:
# Load the sentence-embedding model used to vectorise the chunks.
print("Loading the embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

# Encode every chunk; the result is indexed below as one vector per chunk
# (presumably a 2-D numpy array — confirm against the installed
# sentence-transformers version).
print("Generating embeddings for all text chunks...")
embeddings = model.encode(text_chunks)
print("Embeddings generated successfully.")

print(f"\nShape of our embeddings matrix: {np.shape(embeddings)}")
print(f"Number of text chunks: {len(text_chunks)}")
print(f"Dimension of each embedding vector: {len(embeddings[0])}")

# Show only what the header promises: the first five components of the first
# vector, not the whole embeddings matrix.
print("\n--- Example of the first embedding vector (first 5 values): ---")
print(embeddings[0][:5])

In [None]:
# Folder name (relative path) that the FAISS index is written to below.
VECTOR_STORE_PATH = "faiss_index"

print("Starting to build the FAISS vector store... This might take a moment.")

# Embedding wrapper handed to FAISS.from_texts together with the chunks.
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(
    texts=text_chunks,
    embedding=embedding_function,
)
print("Vector store has been built successfully in memory!")

# Write the in-memory index to VECTOR_STORE_PATH.
vector_store.save_local(VECTOR_STORE_PATH)

print(f"Vector store has been saved locally to the '{VECTOR_STORE_PATH}' folder.")

In [None]:
# Number of chunks requested per question. Defined once so the search
# setting and the report header below cannot drift apart.
TOP_K = 5
retriever = vector_store.as_retriever(search_kwargs={"k": TOP_K})

query = input("Enter your question : ")
print(f"Searching for relevant documents for: \"{query}\"\n")
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated in current LangChain releases.
relevant_docs = retriever.invoke(query)

print(f"--- Found the following relevant documents (Top {TOP_K}): ---\n")
for position, doc in enumerate(relevant_docs, start=1):
    print(f"--- Document {position} ---\n")
    print(doc.page_content)
    print("\n" + "-"*50 + "\n")

print("--- Project Complete: Semantic Retrieval Successful! ---")