In [5]:
# Install required packages
!pip install pymupdf pytesseract pillow langchain sentence-transformers faiss-cpu transformers torch torchvision torchaudio

import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import faiss
import numpy as np

# 1. Text Extraction Function
def extract_text_from_pdf(pdf_path, languages=('eng', 'hin', 'ben', 'chi_sim')):
    """Extract the text of every page of a PDF, using OCR for image-only pages.

    Pages that yield non-blank text from ``page.get_text("text")`` are read
    directly. Any other page is rendered to an image and passed to Tesseract.

    Args:
        pdf_path: Path of the PDF file to open.
        languages: Tesseract language code(s) to recognise; a single string or
            an iterable of strings. All codes are first tried together in one
            call (joined with ``+``); if that call fails, each code is tried on
            its own, in order, until one succeeds.

    Returns:
        The page texts joined by newlines, with leading and trailing
        whitespace stripped. A page whose OCR fails for every attempt
        contributes nothing (the errors are printed, not raised).
    """
    if isinstance(languages, str):
        languages = [languages]
    else:
        languages = list(languages)

    # Trying the codes one at a time stops at the first language that does not
    # raise, so later languages would never be used while the first one is
    # installed. Ask for all of them at once first, then fall back to single
    # languages in case the combined call fails.
    ocr_attempts = []
    if len(languages) > 1:
        ocr_attempts.append("+".join(languages))
    ocr_attempts.extend(languages)

    pages = []
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Try text extraction for digital PDFs
            page_text = page.get_text("text")
            if page_text.strip():
                pages.append(page_text)
                continue

            # Scanned page: render it and run OCR on the image.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))

            for lang in ocr_attempts:
                try:
                    ocr_text = pytesseract.image_to_string(img, lang=lang)
                except Exception as e:
                    # Best effort: report the failure and try the next option.
                    print(f"Error using language {lang}: {e}")
                else:
                    pages.append(ocr_text)
                    break

    return "\n".join(pages).strip()

# 2. Text Processing (Improved chunking)
def process_text(text):
    """Split *text* into overlapping chunks and return them as a list.

    The splitter is configured with a chunk size of 400 and an overlap of 100,
    and breaks on paragraph and line breaks first, then on the sentence
    terminators listed below.
    """
    # Ordered from coarsest to finest boundary; includes the Devanagari danda
    # and full-width CJK terminators alongside the ASCII ones.
    boundary_markers = ["\n\n", "\n", ".", "?", "!", "।", "。", "？"]

    splitter = RecursiveCharacterTextSplitter(
        separators=boundary_markers,
        chunk_size=400,
        chunk_overlap=100,
    )
    pieces = splitter.split_text(text)
    return pieces

# 3. Embedding and Vector Store Setup
# Sentence-embedding model shared by the whole pipeline. It is loaded once,
# when this cell/module is executed, and is used both by create_vector_store()
# to embed chunks and by RAGSystem.query() to embed questions, so documents
# and queries are encoded by the same model.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

def create_vector_store(chunks):
    """Embed *chunks* and index the embeddings in a FAISS inner-product index.

    Embeddings are L2-normalised by the encoder, so the inner product used by
    ``IndexFlatIP`` equals cosine similarity.

    Args:
        chunks: Non-empty sequence of text chunks to embed.

    Returns:
        A tuple ``(index, embeddings)``: the populated FAISS index and the
        float32 embedding matrix, one row per chunk in the same order.

    Raises:
        ValueError: If *chunks* is empty (for example when no text could be
            extracted from the document), since the embedding dimension
            cannot be determined from zero embeddings.
    """
    if len(chunks) == 0:
        raise ValueError(
            "Cannot build a vector store from an empty list of chunks."
        )

    embeddings = model.encode(chunks, normalize_embeddings=True)
    # FAISS requires C-contiguous float32 rows; this is a no-op when the
    # encoder already returns that layout.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner Product (for Cosine Similarity)
    index.add(embeddings)
    return index, embeddings

# 4. Retrieval and Generation (Improved Retrieval)
class RAGSystem:
    """Answer questions from retrieved text chunks with a seq2seq model.

    A question is embedded with the module-level ``model``, the nearest chunks
    are looked up in ``index``, and the chunks are placed in a prompt for a
    ``google/flan-t5-small`` text2text-generation pipeline.
    """

    def __init__(self, index, embeddings, chunks):
        """Store the retrieval artefacts and load the generation pipeline.

        Args:
            index: Index object exposing ``search(query, k)``; row ``i`` of
                the index must correspond to ``chunks[i]``.
            embeddings: Embedding matrix indexable by chunk position.
            chunks: Sequence of text chunks, in the same order as the index.
        """
        self.index = index
        self.embeddings = embeddings
        self.chunks = chunks
        self.llm = pipeline("text2text-generation", model="google/flan-t5-small", max_length=500)

    def query(self, question, k=3):
        """Generate an answer to *question* from the ``k`` most similar chunks.

        Args:
            question: The question text.
            k: Maximum number of chunks to retrieve. It is capped at the
                number of available chunks.

        Returns:
            The text generated by the language model.
        """
        query_embedding = model.encode([question], normalize_embeddings=True)

        # Never request more neighbours than there are chunks: FAISS pads
        # missing results with -1, and -1 used as a list index would silently
        # select the last chunk.
        k = min(k, len(self.chunks))
        hit_ids = []
        if k > 0:
            _, indices = self.index.search(query_embedding, k)
            hit_ids = [int(i) for i in indices[0] if i >= 0]

        sorted_chunks = []
        if hit_ids:
            # Score all hits in one call and order them best-first. Sorting on
            # the float score only (stable) avoids comparing tensors or chunk
            # strings when two scores tie.
            scores = util.cos_sim(query_embedding, self.embeddings[hit_ids])[0].tolist()
            ranked = sorted(zip(scores, hit_ids), key=lambda pair: pair[0], reverse=True)
            sorted_chunks = [self.chunks[i] for _, i in ranked]

        # Format context properly
        context = "\n".join(sorted_chunks)

        prompt = f"Answer the following question based on the provided context:\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"

        return self.llm(prompt)[0]['generated_text']

# Usage Example
if __name__ == "__main__":
    # Absolute paths to the sample PDFs. Both are kept as named candidates;
    # only the one assigned to file_path below is processed. (Previously the
    # first path was assigned to file_path and immediately overwritten.)
    english_pdf = r"C:/Users/sahil/Desktop/project/data/en/Reboot_Leadership_and_the_Art_of.pdf"
    chinese_pdf = r"C:/Users/sahil/Desktop/project/data/zh/1553a07b-9f53-4e8b-9987-ae714000b95b.pdf"
    file_path = chinese_pdf

    # Extract text
    text = extract_text_from_pdf(file_path)

    # Process text
    chunks = process_text(text)

    # Create vector store
    index, embeddings = create_vector_store(chunks)

    # Initialize RAG system; `rag` stays at module level so it can be queried
    # after this block has run.
    rag = RAGSystem(index, embeddings, chunks)

    # Query example
    #print(rag.query("What is the core belief about leadership presented in Reboot: Leadership and the Art of Growing Up?"))




Device set to use cpu


CNCERN/CC,, , , , , , , 


In [13]:
# Ask the RAG system built above a question and print the generated answer.
print(rag.query("What is the core belief about leadership presented in Reboot: Leadership and the Art of Growing Up?"))


Leadership is the art of growing up


In [15]:
# Second query against the same `rag` instance; uses the default k=3.
print(rag.query("What role does self-inquiry play in leadership according to the book?"))

Lead from the place of your truest self


In [17]:
# Third query against the same `rag` instance.
# NOTE(review): the question string starts with a space, presumably
# unintentional; confirm before changing, since it is part of the input.
print(rag.query(" What is the significance of ‘taking your seat’ in leadership?"))

It is a journey of self-actualization
