In [7]:
# Install required packages
!pip install pymupdf pytesseract pillow langchain sentence-transformers faiss-cpu transformers

import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# 1. Text Extraction Function
def extract_text_from_pdf(pdf_path, languages=['eng', 'hin', 'ben', 'chi_sim']):
    """Extract the text of every page of a PDF, using OCR for pages without text.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        languages: Tesseract language codes tried in order on pages that have
            no embedded text. The first language whose OCR call does not raise
            is used for that page and the remaining ones are skipped. The
            default list is only read, never modified.

    Returns:
        str: The text of all pages concatenated in page order. A page for
        which every OCR attempt fails contributes nothing (best effort).
    """
    parts = []

    # The context manager closes the document even if a page raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Digital PDFs carry an embedded text layer.
            page_text = page.get_text()
            if page_text.strip():
                parts.append(page_text)
                continue

            # No embedded text: render the page to an image and OCR it.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))

            for lang in languages:
                try:
                    parts.append(pytesseract.image_to_string(img, lang=lang))
                    break
                except Exception:
                    # Best effort: e.g. missing language data; try the next
                    # language. KeyboardInterrupt/SystemExit still propagate.
                    continue

    return "".join(parts)

# 2. Text Processing
def process_text(text):
    """Split raw text into overlapping chunks suitable for embedding.

    Args:
        text: The full document text.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``
        configured with a chunk size of 500 and an overlap of 50.
    """
    # Paragraph and line breaks first, then sentence-ending marks used by
    # several scripts (danda, ideographic full stop, Arabic question mark, "!").
    boundaries = ["\n\n", "\n", "।", "。", "؟", "!"]
    splitter = RecursiveCharacterTextSplitter(
        separators=boundaries,
        chunk_overlap=50,
        chunk_size=500,
    )
    pieces = splitter.split_text(text)
    return pieces

# 3. Embedding and Vector Store Setup
# Shared sentence encoder, loaded once at module level; create_vector_store()
# and RAGSystem.query() both read this global to embed chunks and questions.
model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)

def create_vector_store(chunks):
    """Embed text chunks and load them into a flat L2 FAISS index.

    Args:
        chunks: Sequence of text chunks, embedded with the module-level
            ``model``.

    Returns:
        The populated ``faiss.IndexFlatL2``. Vectors are added in the order of
        ``chunks``, so a search result id can be used to index ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty (for example because no text could
            be extracted from the PDF), since there is nothing to index.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot build a vector store from an empty list of chunks.")

    # FAISS expects a C-contiguous float32 matrix of shape (n_chunks, dimension).
    embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# 4. Retrieval and Generation
class RAGSystem:
    """Retrieval-augmented question answering over a fixed list of text chunks.

    Attributes are set in ``__init__``: ``index`` (the vector index that is
    searched), ``chunks`` (the texts the index ids refer to) and ``llm`` (a
    ``text2text-generation`` pipeline using ``google/flan-t5-small``).
    """

    def __init__(self, index, chunks):
        """Store the index and chunks and load the generation pipeline.

        Args:
            index: Vector index exposing ``search(query_vectors, k)``.
            chunks: Sequence of texts; position ``i`` must correspond to
                vector id ``i`` in ``index``.
        """
        self.index = index
        self.chunks = chunks
        self.llm = pipeline(
            "text2text-generation",
            model="google/flan-t5-small",
            max_length=500
        )

    def query(self, question, k=3):
        """Answer ``question`` using the ``k`` most similar chunks as context.

        Args:
            question: The question text.
            k: Maximum number of chunks to retrieve. It is capped at the number
                of available chunks.

        Returns:
            str: The ``generated_text`` of the first pipeline result.
        """
        # Never ask for more neighbours than there are chunks: FAISS pads
        # missing results with -1, which would silently select chunks[-1].
        k = min(k, len(self.chunks))

        hits = []
        if k > 0:
            # Embed the question with the module-level encoder and search.
            query_embedding = model.encode([question])
            _, indices = self.index.search(query_embedding, k)
            # Drop padding / out-of-range ids defensively.
            hits = [int(i) for i in indices[0] if 0 <= i < len(self.chunks)]

        # Join the retrieved chunks in ranking order (no reranking is done).
        context = "\n".join(self.chunks[i] for i in hits)

        # Generate answer
        return self.llm(f"answer: {question} context: {context}")[0]['generated_text']

# Usage Example
if __name__ == "__main__":
    # Absolute path of the PDF to index; adjust it for your machine.
    # This cell previously also assigned the Blue Ocean Strategy PDF to
    # file_path, but that value was overwritten on the very next line and never
    # used, so only the path that actually took effect is kept.
    file_path = r"C:\Users\sahil\Desktop\project\data\Reboot_Leadership_and_the_Art_of.pdf"

    # Extract text
    text = extract_text_from_pdf(file_path)

    # Process text
    chunks = process_text(text)

    # Create vector store
    index = create_vector_store(chunks)

    # Initialize RAG system
    rag = RAGSystem(index, chunks)

    # Query examples; both questions are answered from the single PDF indexed
    # above, so the Blue Ocean question has no matching source document here.
    for question in (
        "What is the Blue Ocean Strategy?",
        "What is the core belief about leadership presented in Reboot: Leadership and the Art of Growing Up?",
    ):
        print(rag.query(question))



Device set to use cpu


We launch a business and we launch a strategy.
wisdom


In [9]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# Module-level models shared by the functions below.
# `model` embeds chunks and questions.
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
# `retriever_pipeline` is the text2text generator that RAGSystem.query() calls
# with the assembled prompt.
retriever_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    max_length=500,
)

# 1. Optimized Text Extraction

def extract_text_from_pdf(pdf_path, languages=['eng', 'hin', 'ben', 'chi_sim']):
    """Extract the text of every page of a PDF, using OCR for pages without text.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        languages: Tesseract language codes tried in order on pages that have
            no embedded text. The first language whose OCR call does not raise
            is used for that page and the remaining ones are skipped. The
            default list is only read, never modified.

    Returns:
        str: The page texts in page order, each followed by a newline, with
        leading and trailing whitespace of the whole result stripped. A page
        for which every OCR attempt fails contributes nothing (best effort).
    """
    parts = []

    # The context manager closes the document even if a page raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text("text")
            if page_text.strip():
                parts.append(page_text + "\n")
                continue

            # No embedded text: render the page to an image and OCR it.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))
            for lang in languages:
                try:
                    parts.append(pytesseract.image_to_string(img, lang=lang) + "\n")
                    break
                except Exception:
                    # Best effort: e.g. missing language data; try the next
                    # language. KeyboardInterrupt/SystemExit still propagate.
                    continue

    return "".join(parts).strip()

# 2. Efficient Text Processing

def process_text(text):
    """Split raw text into overlapping chunks suitable for embedding.

    Args:
        text: The full document text.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``
        configured with a chunk size of 600 and an overlap of 100.
    """
    # Paragraph and line breaks first, then sentence-ending marks used by
    # several scripts (danda, ideographic full stop, Arabic question mark, "!").
    boundaries = ["\n\n", "\n", "।", "。", "؟", "!"]
    splitter = RecursiveCharacterTextSplitter(
        separators=boundaries,
        chunk_overlap=100,
        chunk_size=600,
    )
    pieces = splitter.split_text(text)
    return pieces

# 3. Optimized Embedding and Vector Storage with Cosine Similarity

def create_vector_store(chunks):
    """Embed text chunks and load them into a flat inner-product FAISS index.

    The embeddings are requested L2-normalised, so the inner product computed
    by ``IndexFlatIP`` equals cosine similarity.

    Args:
        chunks: Sequence of text chunks, embedded with the module-level
            ``model``.

    Returns:
        tuple: ``(index, embeddings)`` where ``index`` is the populated
        ``faiss.IndexFlatIP`` and ``embeddings`` is the float32 matrix that was
        added to it, one row per chunk in the order of ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty (for example because no text could
            be extracted from the PDF), since there is nothing to index.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot build a vector store from an empty list of chunks.")

    # FAISS expects a C-contiguous float32 matrix of shape (n_chunks, dimension).
    embeddings = np.ascontiguousarray(
        model.encode(chunks, convert_to_numpy=True, normalize_embeddings=True),
        dtype=np.float32,
    )
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Using Inner Product (cosine similarity)
    index.add(embeddings)
    return index, embeddings

# 4. Enhanced Retrieval and Answer Generation

class RAGSystem:
    """Retrieval-augmented question answering over a fixed list of text chunks.

    Retrieval uses the module-level ``model`` to embed the question and
    generation uses the module-level ``retriever_pipeline``.
    """

    def __init__(self, index, chunks, embeddings):
        """Store the search index, the chunk texts and their embeddings.

        Args:
            index: Vector index exposing ``search(query_vectors, k)``.
            chunks: Sequence of texts; position ``i`` must correspond to
                vector id ``i`` in ``index``.
            embeddings: The chunk embeddings; stored on the instance but not
                read by ``query``.
        """
        self.index = index
        self.chunks = chunks
        self.embeddings = embeddings

    def query(self, question, k=3):
        """Answer ``question`` using the ``k`` most similar chunks as context.

        Args:
            question: The question text.
            k: Maximum number of chunks to retrieve. It is capped at the number
                of available chunks.

        Returns:
            str: The ``generated_text`` of the first pipeline result.
        """
        # Never ask for more neighbours than there are chunks: FAISS pads
        # missing results with -1, which would silently select chunks[-1].
        k = min(k, len(self.chunks))

        hits = []
        if k > 0:
            query_embedding = model.encode([question], convert_to_numpy=True, normalize_embeddings=True)
            _, indices = self.index.search(query_embedding, k)
            # Drop padding / out-of-range ids defensively.
            hits = [int(i) for i in indices[0] if 0 <= i < len(self.chunks)]

        # Join the retrieved chunks in ranking order.
        context = "\n".join(self.chunks[i] for i in hits)

        prompt = f"Answer the question accurately using the context provided.\n\nQuestion: {question}\nContext: {context}\nAnswer:"
        return retriever_pipeline(prompt)[0]['generated_text']

# Main Execution
if __name__ == "__main__":
    # Absolute path of the PDF to index; adjust it for your machine.
    file_path = r"C:\Users\sahil\Desktop\project\data\Reboot_Leadership_and_the_Art_of.pdf"

    # Extract -> chunk -> embed/index, each stage feeding the next.
    text = extract_text_from_pdf(file_path)
    chunks = process_text(text)
    index, embeddings = create_vector_store(chunks)

    # Wire the retrieval pieces together.
    rag = RAGSystem(index, chunks, embeddings)

    # Ask each sample question in turn and print the generated answer.
    for question in (
        "What is the Blue Ocean Strategy?",
        "What is the core belief about leadership presented in Reboot: Leadership and the Art of Growing Up?",
    ):
        print(rag.query(question))


Device set to use cpu


Survival strategy
An tpn of Dedication
