In [1]:
import PyPDF2
import re

class PDFProcessor:
    def __init__(self, pdf_path, chunk_size=512, overlap=100):
        self.pdf_path = pdf_path
        self.chunk_size = chunk_size
        self.overlap = overlap
        
    def extract_text_from_pdf(self):
        """Extract text from PDF while preserving column structure"""
        text_by_page = []
        
        with open(self.pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text = page.extract_text()
                # Initial cleaning during extraction
                text = self.initial_clean(text)
                text_by_page.append(text)
        
        return text_by_page
    
    def initial_clean(self, text):
        """Initial cleaning of extracted text"""
        # Remove page numbers and headers
        text = re.sub(r'Page \d+ from \d+', '', text)
        text = re.sub(r'Hallesche Krankenversicherung.*?\n', '', text)
        text = re.sub(r'The German version is legally binding.*?\n', '', text)
        text = re.sub(r'PM 22u-e.*?\n', '', text)
        
        # Fix hyphenation issues
        text = re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', text)
        
        # Clean up extra spaces and line breaks
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()

    def format_section_content(self, text):
        """Format section content for better readability"""
        # Fix common formatting issues
        text = re.sub(r'(?<=\d)\s+(?=[a-z])', '', text)  # Fix space between number and letter in section references
        text = re.sub(r'\s+([.,;:])', r'\1', text)  # Fix spacing before punctuation
        text = re.sub(r'(\d)\s+%', r'\1%', text)  # Fix percentage formatting
        
        # Improve paragraph breaks
        text = re.sub(r'(?<=[.!?])\s+(?=[A-Z])', '\n\n', text)
        
        # Clean up any remaining multiple spaces or newlines
        text = re.sub(r'\s*\n\s*\n\s*', '\n\n', text)
        text = re.sub(r' +', ' ', text)
        
        return text.strip()

    
    def process_sections(self, text):
        """Split text into sections while handling continuations"""
        # Updated pattern to catch both main sections and subsections
        section_pattern = r'(§\s*\d+[a-z]?\s+[A-Za-z][^§]+?)(?=§\s*\d+|$)'
        sections = re.findall(section_pattern, text, re.DOTALL)
        
        processed_sections = []
        current_section = None
        
        for section in sections:
            # Extract section number and title
            section_match = re.match(r'(§\s*\d+[a-z]?)\s+([A-Z][^I\n]+)', section)
            if section_match:
                section_num, section_title = section_match.groups()
                
                # Clean and format section content
                content = section[section_match.end():].strip()
                content = self.format_section_content(content)
                
                # Combine parts marked with Roman numerals
                if re.search(r'\bII\b|\bIII\b|\bIV\b', content):
                    if current_section and current_section['number'] == section_num:
                        current_section['content'] += f"\n\n{content}"
                    else:
                        current_section = {
                            'number': section_num,
                            'title': section_title,
                            'content': content
                        }
                        processed_sections.append(current_section)
                else:
                    current_section = {
                        'number': section_num,
                        'title': section_title,
                        'content': content
                    }
                    processed_sections.append(current_section)
        
        return processed_sections

    


    def create_chunks(self, sections):
        """Create overlapping chunks while preserving section integrity"""
        chunks = []
        chunk_metadata = []
        current_chunk = ""
        current_length = 0
        
        for section in sections:
            # Format section header
            section_header = f"{section['number']} {section['title']}"
            
            # Split content into paragraphs
            paragraphs = section['content'].split('\n\n')
            current_chunk = []
            current_length = 0
            
            for para in paragraphs:
                para_words = len(para.split())
                
                if current_length + para_words <= self.chunk_size:
                    current_chunk.append(para)
                    current_length += para_words
                else:
                    # Save current chunk if it exists
                    if current_chunk:
                        chunk_text = '\n\n'.join(current_chunk)
                        chunks.append(f"{section_header}\n\n{chunk_text}")
                        print("chunks", chunks)
                        chunk_metadata.append(section_header)
                    
                    # Start new chunk
                    current_chunk = [para]
                    current_length = para_words
            
            # Add final chunk from section
            if current_chunk:
                chunk_text = '\n\n'.join(current_chunk)
                chunks.append(f"{section_header}\n\n{chunk_text}")
                chunk_metadata.append(section_header)
        
        return chunks, chunk_metadata

    def process(self):
        """Main processing pipeline"""
        print("1. Extracting text from PDF...")
        text_pages = self.extract_text_from_pdf()
        
        print("2. Processing text...")
        full_text = ' '.join(text_pages)
        
        print("3. Splitting into sections...")
        sections = self.process_sections(full_text)
        print(f"Found {len(sections)} sections")
        
        print("4. Creating chunks...")
        chunks, chunk_metadata = self.create_chunks(sections)
        print(f"Created {len(chunks)} chunks")
        
        return chunks, chunk_metadata

pdf_path = 'C:/Users/Admin/Semantic/hallesche_policy.pdf'  # Replace with your PDF path

# Initialize processor
processor = PDFProcessor(pdf_path)

# Process the PDF
chunks, chunk_metadata = processor.process()

# Print first few chunks
print("\nFirst 3 Chunks:")
for i in range(min(60, len(chunks))):
    print(f"\nChunk {i+1} Metadata: {chunk_metadata[i]}")
    print(chunks[i])
    print("-" * 80)





1. Extracting text from PDF...
2. Processing text...
3. Splitting into sections...
Found 61 sections
4. Creating chunks...
chunks ['§ 1 Subject, scope and scope of application of insurance coverage ................................ ............. 2 \n\n', '§ 2 Start of insurance coverage .............................. 3 \n\n', '§ 3 Waiting periods ................................ ................... 4 \n\n', '§ 4 Scope of the duty to render benefits ............. 5 \n\n', '§ 5 Restriction of the duty to render benefits .... 7 \n\n', '§ 6 Disbursement of the insurance benefits ....... 9 \n\n', '§ 7 End of insurance coverage ................................ 9 \n\n', '§ 8 Payment of premiums ................................ ...... 10 \n\n', '§ 8a Calculation of premiums ................................ . 11 \n\n', '§ 8b Premium adjustments ................................ ...... 12 \n\n', '§ 9 Obligations ................................ ......................... 12 \n\n', '§ 10 Consequence

In [5]:
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import pickle

class FAISSProcessor:
    def __init__(self, chunks, chunk_metadata, model_name="all-MiniLM-L6-v2"):
        self.chunks = chunks
        self.chunk_metadata = chunk_metadata
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        
    def generate_embeddings(self):
        """Generate embeddings for the text chunks using the SentenceTransformer model"""
        print("Generating embeddings for the chunks...")
        embeddings = self.model.encode(self.chunks, convert_to_tensor=False)
        return embeddings
    
    def build_faiss_index(self, embeddings):
        """Build a FAISS index from the embeddings"""
        print("Building FAISS index...")

        # Normalize embeddings
        embeddings = np.array(embeddings, dtype=np.float32)
        faiss.normalize_L2(embeddings)  # Normalize for better retrieval performance
        # Create a FAISS index for dense vectors
        dimension = embeddings.shape[1]  # The number of dimensions in the embeddings
        index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity search
        index.add(np.array(embeddings, dtype=np.float32))  # Add the embeddings to the index
        return index
    
    def save_faiss_index(self, index, filename="faiss_index.index"):
        """Save the FAISS index to disk for later use"""
        print(f"Saving FAISS index to {filename}...")
        faiss.write_index(index, filename)
    
    def save_chunk_metadata(self, filename="chunk_metadata.pkl"):
        """Save chunk metadata for later reference"""
        print(f"Saving chunk metadata to {filename}...")
        with open(filename, 'wb') as f:
            pickle.dump(self.chunk_metadata, f)

    def save_chunks(self, filename="chunk.pkl"):
        """Save chunk metadata for later reference"""
        print(f"Saving chunk metadata to {filename}...")
        with open(filename, 'wb') as f:
            pickle.dump(self.chunks, f)
    
    def process(self):
        """Main pipeline to generate embeddings, build and save the FAISS index"""
        # Generate embeddings for chunks
        embeddings = self.generate_embeddings()
        
        # Build the FAISS index
        index = self.build_faiss_index(embeddings)
        
        # Save the FAISS index
        self.save_faiss_index(index)
        
        # Save chunk metadata for later use
        self.save_chunk_metadata()

        # Save chunk  for later use
        self.save_chunks()

class BM25Processor:
    def __init__(self, chunks):
        self.chunks = chunks
        self.tokenized_chunks = [chunk.split() for chunk in self.chunks]
        self.bm25 = BM25Okapi(self.tokenized_chunks)
    
    def save_bm25(self, filename="bm25_index.pkl"):
        print(f"Saving BM25 index to {filename}...")
        with open(filename, 'wb') as f:
            pickle.dump(self.bm25, f)
    
    def process(self):
        self.save_bm25()



# Example usage:
def main():
    faiss_processor = FAISSProcessor(chunks, chunk_metadata)
    faiss_processor.process()
    
    bm25_processor = BM25Processor(chunks)
    bm25_processor.process()

if __name__ == "__main__":
    main()


Generating embeddings for the chunks...
Building FAISS index...
Saving FAISS index to faiss_index.index...
Saving chunk metadata to chunk_metadata.pkl...
Saving chunk metadata to chunk.pkl...
Saving BM25 index to bm25_index.pkl...
