In [10]:
# Step 1: Import dependencies
from pypdf import PdfReader
from docx import Document
import os
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [11]:
def load_file(file_path: str) -> str:
    """Read a document from disk and return its text as one string.

    The loader is picked from the file extension, compared case-insensitively:

    * ``.pdf``  - the text of every page that yields any, joined with newlines.
    * ``.txt`` / ``.md`` - the file contents decoded as UTF-8; a leading
      byte-order mark, if present, is dropped.
    * ``.docx`` - the text of each entry in ``doc.paragraphs``, joined with
      newlines.

    Args:
        file_path: Path of the document to load.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        reader = PdfReader(file_path)
        # Extract each page exactly once: calling extract_text() both in the
        # filter and in the result would do the extraction work twice per page.
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages that yield no text (empty string / None) are skipped.
        return "\n".join(text for text in page_texts if text)

    if ext in ('.txt', '.md'):
        # 'utf-8-sig' reads plain UTF-8 identically but strips a leading BOM,
        # which would otherwise show up as '\ufeff' at the start of the text.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()

    if ext == '.docx':
        doc = Document(file_path)
        # NOTE(review): only doc.paragraphs is read; text stored elsewhere in
        # the document (e.g. tables) is presumably not included - confirm.
        return "\n".join(para.text for para in doc.paragraphs)

    raise ValueError(f"Unsupported file type: {ext}")


In [12]:
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into a list of overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (presumably a length in characters - confirm against the
            splitter's length function).
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text``.
    """
    # Candidate split points, listed from paragraph breaks down to the
    # empty string.
    boundary_preference = ["\n\n", "\n", ".", " ", ""]
    return RecursiveCharacterTextSplitter(
        separators=boundary_preference,
        chunk_overlap=overlap,
        chunk_size=chunk_size,
    ).split_text(text)


In [13]:
# Load the source document and split it into chunks.
file_path = "BAJHLIP23020V012223.pdf"  # change this to your file path
raw_text = load_file(file_path)
chunks = chunk_text(raw_text)

# Print only the first five chunks as a quick sanity check of the split.
preview = chunks[:5]
for i, chunk in enumerate(preview):
    print(f"Chunk {i+1}:\n{chunk}\n{'-'*80}")

Chunk 1:
UIN- BAJHLIP23020V012223                                 Global Health Care/ Policy Wordings/Page 1 
 
 
Bajaj Allianz General Insurance Co. Ltd.                       
Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 
For more details, log on to: www.bajajallianz.com | E-mail: bagichelp@bajajallianz.co.in or 
Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (Toll Free No.) 
Issuing Office: 
 
GLOBAL HEALTH CARE 
 
 
Policy Wordings
--------------------------------------------------------------------------------
Chunk 2:
Issuing Office: 
 
GLOBAL HEALTH CARE 
 
 
Policy Wordings 
 
UIN- BAJHLIP23020V012223 
SECTION A) PREAMBLE 
 
Whereas the Insured described in the Policy Schedule hereto (hereinafter called the ‘Insured’  or “Policyholder” or 
“Insured Person”) has made to Bajaj Allianz General Insurance Company Limited (hereinafter called the “Company” 
or “Insurer” or “Insurance Company”) a proposal or Proposal as mentioned in the transcrip

In [15]:
# Report how many chunks the split produced.
print(f"Total Chunks: {len(chunks)}")

# Persist the chunk list to disk with pickle.
with open("chunks.pkl", mode="wb") as pickle_file:
    pickle.Pickler(pickle_file).dump(chunks)

print("✅ Saved chunks as chunks.pkl")

Total Chunks: 481
✅ Saved chunks as chunks.pkl
