In [1]:
# Mount Google Drive at /content/drive; the input folder and the output CSV
# configured in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
# Root folder on Drive; each immediate sub-folder is treated as one department
# and its PDFs are processed by traverse_and_process().
# NOTE(review): both constants are assigned again in the final run cell.
ROOT_FOLDER = '/content/drive/MyDrive/Data Folder/KMRL'
# Destination CSV; rows are appended to it one page at a time.
OUTPUT_CSV = '/content/drive/MyDrive/all_extracted.csv'


In [3]:
!pip install PyMuPDF pillow pytesseract
!apt-get install tesseract-ocr


Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyMuPDF
Successfully installed PyMuPDF-1.26.4 pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [6]:
# Point pytesseract at the Tesseract executable explicitly instead of relying
# on PATH lookup (the tesseract-ocr package is installed via apt in this notebook).
import pytesseract
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'


In [7]:
import os
import csv
import io
from pathlib import Path
import pymupdf
from PIL import Image, ImageOps, ImageFilter
import pytesseract
import re
import unicodedata

In [8]:
def clean_text_english(text):
    """Normalize text and reduce it to a compact ASCII-oriented form.

    Steps, in order:
      1. Apply Unicode NFKC normalization (e.g. full-width letters become
         their plain counterparts).
      2. Replace every character that is not an ASCII letter, digit,
         whitespace, or one of ``. , ; : ! ? ( ) ' - " @ % $ &`` with a space.
      3. Collapse each run of whitespace into a single space and trim the ends.

    Args:
        text: Input string.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    normalized = unicodedata.normalize("NFKC", text)
    # Disallowed characters become spaces (not deleted) so neighbouring words
    # do not get glued together.
    filtered = re.sub(r"[^A-Za-z0-9\s.,;:!?()'\-\"@%$&]", " ", normalized)
    collapsed = re.sub(r'\s+', ' ', filtered)
    return collapsed.strip()

def chunk_text(text, max_length=1000, overlap=200):
    """Split ``text`` into overlapping, fixed-size character windows.

    Consecutive chunks share ``overlap`` characters. The last chunk ends
    exactly at the end of ``text`` and may be shorter than ``max_length``.

    Args:
        text: String to split.
        max_length: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < max_length``.

    Returns:
        List of chunk strings in order; empty list for empty ``text``.

    Raises:
        ValueError: If ``max_length`` or ``overlap`` is out of range. A step
            of ``max_length - overlap <= 0`` would never advance, and a
            negative overlap would silently skip text between chunks.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    step = max_length - overlap
    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + max_length, total)
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text we are done; stepping
        # further would only emit a tail that is fully contained in this chunk.
        if end == total:
            break
        start += step
    return chunks

In [9]:
def append_chunks_to_csv(chunks, output_csv, filename, department=None, page_number=None):
    """Append one CSV row per chunk to ``output_csv``.

    The file is opened in append mode. A header row is written first when the
    file does not exist yet or exists but is empty, so the CSV always starts
    with the column names.

    Args:
        chunks: Iterable of text chunks; each becomes one row.
        output_csv: Path of the CSV file to append to.
        filename: Value stored in the ``source_file`` column.
        department: Value stored in the ``department`` column.
        page_number: Value stored in the ``page`` column.

    Columns written: ``source_file``, ``department``, ``page``,
    ``chunk_index`` (0-based position of the chunk within ``chunks``), ``text``.
    """
    # An existing zero-byte file (e.g. pre-created or left by an interrupted
    # run) still needs a header, so check the size and not just existence.
    needs_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
    with open(output_csv, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['source_file','department','page','chunk_index','text'])
        if needs_header:
            writer.writeheader()
        writer.writerows(
            {
                'source_file': filename,
                'department': department,
                'page': page_number,
                'chunk_index': idx,
                'text': chunk
            }
            for idx, chunk in enumerate(chunks)
        )

In [10]:
def process_pdf(file_path, department, output_csv):
    """Extract text from every page of one PDF and append it to the CSV.

    For each page the text layer and the OCR output of every embedded image
    are concatenated, cleaned with ``clean_text_english``, split with
    ``chunk_text`` and appended via ``append_chunks_to_csv``. Pages that yield
    no text after cleaning are skipped.

    Failures are reported with ``print`` and never raised for a single bad
    file or image: a PDF that cannot be opened is skipped entirely, and an
    image that cannot be extracted, preprocessed or OCR'd is skipped on its own.

    Args:
        file_path: Path of the PDF file.
        department: Department label stored with every row.
        output_csv: Path of the CSV file rows are appended to.

    Returns:
        None.
    """
    try:
        doc = pymupdf.open(file_path)
    except Exception as e:
        print(f"Failed to open {file_path}: {e}")
        return

    # Upscale factor applied before OCR (presumably 72 -> 300 DPI) and the
    # largest allowed side, in pixels, of the image handed to Tesseract.
    ocr_scale = 300 / 72
    max_ocr_side = 2500

    try:
        for page_number, page in enumerate(doc, start=1):
            text_parts = []

            # Extract PDF text layer (index 4 of each block tuple is its text)
            for block in page.get_text("blocks"):
                txt = block[4].strip()
                if txt:
                    text_parts.append(txt)

            # Extract images & OCR
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                try:
                    img_data = doc.extract_image(xref)
                    image_bytes = img_data["image"]
                    image = Image.open(io.BytesIO(image_bytes))
                except Exception as e:
                    print(f"{file_path} Page {page_number} Image {img_index} extraction error: {e}")
                    continue

                # Preprocess + OCR live in one guarded section so that one
                # unsupported image cannot abort the whole document.
                try:
                    # Rank filters reject palette images, so expand them first.
                    if image.mode == "P":
                        image = image.convert("RGB")
                    filtered = image.filter(ImageFilter.MedianFilter(size=3))
                    gray = ImageOps.grayscale(filtered)

                    # Use a single scale factor for both axes so the size cap
                    # never distorts the aspect ratio of the glyphs.
                    scale = min(
                        ocr_scale,
                        max_ocr_side / gray.width,
                        max_ocr_side / gray.height,
                    )
                    new_size = (
                        max(1, int(gray.width * scale)),
                        max(1, int(gray.height * scale)),
                    )
                    gray_resized = gray.resize(new_size, Image.LANCZOS)

                    ocr_text = pytesseract.image_to_string(gray_resized)
                except Exception as e:
                    print(f"{file_path} Page {page_number} Image {img_index} OCR error: {e}")
                    continue

                text_parts.append(ocr_text)

            # Clean & chunk
            cleaned_text = clean_text_english(" ".join(text_parts))
            if not cleaned_text:
                continue

            chunks = chunk_text(cleaned_text, max_length=1000, overlap=200)

            # Append to CSV
            append_chunks_to_csv(
                chunks,
                output_csv,
                filename=os.path.basename(file_path),
                department=department,
                page_number=page_number
            )
            print(f"{file_path} Page {page_number}: {len(chunks)} chunks written.")
    finally:
        # Release the file handle even if a page fails or the run is interrupted.
        doc.close()


In [11]:
# -------------------------------
# 5. Traverse KMRL folder and process PDFs
# -------------------------------
def traverse_and_process(root_folder, output_csv, dept_list=None):
    """Process every PDF found directly inside each department sub-folder.

    Each immediate sub-directory of ``root_folder`` is treated as a
    department; its name is passed to ``process_pdf`` together with each PDF
    file it directly contains. Departments and files are visited in sorted
    order so repeated runs produce rows in the same order, and the ``.pdf``
    extension is matched case-insensitively.

    Args:
        root_folder: Folder whose sub-directories are the departments.
        output_csv: CSV path forwarded to ``process_pdf``.
        dept_list: Optional collection of department names to process;
            ``None`` (or an empty collection) means all departments.
    """
    root_path = Path(root_folder)
    for dept_folder in sorted(root_path.iterdir()):
        if not dept_folder.is_dir():
            continue
        department = dept_folder.name
        if dept_list and department not in dept_list:
            continue  # skip non-listed departments
        print(f"\nProcessing Department: {department}")
        # Compare the lower-cased suffix so "REPORT.PDF" is not silently missed.
        pdf_files = sorted(
            entry for entry in dept_folder.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".pdf"
        )
        for pdf_file in pdf_files:
            print(f"Processing PDF: {pdf_file}")
            process_pdf(pdf_file, department, output_csv)

In [12]:
# Run the full extraction.
# NOTE: rows are appended to OUTPUT_CSV, so re-running this cell (or resuming
# after an interrupt) adds the same pages again unless the CSV is removed first.
ROOT_FOLDER = '/content/drive/MyDrive/Data Folder/KMRL'   # KMRL folder in Drive
OUTPUT_CSV = '/content/drive/MyDrive/all_extracted.csv'
# None means "no filter": every department sub-folder is processed.
DEPARTMENTS_TO_PROCESS = None

traverse_and_process(ROOT_FOLDER, OUTPUT_CSV, DEPARTMENTS_TO_PROCESS)
print(f"\nAll done! CSV saved to {OUTPUT_CSV}")


Processing Department: Finance
Processing PDF: /content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf Page 1: 2 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf Page 2: 3 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf Page 3: 2 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf Page 4: 4 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf Page 5: 3 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf Page 6: 3 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metro-Rail-Ltd_122025123422432.pdf Page 7: 2 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Finance/Kochi-Metr



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-Improvement-Survey.pdf Page 35: 3 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-Improvement-Survey.pdf Page 36: 4 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-Improvement-Survey.pdf Page 37: 5 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-Improvement-Survey.pdf Page 38: 4 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-Improvement-Survey.pdf Page 39: 2 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-Improvement-Survey.pdf Page 40: 4 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-Improvement-Survey.pdf Page 41: 3 chunks written.
/content/drive/MyDrive/Data Folder/KMRL/Operations/Kochi-Metro-Ridership-I

KeyboardInterrupt: 