In [4]:
!sudo apt update
!sudo apt install -y poppler-utils tesseract-ocr libtesseract-dev
!pip install pdf2image pytesseract pillow opencv-python tqdm




[33m0% [Working][0m            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
[33m0% [Waiting for headers] [Connected to cloud.r-project.org (108.157.173.89)] [C[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.[0m                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
[33m0% [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)] [0m                                                                               Hit:4 https://cli.github.com/packages stable InRelease
[33m0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.[0m                                                                               Hit:5 https://cloud.r-project.org/bin/linux/ub

In [6]:
from pdf2image import convert_from_path
from PIL import Image
import numpy as np
import pytesseract
from pytesseract import Output
import cv2
import json
from datetime import datetime
from tqdm import tqdm

# Preprocess image for better OCR
def preprocess_pil_image(pil_img, upscale_width=1600):
    img = np.array(pil_img.convert("RGB"))
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    h, w = gray.shape
    if w < upscale_width:
        scale = upscale_width / w
        gray = cv2.resize(gray, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_CUBIC)
    gray = cv2.bilateralFilter(gray, 9, 75, 75)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 15, 9)
    return thresh

# Extract lines from image
def ocr_lines_from_image(image_for_ocr):
    if len(image_for_ocr.shape) == 2:
        ocr_input = cv2.cvtColor(image_for_ocr, cv2.COLOR_GRAY2RGB)
    else:
        ocr_input = image_for_ocr
    data = pytesseract.image_to_data(ocr_input, lang='eng', output_type=Output.DICT)
    lines = {}
    n = len(data['text'])
    for i in range(n):
        text = data['text'][i].strip()
        if text == "":
            continue
        key = (data['block_num'][i], data['par_num'][i], data['line_num'][i])
        if key not in lines:
            lines[key] = {'words': [], 'heights': [], 'tops': []}
        lines[key]['words'].append(text)
        lines[key]['heights'].append(int(data['height'][i]))
        lines[key]['tops'].append(int(data['top'][i]))
    out_lines = []
    for key, info in lines.items():
        text_line = " ".join(info['words']).strip()
        avg_height = int(np.median(info['heights']))
        top = int(np.min(info['tops']))
        out_lines.append({'text': text_line, 'height': avg_height, 'top': top})
    return out_lines

# Filter headline candidates
def select_headline_candidates(lines, min_words=3, max_words=20):
    candidates = []
    for l in lines:
        wcount = len(l['text'].split())
        if wcount < min_words or wcount > max_words:
            continue
        txt_lower = l['text'].lower()
        if any(tok in txt_lower for tok in ['advert', 'page', 'edition']):
            continue
        if l['text'].strip().isdigit():
            continue
        candidates.append(l)
    if not candidates:
        return []
    heights = [c['height'] for c in candidates]
    median_h = int(np.median(heights))
    strong = [c for c in candidates if c['height'] >= max(median_h, 18)]
    if not strong:
        strong = candidates
    strong_sorted = sorted(strong, key=lambda x: x['top'])
    final = []
    seen = set()
    for s in strong_sorted:
        t = s['text'].strip()
        if t in seen:
            continue
        seen.add(t)
        final.append(t)
    return final

# Main PDF processing
def process_pdf(pdf_path, pages_to_process=[1]):
    images = convert_from_path(pdf_path, dpi=300, fmt='jpeg',
                               first_page=min(pages_to_process),
                               last_page=max(pages_to_process))
    results = []
    for idx, img in zip(pages_to_process, images):
        print(f"[+] Processing page {idx} ...")
        thresh = preprocess_pil_image(img)
        lines = ocr_lines_from_image(thresh)
        candidates = select_headline_candidates(lines)
        results.append({'page': idx, 'headlines': candidates})
    out = {
        'extracted_at': datetime.utcnow().isoformat() + "Z",
        'source_pdf': pdf_path,
        'pages': results
    }
    return out


In [7]:
from google.colab import files
uploaded = files.upload()
pdf_filename = list(uploaded.keys())[0]  # Get uploaded file name



Saving The-Hindu-11-Aug-25.pdf to The-Hindu-11-Aug-25.pdf


In [10]:
result = process_pdf(pdf_filename, pages_to_process=[6])
# Process the entire PDF (all pages)
#result = process_pdf(pdf_filename)


[+] Processing page 6 ...


In [12]:

"""
from pdf2image import pdfinfo_from_path

# Get total pages in PDF
info = pdfinfo_from_path(pdf_filename)
total_pages = info["Pages"]

# Make list of all page numbers [1, 2, 3, ..., total_pages]
pages_list = list(range(1, total_pages + 1))

# Process all pages
result = process_pdf(pdf_filename, pages_to_process=pages_list)

# Save JSON
with open("headlines.json", "w") as f:
    json.dump(result, f, indent=2)

print(f"Extracted headlines from {total_pages} pages.")

"""

'\nfrom pdf2image import pdfinfo_from_path\n\n# Get total pages in PDF\ninfo = pdfinfo_from_path(pdf_filename)\ntotal_pages = info["Pages"]\n\n# Make list of all page numbers [1, 2, 3, ..., total_pages]\npages_list = list(range(1, total_pages + 1))\n\n# Process all pages\nresult = process_pdf(pdf_filename, pages_to_process=pages_list)\n\n# Save JSON\nwith open("headlines.json", "w") as f:\n    json.dump(result, f, indent=2)\n\nprint(f"Extracted headlines from {total_pages} pages.")\n\n'

In [13]:
with open("headlines.json", "w") as f:
    json.dump(result, f, indent=2)

files.download("headlines.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>