In [1]:
import os
import pytesseract
import fitz  # PyMuPDF
import pdfplumber
import subprocess
from pathlib import Path
from unstructured.partition.pdf import partition_pdf
from pdf2image import convert_from_path
from tempfile import TemporaryDirectory
import pandas as pd
import multiprocessing
import json


def is_scanned_pdf(pdf_path: str, sample_pages=2) -> bool:
    try:
        elements = partition_pdf(filename=pdf_path, strategy="fast", max_pages=sample_pages)
        text = "\n".join([el.text for el in elements if el.text])
        return len(text.strip()) < 50
    except Exception as e:
        print(f"[!] Unstructured failed to parse {pdf_path}: {e}")
        return True


def run_ocrmypdf(input_path: str, output_path: str):
    try:
        subprocess.run(["ocrmypdf", "--force-ocr", input_path, output_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"[!] ocrmypdf failed: {e}")
        return False


def extract_tables_from_ocr_pdf(pdf_path):
    tables = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                for table in page.extract_tables():
                    if table and any(cell for row in table for cell in row if cell and cell.strip()):
                        df = pd.DataFrame(table)
                        tables.append((i + 1, df))
    except Exception as e:
        print(f"[!] pdfplumber failed: {e}")
    return tables


def extract_text_from_native_pdf(pdf_path: str) -> str:
    elements = partition_pdf(filename=pdf_path, strategy="fast")
    return "\n".join([el.text for el in elements if el.text])


def extract_text_from_ocr_images(pdf_path):
    text = ""
    with TemporaryDirectory() as tempdir:
        images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
        for i, image in enumerate(images):
            ocr_result = pytesseract.image_to_string(image, config="--psm 6")
            text += f"\n\n--- Page {i+1} ---\n\n" + ocr_result
    return text


def get_pdf_metadata(pdf_path: str):
    try:
        doc = fitz.open(pdf_path)
        metadata = doc.metadata or {}
        metadata["page_count"] = len(doc)
        metadata["file_path"] = str(pdf_path)
        metadata["file_name"] = Path(pdf_path).name
        return metadata
    except Exception as e:
        print(f"[!] Failed to read metadata for {pdf_path}: {e}")
        return {"file_path": pdf_path}


def hybrid_pdf_processor(args):
    pdf_path, input_root, output_root = args
    relative_path = os.path.relpath(pdf_path, input_root)
    output_txt_path = os.path.join(output_root, Path(relative_path).with_suffix(".txt"))
    output_csv_dir = os.path.join(output_root, Path(relative_path).parent)
    output_meta_path = os.path.join(output_root, Path(relative_path).with_suffix(".meta.json"))

    os.makedirs(os.path.dirname(output_txt_path), exist_ok=True)

    result_text = ""
    try:
        if is_scanned_pdf(pdf_path):
            print(f"[OCR] Detected scanned PDF: {pdf_path}")
            ocr_output_pdf = pdf_path.replace(".pdf", "_ocr.pdf")
            if run_ocrmypdf(pdf_path, ocr_output_pdf):
                tables = extract_tables_from_ocr_pdf(ocr_output_pdf)
                if tables:
                    result_text = "[Extracted Tables]\n"
                    for page_num, df in tables:
                        result_text += f"\n\n--- Page {page_num} ---\n{df.to_csv(index=False)}"
                        csv_path = os.path.join(output_csv_dir, f"{Path(pdf_path).stem}_page{page_num}.csv")
                        df.to_csv(csv_path, index=False)
                else:
                    result_text = extract_text_from_ocr_images(pdf_path)
            else:
                result_text = extract_text_from_ocr_images(pdf_path)
        else:
            print(f"[Text] Detected native text PDF: {pdf_path}")
            result_text = extract_text_from_native_pdf(pdf_path)

        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(result_text)

        metadata = get_pdf_metadata(pdf_path)
        with open(output_meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"[✓] Processed: {pdf_path}")
    except Exception as e:
        print(f"[✗] Failed to process {pdf_path}: {e}")


from tqdm.contrib.concurrent import process_map  # Drop-in parallel map with progress bar
from multiprocessing import Pool
from tqdm import tqdm
import multiprocessing
def batch_process_pdfs_with_multiprocessing(input_root: str, output_root: str, num_workers=4):
    pdf_paths = []
    for dirpath, _, filenames in os.walk(input_root):
        for file in filenames:
            if file.lower().endswith(".pdf"):
                pdf_paths.append(os.path.join(dirpath, file))

    args_list = [(pdf, input_root, output_root) for pdf in pdf_paths]

    with Pool(processes=num_workers) as pool:
        with tqdm(total=len(args_list), desc="Processing PDFs") as pbar:
            for _ in pool.imap_unordered(hybrid_pdf_processor, args_list):
                pbar.update()


    



In [None]:

input_pdf_root = "/home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001"  # This folder contains nested folders
output_text_root = "/home/saranshvashistha/workspace/AIML-018-IITI-SoC/text_op"

batch_process_pdfs_with_multiprocessing(input_pdf_root, output_text_root, num_workers=32)


Processing PDFs:   0%|          | 0/1302 [00:00<?, ?it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/IITIPAN.pdf
[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/IITIPAN.pdf
[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/IITIPAN.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_ebsb.iiti.ac.in_/downloads/documents/ebsb.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/Mission_Vision_Objectives.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_ebsb.iiti.ac.in_/downloads/documents/ebsb.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_ugforms.php/downloads/documents/04072014Form-for-Selecting-BTP-Project.pdf


Processing PDFs:   0%|          | 1/1302 [00:03<1:07:32,  3.12s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:   0%|          | 2/1302 [00:03<33:21,  1.54s/it]  

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_ugforms.php/downloads/documents/04072014Form-for-Selecting-BTP-Project.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Corrigendum-Notification.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2020-21-Academic_Calendar - BTech 1st Year as on 3 March 2021.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Mission_Vision_Objectives.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2020-Autumn_AY-2020-21_Academic Calendar.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Corrigendum-Notification.pdf


Processing PDFs:   0%|          | 4/1302 [00:04<20:16,  1.07it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Mission_Vision_Objectives.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2020-21-Academic_Calendar - BTech 1st Year as on 3 March 2021.pdf


Processing PDFs:   0%|          | 6/1302 [00:04<11:23,  1.90it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Notification - Guidelines for utilization of Research grant Contingency of PMRF students - IITI (1).pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:   1%|          | 7/1302 [00:05<10:44,  2.01it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Final_20Draft_20Advtt._20September_202024_20-_20Hindi1.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Academic_Calendar_AY 2021-22.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2020-Autumn_AY-2020-21_Academic Calendar.pdf


Processing PDFs:   1%|          | 8/1302 [00:05<10:47,  2.00it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2018-19-Academic_Calendar - 02-07-18.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Answer books retention rules.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-0

Processing PDFs:   1%|          | 9/1302 [00:06<10:25,  2.07it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2016-17-Academic_Calender.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2017-18-Academic_Calender.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2018-19-Academic_Calendar - 02-07-18.pdf


Processing PDFs:   1%|          | 10/1302 [00:07<12:50,  1.68it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Guidelines for financial support for attending and presenting research papers in international and national conferences.pdf[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf



Start processing 3 pages concurrently
Processing PDFs:   1%|          | 11/1302 [00:07<10:01,  2.15it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_ugforms.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Guidelines for financial support for attending and presenting research papers in international and national conferences.pdf

Processing PDFs:   1%|          | 12/1302 [00:07<07:59,  2.69it/s]


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/27062014CSIR-Female-RA-maternity-leave.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Lw7kmBlBEmhiLGKFet0t.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf

Processing PDFs:   1%|          | 13/1302 [00:07<06:48,  3.15it/s]


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/27062014CSIR-Female-RA-maternity-leave.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Academic_Calendar_AY 2021-22.pdf


Processing PDFs:   1%|          | 15/1302 [00:07<05:15,  4.08it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/IIT-Indore-Coding_Scheme-for-PG+PhD-Courses.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Final_20Draft_20Advtt._20September_202024_20-_20Hindi1.pdf


Processing PDFs:   1%|          | 16/1302 [00:08<04:53,  4.38it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Final_20Draft_20Advtt._20November_202024_20-_20School_20of_20Innovation_20-_20Hindi..pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   1%|▏         | 17/1302 [00:08<04:25,  4.84it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Revised 2015-16-Academic_Calender.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2022-23_Academic_Calendar - Final - 23-06-2022.pdf[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf



Processing PDFs:   1%|▏         | 18/1302 [00:08<04:30,  4.74it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/14052013_2013-14_Academic_Calender.pdf[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Academic_Calendar-2020-21.pdf

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Revised_2023-24_Academic_Calendar - 05-07-2023.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2016-17-Academic_Calender.pdf


Processing PDFs:   1%|▏         | 19/1302 [00:08<03:59,  5.37it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/200620142014-15-Academic_Calender.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   2%|▏         | 20/1302 [00:09<06:09,  3.47it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2017-18-Academic_Calender.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Revised 2015-16-Academic_Calender.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Academic_Calendar-2020-21.pdf

Processing PDFs:   2%|▏         | 22/1302 [00:09<04:04,  5.23it/s]


[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_tpc/downloads/documents/IITIPAN.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   2%|▏         | 24/1302 [00:09<04:11,  5.09it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/FFE Notice_for_IIT.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/9july2013Mess Rules.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/14052013_2013-14_Academic_Calender.pdf[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/VS5yf5CIVpIITNt7qyZW.pdf



Processing PDFs:   2%|▏         | 25/1302 [00:09<04:53,  4.35it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/FFE Notice_for_IIT.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/200620142014-15-Academic_Calender.pdf


Processing PDFs:   2%|▏         | 27/1302 [00:10<03:41,  5.76it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/IIT-Indore-Coding_Scheme-for-PG+PhD-Courses.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_otherscholarship.php/downloads/documents/Answer books retention rules.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_otherscholarship.php/downloads/documents/Answer books retention rules.pdf


Processing PDFs:   2%|▏         | 29/1302 [00:10<03:23,  6.26it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2019-20-Academic_Calendar - IIT INDORE.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/9july2013Mess Rules.pdf


Processing PDFs:   2%|▏         | 30/1302 [00:10<03:48,  5.58it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/04abe9e1b272abb2df813eea2c44adbf.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2021-22-Academic_Calendar - BTech 1st Year(2).pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2022-23_Academic_Calendar - BTech 1St Year - Revised.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Hindi_20Shabdavali.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI

Processing PDFs:   2%|▏         | 31/1302 [00:11<08:22,  2.53it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/20201021 Policy for Appointment of Visiting Faculty Members.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/20201116 Policy for Appointment of Adjunct Faculty Members.pdf


Processing PDFs:   2%|▏         | 32/1302 [00:11<06:53,  3.07it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2021-22-Academic_Calendar - BTech 1st Year(2).pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2024-25_Academic Calendar_2024 BTech and Preparatory batch - 29-01-2025.pdf
[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/leave_Approving Authority.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_Leave.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/data

Processing PDFs:   3%|▎         | 34/1302 [00:12<06:38,  3.18it/s]    1 page already has text! - rasterizing text and running OCR anyway


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/20201021 Policy for Appointment of Visiting Faculty Members.pdf


Processing PDFs:   3%|▎         | 35/1302 [00:12<06:18,  3.35it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2022-23_Academic_Calendar - BTech 1St Year - Revised.pdf[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/Revised_2023-24_Academic_Calendar - 05-07-2023.pdf[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_LTC.pdf


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2019-20-Academic_Calendar - IIT INDORE.pdf

Processing PDFs:   3%|▎         | 36/1302 [00:13<06:03,  3.48it/s]


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2020-21-Academic_Calendar- for Existing Students as on 26 Nov. 2020 - Final.pdf
[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_Reservation.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/20201013 Policy for Appointment of Regular Faculty Members.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_PwD.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T1

Processing PDFs:   3%|▎         | 40/1302 [00:13<04:55,  4.28it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/17052013_Preparatory-Courses-of-Studies-2013(May).pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_Sexual Harrassment.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_LTC.pdf


Processing PDFs:   3%|▎         | 41/1302 [00:14<04:55,  4.26it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_PwD.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Lw7kmBlBEmhiLGKFet0t.pdf

Processing PDFs:   3%|▎         | 42/1302 [00:14<05:06,  4.10it/s]


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_EWS.pdf


Start processing 6 pages concurrently
    6 page already has text! - rasterizing text and running OCR anyway


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/20201116 Policy for Appointment of Adjunct Faculty Members.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/VS5yf5CIVpIITNt7qyZW.pdf


Processing PDFs:   3%|▎         | 44/1302 [00:15<07:06,  2.95it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/leave_Approving Authority.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/20201116 Policy for Appointment of Adjunct Faculty Members.pdf


Processing PDFs:   3%|▎         | 45/1302 [00:15<06:36,  3.17it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_Sexual Harrassment.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_CEA.pdf


Start processing 13 pages concurrently


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_EWS.pdf


Processing PDFs:   4%|▎         | 47/1302 [00:16<06:51,  3.05it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/FAQ_PwD.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/FAQ_PwD.pdf


Processing PDFs:   4%|▎         | 48/1302 [00:16<07:17,  2.87it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Awzw0GNiyEfYB76zfU7I.pdf


Processing PDFs:   4%|▍         | 49/1302 [00:17<07:24,  2.82it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/FAQ_Sexual Harrassment.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_ugforms.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf


Processing PDFs:   4%|▍         | 50/1302 [00:17<07:37,  2.74it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/FAQ_Sexual Harrassment.pdf


Processing PDFs:   4%|▍         | 51/1302 [00:17<06:23,  3.26it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_page_about-us/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/17052013_Preparatory-Courses-of-Studies-2013(May).pdf


Processing PDFs:   4%|▍         | 52/1302 [00:17<06:40,  3.12it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/Antiragging_Dr_RK_Raghvan.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/20201013 Policy for Appointment of Regular Faculty Members.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/Notice for DUGC and DPGC as on 11-Dec-2020-Updated.pdf

Processing PDFs:   4%|▍         | 53/1302 [00:18<05:47,  3.59it/s]


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:   4%|▍         | 54/1302 [00:19<10:11,  2.04it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/FAQ_CEA.pdf

Processing PDFs:   4%|▍         | 55/1302 [00:19<11:14,  1.85it/s]


[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/Notification - Guidelines for utilization of Research grant Contingency of PMRF students - IITI (1).pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2024-25_Academic Calendar_2024 BTech and Preparatory batch - 29-01-2025.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/Notice for DUGC and DPGC as on 11-Dec-2020-Updated.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/Guidelines for financial support for attending and pr

Processing PDFs:   4%|▍         | 58/1302 [00:20<06:03,  3.43it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/Guidelines for financial support for attending and presenting research papers in international and national conferences.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/27062014CSIR-Female-RA-maternity-leave.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/27062014CSIR-Female-RA-maternity-leave.pdf


Processing PDFs:   5%|▍         | 60/1302 [00:20<04:37,  4.48it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2024-25_Academic Calendar_Updated - 29-1-2025.pdf


Processing PDFs:   5%|▍         | 61/1302 [00:20<06:29,  3.18it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/IIT-Indore-SPGC+SUGC.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:   5%|▍         | 62/1302 [00:22<10:24,  1.99it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2021-Constitution of SEC (MCM and others).pdf[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/IIT-Indore-SPGC+SUGC.pdf

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/9july2013Mess Rules.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_page_about-us/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf

Processing PDFs:   5%|▍         | 64/1302 [00:22<08:15,  2.50it/s]


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/IIT-Indore-Coding_Scheme-for-PG+PhD-Courses.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/9july2013Mess Rules.pdf


Start processing 27 pages concurrently
Processing PDFs:   5%|▍         | 65/1302 [00:23<08:34,  2.41it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/FFE Notice_for_IIT.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_tpc/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/FFE Notice_for_IIT.pdf


Processing PDFs:   5%|▌         | 66/1302 [00:23<09:41,  2.13it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/Notice for Anti Ragging and Anti Harassment Committee - 2019.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/IIT-Indore-Organization-Structure-for-Academic-Issues-Oct2016.pdf


Start processing 3 pages concurrently


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2025-Jan-PG-PhD-Rules+Regulations Dated 09.01.2025.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:   5%|▌         | 67/1302 [00:25<14:55,  1.38it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_hindi_former_directors/downloads/documents/IITIPAN.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/IIT-Indore-Organization-Structure-for-Academic-Issues-Oct2016.pdf


Processing PDFs:   5%|▌         | 68/1302 [00:25<14:54,  1.38it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/17052013_Preparatory-Courses-of-Studies-2013(May).pdf


    1 page already has text! - rasterizing text and running OCR anyway


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/IIT-Indore-Coding_Scheme-for-PG+PhD-Courses.pdf[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2024-25_Academic Calendar_Updated - 29-1-2025.pdf

Processing PDFs:   5%|▌         | 69/1302 [00:28<24:22,  1.19s/it]



[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_page_about-us/downloads/documents/Hindi_20Shabdavali.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_hindi_former_directors/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_hindi_former_directors/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   5%|▌         | 70/1302 [00:30<28:40,  1.40s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2024-25_Academic Calendar_2024 BTech and Preparatory batch - 29-01-2025.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/IT ACT, 1961 including amendments.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/17052013_Preparatory-Courses-of-Studies-2013(May).pdf


Processing PDFs:   5%|▌         | 71/1302 [00:34<42:25,  2.07s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf


Processing PDFs:   6%|▌         | 72/1302 [00:35<40:50,  1.99s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2024-25_Academic Calendar_Updated - 29-1-2025.pdf


Processing PDFs:   6%|▌         | 73/1302 [00:38<43:46,  2.14s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_page_about-us/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:   6%|▌         | 74/1302 [00:38<34:06,  1.67s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Instructions for Students.pdf


    1    **** Error: stream operator isn't terminated by valid EOL.
               Output may be incorrect.
   **** Error: stream operator isn't terminated by valid EOL.
               Output may be incorrect.



[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_tpc/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:   6%|▌         | 75/1302 [00:40<32:09,  1.57s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2024-25_Academic Calendar_2024 BTech and Preparatory batch - 29-01-2025.pdf


Processing PDFs:   6%|▌         | 76/1302 [00:41<30:56,  1.51s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Instructions for Students.pdf


Processing PDFs:   6%|▌         | 77/1302 [00:41<23:47,  1.17s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Notification - Guidelines for utilization of Research grant Contingency of PMRF students - IITI (1).pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2024-25_Academic Calendar_Updated - 29-1-2025.pdf


Start processing 3 pages concurrently


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/IIT-Indore-Coding_Scheme-for-PG+PhD-Courses.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/IIT-Indore-Coding_Scheme-for-PG+PhD-Courses.pdf


Processing PDFs:   6%|▌         | 78/1302 [00:45<41:28,  2.03s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Answer books retention rules.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Answer books retention rules.pdf


Processing PDFs:   6%|▌         | 79/1302 [00:46<31:53,  1.56s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Income Certifivcate Issuing Authority in various states or Union Terrirories .pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2025-Jan-PG-PhD-Rules+Regulations Dated 09.01.2025.pdf


    1    **** Error: stream operator isn't terminated by valid EOL.
               Output may be incorrect.
   **** Error: stream operator isn't terminated by valid EOL.
               Output may be incorrect.



[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2024-25_Academic Calendar_2024 BTech and Preparatory batch - 29-01-2025.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf


Processing PDFs:   6%|▌         | 80/1302 [00:49<43:33,  2.14s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Guidelines for financial support for attending and presenting research papers in international and national conferences.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Guidelines for financial support for attending and presenting research papers in international and national conferences.pdf


Processing PDFs:   6%|▌         | 81/1302 [00:50<32:09,  1.58s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/27062014CSIR-Female-RA-maternity-leave.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/27062014CSIR-Female-RA-maternity-leave.pdf


Start processing 2 pages concurrently
Processing PDFs:   6%|▋         | 82/1302 [00:50<24:59,  1.23s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/IT ACT, 1961 including amendments.pdf


Processing PDFs:   6%|▋         | 83/1302 [00:52<27:49,  1.37s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/9july2013Mess Rules.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/9july2013Mess Rules.pdf


Processing PDFs:   6%|▋         | 84/1302 [00:52<23:21,  1.15s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2024-25_Academic Calendar_Updated - 29-1-2025.pdf


Processing PDFs:   7%|▋         | 85/1302 [00:53<19:18,  1.05it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/NSP_SOP for CSS Scholarship.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/FFE Notice_for_IIT.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/FFE Notice_for_IIT.pdf


Processing PDFs:   7%|▋         | 86/1302 [00:54<20:00,  1.01it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/IITIPAN.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2025-Jan-PG-PhD-Rules+Regulations Dated 09.01.2025.pdf


Processing PDFs:   7%|▋         | 87/1302 [00:55<19:26,  1.04it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/17052013_Preparatory-Courses-of-Studies-2013(May).pdf


    1 page already has text! - rasterizing text and running OCR anyway


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/20230406_Faculty Handbook_IITI_6april2023.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2025-Jan-PG-PhD-Rules+Regulations Dated 09.01.2025.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/17052013_Preparatory-Courses-of-Studies-2013(May).pdf


Processing PDFs:   7%|▋         | 88/1302 [01:00<45:04,  2.23s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   7%|▋         | 89/1302 [01:01<34:07,  1.69s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/Mission_Vision_Objectives.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2024-25_Academic Calendar_2024 BTech and Preparatory batch - 29-01-2025.pdf


Processing PDFs:   7%|▋         | 90/1302 [01:02<35:41,  1.77s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_library.iiti.ac.in_page_id=37/downloads/documents/OPAC-Guide.pdf


    6 [tesseract] lots of diacritics - possibly poor OCR


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_library.iiti.ac.in_page_id=37/downloads/documents/OPAC-Guide.pdf


Processing PDFs:   7%|▋         | 91/1302 [01:04<31:28,  1.56s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:   7%|▋         | 92/1302 [01:04<24:34,  1.22s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/IITIPAN.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_library.iiti.ac.in_page_id=37/downloads/documents/My_Account_Koha_OPAC.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf

Processing PDFs:   7%|▋         | 93/1302 [01:05<22:01,  1.09s/it]


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_library.iiti.ac.in_page_id=37/downloads/documents/My_Account_Koha_OPAC.pdf


Processing PDFs:   7%|▋         | 94/1302 [01:05<16:05,  1.25it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/NSP_SOP for CSS Scholarship.pdf


Processing PDFs:   7%|▋         | 95/1302 [01:06<20:48,  1.03s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2025-Jan-PG-PhD-Rules+Regulations Dated 09.01.2025.pdf[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


    1 page already has text! - rasterizing text and running OCR anyway
Processing PDFs:   7%|▋         | 96/1302 [01:07<19:00,  1.06it/s]


[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/IITIPAN_ocr.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:   7%|▋         | 97/1302 [01:08<18:22,  1.09it/s]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Mission_Vision_Objectives.pdf


    1 page already has text! - rasterizing text and running OCR anyway
Processing PDFs:   8%|▊         | 98/1302 [01:09<15:41,  1.28it/s]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_office_of_registrar/downloads/documents/IITIPAN.pdf


Postprocessing...


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   8%|▊         | 99/1302 [01:13<35:44,  1.78s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_office_of_registrar/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/IITI Statutes 04102017.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_office_of_registrar/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   8%|▊         | 100/1302 [01:17<50:04,  2.50s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/IITIPAN.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2025-Jan-UG Rules & Policy Document dated 09.01.2024.pdf


Processing PDFs:   8%|▊         | 101/1302 [01:18<39:42,  1.98s/it]    1 page already has text! - rasterizing text and running OCR anyway


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Hindi_20Shabdavali.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   8%|▊         | 102/1302 [01:26<1:16:53,  3.84s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2025-Jan-PG-PhD-Rules+Regulations Dated 09.01.2025.pdf


Processing PDFs:   8%|▊         | 103/1302 [01:27<1:00:41,  3.04s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:   8%|▊         | 104/1302 [01:29<53:55,  2.70s/it]  

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/Hindi_20Shabdavali.pdf[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Mission_Vision_Objectives.pdf



Processing PDFs:   8%|▊         | 105/1302 [01:29<40:43,  2.04s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/IITIPAN.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/20230406_Faculty Handbook_IITI_6april2023.pdf


Processing PDFs:   8%|▊         | 107/1302 [01:35<46:07,  2.32s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:   8%|▊         | 108/1302 [01:36<42:08,  2.12s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:   8%|▊         | 109/1302 [01:37<35:14,  1.77s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/MDPAI3W8pefUUaGF7VKL.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/MDPAI3W8pefUUaGF7VKL.pdf


Processing PDFs:   8%|▊         | 110/1302 [01:38<28:53,  1.45s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:   9%|▊         | 111/1302 [01:38<22:20,  1.13s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/GbtJoWPUFZDUgRuOioMp.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/GbtJoWPUFZDUgRuOioMp.pdf


Processing PDFs:   9%|▊         | 112/1302 [01:38<17:27,  1.14it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/IITI EMPANELLED HOSPITALS.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/IITI EMPANELLED HOSPITALS.pdf


Processing PDFs:   9%|▊         | 113/1302 [01:39<17:01,  1.16it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/details_ghi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/details_ghi.pdf


Processing PDFs:   9%|▉         | 114/1302 [01:40<19:38,  1.01it/s]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/Information for submission of the form for Best All-Round Performance Award 2024.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/Information for submission of the form for Best All-Round Performance Award 2024.pdf


Processing PDFs:   9%|▉         | 115/1302 [01:42<24:44,  1.25s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Hindi_20Shabdavali.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/2024 Nomination form for Best All-rounder.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:   9%|▉         | 116/1302 [01:50<1:01:19,  3.10s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_acefoundation.iiti.ac.in_/downloads/documents/Incubation Policy updated Final Jan 2023.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_studentaffairs.iiti.ac.in_people.php/downloads/documents/2024 Nomination form for Best All-rounder.pdf


Processing PDFs:   9%|▉         | 117/1302 [01:57<1:27:53,  4.45s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_acefoundation.iiti.ac.in_/downloads/documents/IITI ACE September Brochure.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:   9%|▉         | 118/1302 [01:58<1:06:42,  3.38s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_acefoundation.iiti.ac.in_/downloads/documents/Incubation Policy updated Final Jan 2023.pdf


Processing PDFs:   9%|▉         | 119/1302 [01:58<47:55,  2.43s/it]  

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_acefoundation.iiti.ac.in_/downloads/documents/Seed Fund - Startup Application Guide (2).pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_acefoundation.iiti.ac.in_/downloads/documents/Seed Fund - Startup Application Guide (2).pdf


Processing PDFs:   9%|▉         | 120/1302 [02:00<41:58,  2.13s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/CorMakers.PDF
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/IITI Statutes 04102017.pdf


Processing PDFs:   9%|▉         | 121/1302 [02:02<40:26,  2.05s/it]Start processing 4 pages concurrently


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_acefoundation.iiti.ac.in_/downloads/documents/Guidelines-Startup India Seed Fund Scheme.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Food_outlet.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_acefoundation.iiti.ac.in_/downloads/documents/Guidelines-Startup India Seed Fund Scheme.pdf


Processing PDFs:   9%|▉         | 122/1302 [02:08<1:05:52,  3.35s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Xerox_20Scan_28052025161143.PDF
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Food_outlet.pdf


Processing PDFs:   9%|▉         | 123/1302 [02:09<50:18,  2.56s/it]  

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/005-Prebid.PDF


Start processing 2 pages concurrently
Optimize ratio: 1.06 savings: 5.4%
Output file is a PDF/A-2B (as expected)
Postprocessing...
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Optimize ratio: 1.49 savings: 33.0%
Output file is a PDF/A-2B (as expected)
Postprocessing...
Postprocessing...
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Optimize ratio: 1.42 savings: 29.8%
Output file is a PDF/A-2B (as expected)
Postprocessing...
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Optimize ratio: 1.42 savings: 29.8%
Output file is a PDF/A-2B (as expected)
Optimize ratio: 1.42 savings: 29.8%
The output file siz

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/IITIPAN_ocr.pdf


Processing PDFs:  10%|▉         | 124/1302 [03:52<10:42:46, 32.74s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/IITIPAN.pdf


    2 [tesseract] took too long to OCR - skipping
    1 [tesseract] took too long to OCR - skipping
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Optimize ratio: 1.28 savings: 21.7%
Output file is a PDF/A-2B (as expected)
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
    3 [tesseract] took too long to OCR - skipping
The output file size is 4.94× larger than the input file.
Possible reasons for this include:
The argument --force-ocr was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
PDF/A conversion was enabled. (Try `--output-type pdf`.)

Postprocessing...
    1 page already has text! - rasterizing text and running OCR anyway
Optimize ratio: 1.00 savings: 0.3%
Output file is a PDF/A-2B (as expected)
Optimize ratio: 1.29 savings: 22.6%
Output file is a PDF/A-2B (as expected)
Some input metadata could

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2024-July-PG-PhD-Curriculum+Syllabi-of-Courses 27.12.2024.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2024-July-PG-PhD-Curriculum+Syllabi-of-Courses 27.12.2024.pdf


The output file size is 1.84× larger than the input file.
Possible reasons for this include:
The argument --force-ocr was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
PDF/A conversion was enabled. (Try `--output-type pdf`.)



[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2024-July-PG-PhD-Curriculum+Syllabi-of-Courses 27.12.2024.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_tpc/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|▉         | 125/1302 [10:26<46:06:13, 141.01s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/PrebidCFDCOutlet02.PDF
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_hindi_former_directors/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|▉         | 126/1302 [11:29<38:26:55, 117.70s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/10-Pre-Bid.PDF
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2025-Jan-UG-Curriculum+Syllabi-of-Courses  dated 10.01.2024.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_former-heads-of-ee/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|▉         | 127/1302 [12:13<31:09:28, 95.46s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2025-Jan-UG-Curriculum+Syllabi-of-Courses  dated 10.01.2024.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/D&CCPPP.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/D&CCPPP.pdf


Processing PDFs:  10%|▉         | 128/1302 [12:27<23:10:40, 71.07s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:  10%|▉         | 129/1302 [12:38<17:16:18, 53.01s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Xerox_20Scan_28022025160039.PDF
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2025-Jan-UG-Curriculum+Syllabi-of-Courses  dated 10.01.2024.pdf


Start processing 2 pages concurrently


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_office_of_registrar/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|▉         | 130/1302 [13:24<16:37:09, 51.05s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/3PfoVX2K6id2Bl17ONJC.PDF


Start processing 32 pages concurrently
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.24 savings: 19.0%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_faculty-positions/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|█         | 131/1302 [13:47<13:52:40, 42.67s/it]   17 [tesseract] lots of diacritics - possibly poor OCR


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:  10%|█         | 132/1302 [13:58<10:46:46, 33.17s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:  10%|█         | 133/1302 [14:03<8:01:13, 24.70s/it] 

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Jo1MYRc1JcBF0GXoFPCC.PDF


Start processing 30 pages concurrently
   20 [tesseract] lots of diacritics - possibly poor OCR
    8 [tesseract] lots of diacritics - possibly poor OCR
   18 [tesseract] lots of diacritics - possibly poor OCR
    3 [tesseract] lots of diacritics - possibly poor OCR
   13 [tesseract] lots of diacritics - possibly poor OCR
   15 [tesseract] lots of diacritics - possibly poor OCR
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Postprocessing...
Optimize ratio: 1.17 savings: 14.3%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_project-positions/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|█         | 134/1302 [14:56<10:43:20, 33.05s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/005-FireExti.PDF


   11 [tesseract] lots of diacritics - possibly poor OCR
Start processing 31 pages concurrently
    5 [tesseract] lots of diacritics - possibly poor OCR
   20 [tesseract] lots of diacritics - possibly poor OCR
    1 [tesseract] lots of diacritics - possibly poor OCR
Postprocessing...
    1 [tesseract] took too long to OCR - skipping
    2 [tesseract] took too long to OCR - skipping
Postprocessing...
   29 [tesseract] lots of diacritics - possibly poor OCR
Optimize ratio: 1.04 savings: 3.4%
Output file is a PDF/A-2B (as expected)
Optimize ratio: 1.16 savings: 13.5%
Output file is a PDF/A-2B (as expected)
   15 [tesseract] lots of diacritics - possibly poor OCR


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_page_dst-fist-center-of-excellence-in-gear-engineering/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|█         | 135/1302 [16:04<14:11:28, 43.78s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/OLUQBeqDUK7PN2kYMAd5.PDF


   14 [tesseract] lots of diacritics - possibly poor OCR
Start processing 18 pages concurrently
Postprocessing...
Postprocessing...
Optimize ratio: 1.04 savings: 3.5%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_former_directors/downloads/documents/IITIPAN.pdf


Processing PDFs:  10%|█         | 136/1302 [17:07<15:59:38, 49.38s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/OutletPrebid.PDF


Start processing 2 pages concurrently
Optimize ratio: 1.07 savings: 6.9%
Output file is a PDF/A-2B (as expected)
Optimize ratio: 1.06 savings: 5.9%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2024-July-PG-PhD-Curriculum+Syllabi-of-Courses 27.12.2024.pdf


Processing PDFs:  11%|█         | 137/1302 [17:19<12:23:45, 38.31s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/010EmpanelHotel.PDF


Start processing 30 pages concurrently
   22 [tesseract] lots of diacritics - possibly poor OCR
   18 [tesseract] lots of diacritics - possibly poor OCR


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2024-July-PG-PhD-Curriculum+Syllabi-of-Courses 27.12.2024.pdf


Processing PDFs:  11%|█         | 138/1302 [17:43<10:59:25, 33.99s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/VUqEO7ieT21aEjOtL7tP.pdf


   19 [tesseract] lots of diacritics - possibly poor OCR
   20 [tesseract] lots of diacritics - possibly poor OCR


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_center-for-indian-scientific-knowledge-systems/downloads/documents/IITIPAN.pdf


Processing PDFs:  11%|█         | 139/1302 [18:01<9:22:43, 29.03s/it] 

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/UpxnacxnCeL5xIDAknFy.PDF


Start processing 30 pages concurrently
   14 [tesseract] lots of diacritics - possibly poor OCR
   30 [tesseract] lots of diacritics - possibly poor OCR
   18 [tesseract] lots of diacritics - possibly poor OCR


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/VUqEO7ieT21aEjOtL7tP.pdf


Processing PDFs:  11%|█         | 140/1302 [18:17<8:06:07, 25.10s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Xerox_20Scan_13032025151501.PDF


   24 [tesseract] lots of diacritics - possibly poor OCR
   22 [tesseract] lots of diacritics - possibly poor OCR
   19 [tesseract] lots of diacritics - possibly poor OCR
   20 [tesseract] lots of diacritics - possibly poor OCR
   24 [tesseract] lots of diacritics - possibly poor OCR
   14 [tesseract] lots of diacritics - possibly poor OCR
   30 [tesseract] lots of diacritics - possibly poor OCR
Postprocessing...
Postprocessing...


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_administrative-staff-latest/downloads/documents/IITIPAN.pdf


Processing PDFs:  11%|█         | 141/1302 [19:01<9:54:41, 30.73s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/0VDtgn8X2yTgFMIC3vmJ.PDF


Start processing 32 pages concurrently
Optimize ratio: 1.00 savings: -0.4%
Image optimization did not improve the file - optimizations will not be used
Optimize ratio: 1.00 savings: -0.4%
Image optimization did not improve the file - optimizations will not be used
Output file is a PDF/A-2B (as expected)
Output file is a PDF/A-2B (as expected)
   32 [tesseract] lots of diacritics - possibly poor OCR
   17 [tesseract] lots of diacritics - possibly poor OCR
    1 [tesseract] took too long to OCR - skipping
    2 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.36 savings: 26.7%
Output file is a PDF/A-2B (as expected)
Postprocessing...
Optimize ratio: 1.04 savings: 3.7%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2024-July-PG-PhD-Curriculum+Syllabi-of-Courses 27.12.2024.pdf


Processing PDFs:  11%|█         | 142/1302 [21:35<21:51:13, 67.82s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/005-Corrigendum.PDF


    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.24 savings: 19.5%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/IITIPAN.pdf


Processing PDFs:  11%|█         | 143/1302 [21:56<17:21:02, 53.89s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/CPPMaker.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/CPPMaker.pdf


Processing PDFs:  11%|█         | 144/1302 [22:07<13:12:07, 41.04s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/DCPrebid.PDF


Start processing 4 pages concurrently
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.24 savings: 19.6%
Output file is a PDF/A-2B (as expected)
    3 [tesseract] took too long to OCR - skipping
    4 [tesseract] took too long to OCR - skipping
    2 [tesseract] took too long to OCR - skipping
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.15 savings: 13.3%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_academic_cal.php/downloads/documents/2025-Jan-UG-Curriculum+Syllabi-of-Courses  dated 10.01.2024.pdf


Processing PDFs:  11%|█         | 145/1302 [25:33<29:02:12, 90.35s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/t682Wee2sfSvLzRkDP1k.PDF


Start processing 32 pages concurrently


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/2025-Jan-UG-Curriculum+Syllabi-of-Courses  dated 10.01.2024.pdf
[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Corrigendum1.pdf

Processing PDFs:  11%|█         | 146/1302 [26:14<24:18:28, 75.70s/it]




   45 [tesseract] lots of diacritics - possibly poor OCR
Postprocessing...


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2025-Jan-UG-Curriculum+Syllabi-of-Courses  dated 10.01.2024.pdf


Processing PDFs:  11%|█▏        | 147/1302 [27:30<24:16:17, 75.65s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/CPPPCFDC02.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/CPPPCFDC02.pdf


Processing PDFs:  11%|█▏        | 148/1302 [27:40<17:59:02, 56.10s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:  11%|█▏        | 149/1302 [28:13<15:42:46, 49.06s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/YOaajKX5SxDAvod1WIQQ.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/YOaajKX5SxDAvod1WIQQ.pdf


Processing PDFs:  12%|█▏        | 150/1302 [28:15<11:10:05, 34.90s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/nOwNBsIidTkMH32ysuMP.pdf


Optimize ratio: 1.04 savings: 3.5%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/nOwNBsIidTkMH32ysuMP.pdf


Processing PDFs:  12%|█▏        | 151/1302 [28:19<8:10:14, 25.56s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/jkrqltCKNuwH6qX9p5Ds.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/jkrqltCKNuwH6qX9p5Ds.pdf


Processing PDFs:  12%|█▏        | 152/1302 [28:20<5:48:57, 18.21s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ReLAu5tiPQLoFheRbvPU.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ReLAu5tiPQLoFheRbvPU.pdf


Processing PDFs:  12%|█▏        | 153/1302 [28:21<4:13:55, 13.26s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/b3cFyvi78tbi34Rymw1r.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/b3cFyvi78tbi34Rymw1r.pdf


Processing PDFs:  12%|█▏        | 154/1302 [28:40<4:44:10, 14.85s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/syMBET4NTUXcqePgyrnt.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/syMBET4NTUXcqePgyrnt.pdf


Processing PDFs:  12%|█▏        | 155/1302 [28:43<3:35:11, 11.26s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wffkGgaFStKmSsBcM5H7.pdf


    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.22 savings: 18.1%
Output file is a PDF/A-2B (as expected)
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.49 savings: 33.0%
Output file is a PDF/A-2B (as expected)
The output file size is 4.67× larger than the input file.
Possible reasons for this include:
The argument --force-ocr was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
PDF/A conversion was enabled. (Try `--output-type pdf`.)



[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Xerox_20Scan_28052025161143.PDF


Processing PDFs:  12%|█▏        | 156/1302 [32:06<21:56:32, 68.93s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/bfWBpg2VTN85SlsXUTpE.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/bfWBpg2VTN85SlsXUTpE.pdf


Processing PDFs:  12%|█▏        | 157/1302 [32:13<15:57:48, 50.19s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/VqhZ3YlgrobrEP0Mhqrl.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/VqhZ3YlgrobrEP0Mhqrl.pdf


Processing PDFs:  12%|█▏        | 158/1302 [32:14<11:17:38, 35.54s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/OikX1a0dOiUW1p3pwe9E.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/OikX1a0dOiUW1p3pwe9E.pdf


Processing PDFs:  12%|█▏        | 159/1302 [32:19<8:19:20, 26.21s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/lFcxiKKWebf6pPhgZbio.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/lFcxiKKWebf6pPhgZbio.pdf


Processing PDFs:  12%|█▏        | 160/1302 [32:21<6:03:48, 19.11s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/CfWcQ9MyBx6rdkzcJ8MH.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/CfWcQ9MyBx6rdkzcJ8MH.pdf


Processing PDFs:  12%|█▏        | 161/1302 [32:22<4:18:53, 13.61s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/FSOyjnsp8xrSeBgW1GYR.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/FSOyjnsp8xrSeBgW1GYR.pdf


Processing PDFs:  12%|█▏        | 162/1302 [32:22<3:04:03,  9.69s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/fS8lpZSlYmVJcBUicOov.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/fS8lpZSlYmVJcBUicOov.pdf


Processing PDFs:  13%|█▎        | 163/1302 [32:25<2:22:04,  7.48s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/KepmQC0aaXkwoemzFcpx.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/KepmQC0aaXkwoemzFcpx.pdf


Processing PDFs:  13%|█▎        | 164/1302 [32:37<2:48:46,  8.90s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/oe1Qy0j8rLLfR2rPFHiU.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/oe1Qy0j8rLLfR2rPFHiU.pdf


Processing PDFs:  13%|█▎        | 165/1302 [32:40<2:14:09,  7.08s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Z4Mv4uxS2Hh6oUJmzGpc.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Z4Mv4uxS2Hh6oUJmzGpc.pdf


Processing PDFs:  13%|█▎        | 166/1302 [32:41<1:41:22,  5.35s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/K5jxz9DRzYTpUCJMXE4q.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/K5jxz9DRzYTpUCJMXE4q.pdf


Processing PDFs:  13%|█▎        | 167/1302 [32:44<1:25:54,  4.54s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Application_20Format_20in_20DOC_EM.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Application_20Format_20in_20DOC_EM.pdf


Processing PDFs:  13%|█▎        | 168/1302 [32:47<1:20:00,  4.23s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ELRE494dYwI52jBxWGc0.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ELRE494dYwI52jBxWGc0.pdf


Processing PDFs:  13%|█▎        | 169/1302 [32:51<1:15:18,  3.99s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/qNqsQRiyVnXiWadl5Lrb.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/qNqsQRiyVnXiWadl5Lrb.pdf


Processing PDFs:  13%|█▎        | 170/1302 [33:07<2:27:23,  7.81s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:  13%|█▎        | 171/1302 [33:09<1:53:51,  6.04s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/CNBNsq75rVIOotskMdUB.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/CNBNsq75rVIOotskMdUB.pdf


Processing PDFs:  13%|█▎        | 172/1302 [33:11<1:27:18,  4.64s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/JDHPthCmI1XLniE1miyx.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/JDHPthCmI1XLniE1miyx.pdf


Processing PDFs:  13%|█▎        | 173/1302 [33:15<1:23:37,  4.44s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/KJDC5NFBT8F12Eud6tLp.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/KJDC5NFBT8F12Eud6tLp.pdf


Processing PDFs:  13%|█▎        | 174/1302 [33:38<3:08:46, 10.04s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/rESMfsjlSfmPtywuQImb.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/rESMfsjlSfmPtywuQImb.pdf


Processing PDFs:  13%|█▎        | 175/1302 [34:03<4:32:58, 14.53s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/6wW966mr1iE5IzAhO9HX.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/6wW966mr1iE5IzAhO9HX.pdf


Processing PDFs:  14%|█▎        | 176/1302 [34:06<3:27:48, 11.07s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/grpmFkAxvLxfY7st3hTY.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/grpmFkAxvLxfY7st3hTY.pdf


Processing PDFs:  14%|█▎        | 177/1302 [34:07<2:29:50,  7.99s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/rmQghQOMr5JCbjsnGAMP.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/rmQghQOMr5JCbjsnGAMP.pdf


Processing PDFs:  14%|█▎        | 178/1302 [34:29<3:52:37, 12.42s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/nmP5sYQ7almkjqaJWlKW.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/nmP5sYQ7almkjqaJWlKW.pdf


Processing PDFs:  14%|█▎        | 179/1302 [34:31<2:54:14,  9.31s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/UIvjKXQ5RTXCu0387aIO.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/CorMakers.PDF


Processing PDFs:  14%|█▍        | 180/1302 [37:03<16:12:07, 51.99s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/5f6HgBiGH72RsEuZXSzn.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/5f6HgBiGH72RsEuZXSzn.pdf


Processing PDFs:  14%|█▍        | 181/1302 [37:06<11:36:26, 37.28s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/rTiMO1dNbs872AR3TNtn.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/rTiMO1dNbs872AR3TNtn.pdf


Processing PDFs:  14%|█▍        | 182/1302 [37:07<8:13:28, 26.44s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/nlwZItYi6gMim7duvPAM.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/nlwZItYi6gMim7duvPAM.pdf


Processing PDFs:  14%|█▍        | 183/1302 [37:14<6:22:44, 20.52s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/3f9x2cJiDiRQEnnhS20P.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/3f9x2cJiDiRQEnnhS20P.pdf


Processing PDFs:  14%|█▍        | 184/1302 [37:15<4:34:13, 14.72s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MjBkRY4QNgELOW54Ctk8.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MjBkRY4QNgELOW54Ctk8.pdf


Processing PDFs:  14%|█▍        | 185/1302 [37:19<3:36:59, 11.66s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MwcOMlJt1U8RiL3oKnup.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MwcOMlJt1U8RiL3oKnup.pdf


Processing PDFs:  14%|█▍        | 186/1302 [37:20<2:35:47,  8.38s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:  14%|█▍        | 187/1302 [37:25<2:17:52,  7.42s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/c0l4uiDphdYTpQd5mZNQ.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/c0l4uiDphdYTpQd5mZNQ.pdf


Processing PDFs:  14%|█▍        | 188/1302 [37:27<1:47:49,  5.81s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Feug6Y1Ms7tA4Ay6Y4aT.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Feug6Y1Ms7tA4Ay6Y4aT.pdf


Processing PDFs:  15%|█▍        | 189/1302 [37:30<1:28:04,  4.75s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MhQbhQcQJyKeJQ2Qi71Q.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MhQbhQcQJyKeJQ2Qi71Q.pdf


Processing PDFs:  15%|█▍        | 190/1302 [37:30<1:05:09,  3.52s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ct0bPzxFREBtWDAm0XP2.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ct0bPzxFREBtWDAm0XP2.pdf


Processing PDFs:  15%|█▍        | 191/1302 [37:31<49:12,  2.66s/it]  

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/z4R9D4aILvDyr1gsPiBl.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/z4R9D4aILvDyr1gsPiBl.pdf


Processing PDFs:  15%|█▍        | 192/1302 [37:34<52:37,  2.84s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/SXKX8LomqiDF3OcITyAq.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/SXKX8LomqiDF3OcITyAq.pdf


Processing PDFs:  15%|█▍        | 193/1302 [37:35<40:02,  2.17s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/AXuFNe6Dg4ib5yNDl04k.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/AXuFNe6Dg4ib5yNDl04k.pdf


Processing PDFs:  15%|█▍        | 194/1302 [37:35<30:58,  1.68s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/6QKzBwlZCGbLZufCilF1.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/6QKzBwlZCGbLZufCilF1.pdf


Processing PDFs:  15%|█▍        | 195/1302 [37:38<36:12,  1.96s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/zrfqz0K3j6BS21pYcJdO.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/zrfqz0K3j6BS21pYcJdO.pdf


Processing PDFs:  15%|█▌        | 196/1302 [37:40<36:09,  1.96s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/kGfDOO39bkIygSByChFv.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/kGfDOO39bkIygSByChFv.pdf


Processing PDFs:  15%|█▌        | 197/1302 [37:41<29:51,  1.62s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/8y9aruppw4BUDppOuUPI.pdf


    1 [tesseract] took too long to OCR - skipping
Postprocessing...


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/8y9aruppw4BUDppOuUPI.pdf


Processing PDFs:  15%|█▌        | 198/1302 [37:43<34:47,  1.89s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/usH88golk5hCVCtMAI3x.pdf


Optimize ratio: 1.47 savings: 31.9%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/usH88golk5hCVCtMAI3x.pdf


Processing PDFs:  15%|█▌        | 199/1302 [37:44<26:49,  1.46s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/CkAbVzCfb4j04kAUi1zY.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/CkAbVzCfb4j04kAUi1zY.pdf


The output file size is 4.44× larger than the input file.
Possible reasons for this include:
The argument --force-ocr was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
PDF/A conversion was enabled. (Try `--output-type pdf`.)

Processing PDFs:  15%|█▌        | 200/1302 [37:44<22:12,  1.21s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/9SwIGucTBepe7asJBQrm.pdf


Start processing 3 pages concurrently
    1 [tesseract] took too long to OCR - skipping
    3 [tesseract] took too long to OCR - skipping
    2 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.00 savings: 0.0%
Output file is a PDF/A-2B (as expected)
The output file size is 1.80× larger than the input file.
Possible reasons for this include:
The argument --force-ocr was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
PDF/A conversion was enabled. (Try `--output-type pdf`.)



[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/PrebidCFDCOutlet02.PDF


Processing PDFs:  15%|█▌        | 201/1302 [43:38<32:40:30, 106.84s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/uUEAZRDjy9j7UUgWVFG6.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/uUEAZRDjy9j7UUgWVFG6.pdf


Processing PDFs:  16%|█▌        | 202/1302 [43:41<23:08:37, 75.74s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/czTS6QO5eZRbnLBCvOTa.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/czTS6QO5eZRbnLBCvOTa.pdf


Processing PDFs:  16%|█▌        | 203/1302 [43:42<16:15:20, 53.25s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/SYEiGqvI1k9xxDBaEyFP.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/SYEiGqvI1k9xxDBaEyFP.pdf


Processing PDFs:  16%|█▌        | 204/1302 [43:44<11:33:20, 37.89s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ZVHdszZYHSrBXNwyeX47.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ZVHdszZYHSrBXNwyeX47.pdf


Processing PDFs:  16%|█▌        | 205/1302 [43:46<8:15:15, 27.09s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/HO03BJQwnP65OdvkmuFu.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/HO03BJQwnP65OdvkmuFu.pdf


Processing PDFs:  16%|█▌        | 206/1302 [43:46<5:47:44, 19.04s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/5JejzLN7j6bUSvg3Ruit.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/5JejzLN7j6bUSvg3Ruit.pdf


Processing PDFs:  16%|█▌        | 207/1302 [43:49<4:20:40, 14.28s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/jgN7kdxXVlmS0LTY8Nd2.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/jgN7kdxXVlmS0LTY8Nd2.pdf


Processing PDFs:  16%|█▌        | 208/1302 [43:50<3:09:04, 10.37s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/kj8gU9d3nAjN3HLUlVdg.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/kj8gU9d3nAjN3HLUlVdg.pdf


Processing PDFs:  16%|█▌        | 209/1302 [43:57<2:49:57,  9.33s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wyhcPrKYwextFdU5BeyF.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wyhcPrKYwextFdU5BeyF.pdf


Processing PDFs:  16%|█▌        | 210/1302 [44:00<2:13:34,  7.34s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/KTlDd6l4JCEVDrHZHIne.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/KTlDd6l4JCEVDrHZHIne.pdf


Processing PDFs:  16%|█▌        | 211/1302 [44:13<2:43:53,  9.01s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/IHJlCpiOweCmbUK5JJKn.pdf


Start processing 2 pages concurrently
    2 [tesseract] took too long to OCR - skipping
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Optimize ratio: 1.01 savings: 0.8%
Output file is a PDF/A-2B (as expected)


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_ltc/downloads/documents/leave_Approving Authority.pdf


Processing PDFs:  16%|█▋        | 212/1302 [50:03<33:43:57, 111.41s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wExm218oGoREEaTiQpYc.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wExm218oGoREEaTiQpYc.pdf


Processing PDFs:  16%|█▋        | 213/1302 [50:04<23:37:49, 78.12s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/IhFj0WtgJKL2X1VOoau9.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/IhFj0WtgJKL2X1VOoau9.pdf


Processing PDFs:  16%|█▋        | 214/1302 [50:04<16:33:07, 54.77s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_committe.php/downloads/documents/2021-Constitution of SEC (MCM and others).pdf


Processing PDFs:  17%|█▋        | 215/1302 [50:05<11:42:52, 38.80s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/RJKcjYW2hePfUJqbktHK.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/RJKcjYW2hePfUJqbktHK.pdf


Processing PDFs:  17%|█▋        | 216/1302 [50:06<8:15:25, 27.37s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MCfjdSrvmrQJ2nxy9Bon.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/9sqfPGWGuvSwu4jOQOUz.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/9sqfPGWGuvSwu4jOQOUz.pdf


Processing PDFs:  17%|█▋        | 217/1302 [50:07<5:50:26, 19.38s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ACg6w1JlK8h69vhHRjYC.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/ACg6w1JlK8h69vhHRjYC.pdf


Processing PDFs:  17%|█▋        | 218/1302 [50:09<4:17:36, 14.26s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/au6fHqGfrWale738hXcN.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/MCfjdSrvmrQJ2nxy9Bon.pdf


Processing PDFs:  17%|█▋        | 219/1302 [50:13<3:19:10, 11.03s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/au6fHqGfrWale738hXcN.pdf


Processing PDFs:  17%|█▋        | 220/1302 [50:19<2:51:07,  9.49s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/TmOhun4equKwckcuiNpJ.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/TmOhun4equKwckcuiNpJ.pdf


Processing PDFs:  17%|█▋        | 221/1302 [50:28<2:51:09,  9.50s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/EMUyR8tEFmVBXsSuznA1.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/EMUyR8tEFmVBXsSuznA1.pdf

Processing PDFs:  17%|█▋        | 222/1302 [50:29<2:02:57,  6.83s/it]


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/RrADER3DzVk8ZWVeKhzt.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/RrADER3DzVk8ZWVeKhzt.pdf


Processing PDFs:  17%|█▋        | 223/1302 [50:29<1:29:43,  4.99s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wJSHlqKKFnozRucrpft5.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wJSHlqKKFnozRucrpft5.pdf


Processing PDFs:  17%|█▋        | 224/1302 [50:30<1:05:46,  3.66s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/WU9c1Xh7j2DLlKVjsWyc.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/WU9c1Xh7j2DLlKVjsWyc.pdf


Processing PDFs:  17%|█▋        | 225/1302 [50:33<1:01:31,  3.43s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/W6dYHYkeKj2My2tontFk.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/0oJgwY0O8Ol3SfeAvrXc.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/W6dYHYkeKj2My2tontFk.pdf


Processing PDFs:  17%|█▋        | 226/1302 [50:33<45:22,  2.53s/it]  

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/0kJxLGvpbl5tFAlL6Aep.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/0kJxLGvpbl5tFAlL6Aep.pdf


Processing PDFs:  17%|█▋        | 227/1302 [50:34<33:36,  1.88s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/pvfsfqjTyQlBzlVtw1rA.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/pvfsfqjTyQlBzlVtw1rA.pdf


Processing PDFs:  18%|█▊        | 228/1302 [50:34<28:31,  1.59s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/RlIENiWzeXsyPlupPDQJ.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/RlIENiWzeXsyPlupPDQJ.pdf


Processing PDFs:  18%|█▊        | 229/1302 [50:36<26:42,  1.49s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/0oJgwY0O8Ol3SfeAvrXc.pdf


Processing PDFs:  18%|█▊        | 230/1302 [50:37<23:49,  1.33s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/h3Pnd20RAeU9toZAGWCp.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/h3Pnd20RAeU9toZAGWCp.pdf


Processing PDFs:  18%|█▊        | 231/1302 [50:37<20:04,  1.12s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/BWgTwekq2n4Aix6O8Aba.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/xtdqCuV7zkt3oSF1bd34.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/xtdqCuV7zkt3oSF1bd34.pdf


Processing PDFs:  18%|█▊        | 232/1302 [50:39<21:22,  1.20s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/2sGeVHrQVHnMaAWT5bcL.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/BWgTwekq2n4Aix6O8Aba.pdf


Processing PDFs:  18%|█▊        | 233/1302 [50:40<21:45,  1.22s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/2sGeVHrQVHnMaAWT5bcL.pdf


Processing PDFs:  18%|█▊        | 234/1302 [50:41<19:23,  1.09s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/125apgNtvZ31YgVyoDfT.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/125apgNtvZ31YgVyoDfT.pdf


Processing PDFs:  18%|█▊        | 235/1302 [50:43<24:43,  1.39s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/skg16NU6PELqT77uJtLG.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/skg16NU6PELqT77uJtLG.pdf


Processing PDFs:  18%|█▊        | 236/1302 [50:44<23:20,  1.31s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Hindi_20Shabdavali.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Awzw0GNiyEfYB76zfU7I.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:  18%|█▊        | 237/1302 [50:58<1:30:22,  5.09s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/IITIPAN.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Awzw0GNiyEfYB76zfU7I.pdf


Processing PDFs:  18%|█▊        | 238/1302 [51:02<1:25:01,  4.79s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:  18%|█▊        | 239/1302 [51:05<1:13:44,  4.16s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:  18%|█▊        | 240/1302 [51:06<1:01:00,  3.45s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:  19%|█▊        | 241/1302 [51:08<52:24,  2.96s/it]  

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Final_20Draft_20Advtt._20November_202024_20-_20School_20of_20Innovation_20-_20Hindi..pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Final_20Draft_20Advtt._20November_202024_20-_20School_20of_20Innovation_20-_20Hindi..pdf


Processing PDFs:  19%|█▊        | 242/1302 [51:12<58:08,  3.29s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Corrigendum-Notification.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Corrigendum-Notification.pdf


Processing PDFs:  19%|█▊        | 243/1302 [51:16<1:01:50,  3.50s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/VS5yf5CIVpIITNt7qyZW.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/VS5yf5CIVpIITNt7qyZW.pdf


Processing PDFs:  19%|█▊        | 244/1302 [51:26<1:34:37,  5.37s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Final_20Draft_20Advtt._20September_202024_20-_20Hindi1.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Final_20Draft_20Advtt._20September_202024_20-_20Hindi1.pdf


Processing PDFs:  19%|█▉        | 245/1302 [51:35<1:51:27,  6.33s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Lw7kmBlBEmhiLGKFet0t.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Lw7kmBlBEmhiLGKFet0t.pdf


Processing PDFs:  19%|█▉        | 246/1302 [51:43<2:04:20,  7.06s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_iiti.ac.in_recruitments_faculty_positions/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:  19%|█▉        | 247/1302 [52:16<4:19:19, 14.75s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/IITIPAN.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_academic.iiti.ac.in_scholarshipcss.php/downloads/documents/Income Certifivcate Issuing Authority in various states or Union Terrirories .pdf


Processing PDFs:  19%|█▉        | 248/1302 [52:22<3:30:24, 11.98s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Newsletter_Vol_III.pdf


Start processing 8 pages concurrently
    1 page already has text! - rasterizing text and running OCR anyway
    2 page already has text! - rasterizing text and running OCR anyway
    4 page already has text! - rasterizing text and running OCR anyway
    6 page already has text! - rasterizing text and running OCR anyway
    5 page already has text! - rasterizing text and running OCR anyway
    3 page already has text! - rasterizing text and running OCR anyway
    8 page already has text! - rasterizing text and running OCR anyway
    7 page already has text! - rasterizing text and running OCR anyway
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Optimize ratio: 1.43 savings: 29.8%
Output file is a PDF/A-2B (as expected)
The output file size is 1.84× larger than the input file.
Possible reasons for this include:
The argument --force-ocr

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/Xerox_20Scan_13032025151501.PDF


Processing PDFs:  19%|█▉        | 249/1302 [57:42<30:31:58, 104.39s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:  19%|█▉        | 250/1302 [57:47<21:48:09, 74.61s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:  19%|█▉        | 251/1302 [57:51<15:34:53, 53.37s/it]The PDF <_io.BufferedReader name='/home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/LinkedPdftoBannerPhoto.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/LinkedPdftoBannerPhoto.pdf


The PDF <_io.BufferedReader name='/home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/LinkedPdftoBannerPhoto.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/LinkedPdftoBannerPhoto.pdf


Processing PDFs:  19%|█▉        | 252/1302 [57:53<11:07:10, 38.12s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:  19%|█▉        | 253/1302 [57:58<8:14:22, 28.28s/it] 

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Standard_20Operating_20Procedure_20(SoP)_20for_20YAAA-2025.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Standard_20Operating_20Procedure_20(SoP)_20for_20YAAA-2025.pdf


Processing PDFs:  20%|█▉        | 254/1302 [58:01<5:56:43, 20.42s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/ADC2024_5_Details.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/ADC2024_5_Details.pdf


Processing PDFs:  20%|█▉        | 255/1302 [58:05<4:30:58, 15.53s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/2_IIT_20Indore_20Dissertation_20Policy_20for_20External_20Students.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/2_IIT_20Indore_20Dissertation_20Policy_20for_20External_20Students.pdf


Processing PDFs:  20%|█▉        | 256/1302 [58:08<3:25:38, 11.80s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Anti-Ragging Committee Details.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Anti-Ragging Committee Details.pdf


Processing PDFs:  20%|█▉        | 257/1302 [58:09<2:30:52,  8.66s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:  20%|█▉        | 258/1302 [58:41<4:33:15, 15.70s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20201021 Policy for Appointment of Visiting Faculty Members.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20201021 Policy for Appointment of Visiting Faculty Members.pdf


Processing PDFs:  20%|█▉        | 259/1302 [58:47<3:41:37, 12.75s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_Leave.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_Leave.pdf


Processing PDFs:  20%|█▉        | 260/1302 [58:55<3:16:11, 11.30s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20201013 Policy for Appointment of Regular Faculty Members.pdf



    4 [tesseract] took too long to OCR - skipping


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20201013 Policy for Appointment of Regular Faculty Members.pdf


Processing PDFs:  20%|██        | 261/1302 [59:11<3:39:13, 12.64s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20201116 Policy for Appointment of Adjunct Faculty Members.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20201116 Policy for Appointment of Adjunct Faculty Members.pdf

Processing PDFs:  20%|██        | 262/1302 [59:11<2:37:14,  9.07s/it]


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_LTC.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_LTC.pdf


Processing PDFs:  20%|██        | 263/1302 [59:19<2:28:54,  8.60s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_CEA.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main/downloads/documents/leave_Approving Authority.pdf


Processing PDFs:  20%|██        | 264/1302 [59:24<2:07:58,  7.40s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/leave_Approving Authority.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_CEA.pdf


Processing PDFs:  20%|██        | 265/1302 [59:26<1:43:21,  5.98s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/9cbb1590c00a2cefdd1ee83a400de87c.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/9cbb1590c00a2cefdd1ee83a400de87c.pdf


Processing PDFs:  20%|██        | 266/1302 [59:38<2:13:50,  7.75s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20230406_Faculty Handbook_IITI_6april2023.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/20230406_Faculty Handbook_IITI_6april2023.pdf


Processing PDFs:  21%|██        | 267/1302 [1:00:40<6:54:39, 24.04s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_EWS.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_EWS.pdf


Processing PDFs:  21%|██        | 268/1302 [1:00:53<5:56:43, 20.70s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_Sexual Harrassment.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_Sexual Harrassment.pdf


Processing PDFs:  21%|██        | 269/1302 [1:00:54<4:14:38, 14.79s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_Reservation.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/10-Pre-Bid.PDF


Processing PDFs:  21%|██        | 270/1302 [1:00:57<3:11:48, 11.15s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_PwD.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_tendercs/downloads/documents/005-Corrigendum.PDF


Processing PDFs:  21%|██        | 271/1302 [1:00:58<2:19:27,  8.12s/it]

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/FAQ_PwD.pdf


Processing PDFs:  21%|██        | 272/1302 [1:00:59<1:44:11,  6.07s/it]Start processing 13 pages concurrently


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/IT ACT, 1961 including amendments.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_www.iiti.ac.in_recruitments_non-teaching-recruitment/downloads/documents/wffkGgaFStKmSsBcM5H7.pdf

Processing PDFs:  21%|██        | 273/1302 [1:01:51<5:38:26, 19.73s/it]


[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/IITIPAN.pdf


    1 page already has text! - rasterizing text and running OCR anyway


[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/IT ACT, 1961 including amendments.pdf


Processing PDFs:  21%|██        | 274/1302 [1:01:56<4:21:32, 15.26s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:  21%|██        | 275/1302 [1:02:04<3:48:02, 13.32s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:  21%|██        | 276/1302 [1:02:06<2:49:48,  9.93s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:  21%|██▏       | 277/1302 [1:02:12<2:28:27,  8.69s/it]    3 [tesseract] took too long to OCR - skipping


[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/IITI Statutes 04102017.pdf
[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_page_centers/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:  21%|██▏       | 278/1302 [1:02:35<3:40:23, 12.91s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/IITIPAN.pdf


    1 page already has text! - rasterizing text and running OCR anyway
Postprocessing...
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Optimize ratio: 1.42 savings: 29.8%
Output file is a PDF/A-2B (as expected)
    1 [tesseract] took too long to OCR - skipping
Postprocessing...
Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.
Optimize ratio: 1.28 savings: 21.7%
The output file size is 1.85× larger than the input file.
Possible reasons for this include:
The argument --force-ocr was issued, causing transcoding.
The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.
PDF/A conversion was enabled. (Try `--output-type pdf`.)

Output file is a PDF/A-2B (as expected)
The output file size is 4.94× larger than the input file.
Possible reasons for this include:
The argument --force-ocr

[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_facultyaffairs.iiti.ac.in_main_pay_matters/downloads/documents/IITI Statutes 04102017.pdf


Processing PDFs:  21%|██▏       | 279/1302 [1:03:22<6:36:25, 23.25s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Green_20vehicle_20schedule-_202022-23.pdf


Processing PDFs:  22%|██▏       | 280/1302 [1:03:26<4:57:11, 17.45s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Mission_20Vision_20Objectives_20of_20IITI_20-_20Hindi.pdf


Processing PDFs:  22%|██▏       | 281/1302 [1:03:31<3:51:35, 13.61s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Mission_Vision_Objectives.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Mission_Vision_Objectives.pdf


Processing PDFs:  22%|██▏       | 282/1302 [1:03:35<3:02:50, 10.76s/it]

[Text] Detected native text PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Hindi_20Shabdavali.pdf
[✓] Processed: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/http_www.iiti.ac.in_page_csir-ceeri/downloads/documents/Hindi_20Shabdavali.pdf


Processing PDFs:  22%|██▏       | 283/1302 [1:03:52<3:34:44, 12.64s/it]

[OCR] Detected scanned PDF: /home/saranshvashistha/workspace/AIML-018-IITI-SoC/data/dataset-20250624T183251Z-1-001/dataset/https_iiti.ac.in_hindi_director/downloads/documents/IITIPAN.pdf


    1 page already has text! - rasterizing text and running OCR anyway
