In [2]:
import pandas as pd
import os
import glob
import pymupdf4llm

def extract_text(pdf_path):
    try:
        md_text = pymupdf4llm.to_markdown(pdf_path)
        return md_text
    except Exception as e:
        return f"Error: {str(e)}"

pdf_folder = 'pdfs'
pdf_files = glob.glob(os.path.join(pdf_folder, '*.pdf'))

print(f"\nFound {len(pdf_files)} PDF files")

result_ids = []
pdf_contents = []

for i, pdf_path in enumerate(pdf_files):
    filename = os.path.basename(pdf_path)
    result_id = os.path.splitext(filename)[0]

    print(f"[{i}] Processing: {filename}...")

    if extract_text:
        content = extract_text(pdf_path)
    else:
        content = "Error: No PDF library available"

    result_ids.append(result_id)
    pdf_contents.append(content)

df = pd.DataFrame({
    'result_id': result_ids,
    'pdf_content': pdf_contents
})

df['content_length'] = df['pdf_content'].str.len()
df['has_error'] = df['pdf_content'].str.startswith('Error')

print(f"Extraction Summary:")
print(f"\tTotal PDFs processed: {len(df)}")
print(f"\tSuccessfully extracted: {(~df['has_error']).sum()}")
print(f"\tFailed extractions: {df['has_error'].sum()}")



Found 31 PDF files
[0]Processing: VdBeSlL9_TMJ.pdf...
[1]Processing: tEdhwBtai2UJ.pdf...
[2]Processing: JYbhWHRjZR8J.pdf...
[3]Processing: vvWK674pjjMJ.pdf...
[4]Processing: X0Vwc_5HcwcJ.pdf...
[5]Processing: Vh3K8GLTg9sJ.pdf...
[6]Processing: BBM06Azg7dsJ.pdf...
[7]Processing: 8GGORR0MaSsJ.pdf...
[8]Processing: upmOB6l0mqcJ.pdf...
[9]Processing: L0bsiVYsSXAJ.pdf...
[10]Processing: o75KCLeHjDUJ.pdf...
[11]Processing: cOIHMy0teoQJ.pdf...
[12]Processing: 652uhLWfgHEJ.pdf...
[13]Processing: HBJ70w1pEbkJ.pdf...
[14]Processing: hyhgqgIFk54J.pdf...
[15]Processing: SrQA8RayKnEJ.pdf...
[16]Processing: MJMoFKogT5wJ.pdf...
[17]Processing: 92wqEI9nj3kJ.pdf...
[18]Processing: dM9--Pf9JwYJ.pdf...
[19]Processing: QIBcyXtn7KMJ.pdf...
[20]Processing: t5gKJgsKyx4J.pdf...
[21]Processing: aFuibW35LbkJ.pdf...
[22]Processing: 3NMfpZ-a1tMJ.pdf...
[23]Processing: jC68NnJ36K8J.pdf...
[24]Processing: DV5jL8OjxvAJ.pdf...
[25]Processing: Z4ivNADTsZkJ.pdf...
[26]Processing: DqVgd_mTbkAJ.pdf...
[27]Processing: 4B

In [None]:
df.head()

In [None]:
df.to_csv('pdf_extracted_text.csv', index=False)