In [5]:
!pip install docx2pdf reportlab pdfkit xlsx2html pillow tqdm


Collecting docx2pdf
  Downloading docx2pdf-0.1.8-py3-none-any.whl.metadata (3.3 kB)
Collecting reportlab
  Downloading reportlab-4.4.1-py3-none-any.whl.metadata (1.8 kB)
Collecting pdfkit
  Downloading pdfkit-1.0.0-py3-none-any.whl.metadata (9.3 kB)
Collecting xlsx2html
  Downloading xlsx2html-0.6.2-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting openpyxl>=2.4.8 (from xlsx2html)
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting babel>=2.3.4 (from xlsx2html)
  Downloading babel-2.17.0-py3-none-any.whl.metadata (2.0 kB)
Collecting et-xmlfile (from openpyxl>=2.4.8->xlsx2html)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading docx2pdf-0.1.8-py3-none-any.whl (6.7 kB)
Downloading reportlab-4.4.1-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
import os
import platform
from pathlib import Path
from shutil import copy2
from tqdm import tqdm

from PIL import Image
from reportlab.pdfgen import canvas

# Detect the operating system and configure the base path.
IS_WINDOWS = platform.system() == "Windows"

# Default project root: the native Windows path, or (presumably) the same
# folder as seen through the WSL mount when not running on Windows.
if IS_WINDOWS:
    _DEFAULT_BASE_DIR = "C:/Users/wilso/MBA_COGNITIVE_ENVIROMENTS"
else:
    _DEFAULT_BASE_DIR = "/mnt/c/Users/wilso/MBA_COGNITIVE_ENVIROMENTS"

# The MBA_BASE_DIR environment variable overrides the default so the notebook
# can run on another machine without editing the code.
BASE_DIR = Path(os.environ.get("MBA_BASE_DIR", _DEFAULT_BASE_DIR))

# mkdir(parents=True) below would silently create a missing BASE_DIR tree and
# the run would "succeed" with nothing to convert, so warn explicitly first.
if not BASE_DIR.is_dir():
    print(f"⚠️ BASE_DIR não encontrado: {BASE_DIR}. Nenhum arquivo será convertido.")

# All generated PDFs are written to this folder.
TARGET_DIR = BASE_DIR / "references"
TARGET_DIR.mkdir(parents=True, exist_ok=True)

# Optional imports (docx2pdf, comtypes, pdfkit/xlsx2html).
# Each converter backend is optional: when its package is missing, the name is
# bound to None and the matching converter skips the file instead of failing.
# Only ImportError is caught so that Ctrl-C / SystemExit are never swallowed.
try:
    from docx2pdf import convert as convert_docx
except ImportError:
    convert_docx = None
    print("⚠️ docx2pdf não disponível. Arquivos .docx serão ignorados.")

try:
    import comtypes.client
except ImportError:
    comtypes = None
    print("⚠️ comtypes não disponível. Arquivos .pptx serão ignorados.")

try:
    import pdfkit
    from xlsx2html import xlsx2html
except ImportError:
    # Both are required for .xlsx conversion, so disable them together.
    pdfkit = None
    xlsx2html = None
    print("⚠️ pdfkit/xlsx2html não disponíveis. Arquivos .xlsx serão ignorados.")

# Conversores

def convert_image_to_pdf(src, dst):
    """Convert an image file to a PDF.

    Args:
        src: Path of the source image (anything ``PIL.Image.open`` accepts).
        dst: Path of the PDF file to write.

    The image is converted to RGB before saving.
    """
    # The context manager closes the underlying file handle even if the
    # conversion or the save fails.
    with Image.open(src) as img:
        # Explicit format so the output does not depend on dst's suffix.
        img.convert("RGB").save(dst, format="PDF")

def convert_text_to_pdf(src, dst):
    """Render a plain-text file into a PDF, one source line per text line.

    Args:
        src: Path of the text file; read as UTF-8, undecodable bytes are dropped.
        dst: Path of the PDF file to write.

    Leading whitespace is preserved so that indentation (e.g. in ``.py``
    files) survives the conversion; only trailing whitespace/newlines are
    removed. Long lines are not wrapped.
    """
    left_x, top_y, bottom_y = 40, 800, 40  # text origin and lower page limit

    c = canvas.Canvas(str(dst))
    text = c.beginText(left_x, top_y)
    with open(src, 'r', encoding='utf-8', errors='ignore') as f:
        # Stream the file instead of loading every line into memory.
        for line in f:
            # rstrip (not strip): keep indentation; expand tabs to spaces so
            # indentation is emitted as ordinary space characters.
            text.textLine(line.rstrip().expandtabs(4))
            if text.getY() < bottom_y:
                # Cursor moved past the bottom limit: flush and start a new page.
                c.drawText(text)
                c.showPage()
                text = c.beginText(left_x, top_y)
    c.drawText(text)
    c.save()

def convert_docx_to_pdf(src, dst):
    """Convert a .docx file to PDF via docx2pdf (Windows only).

    Args:
        src: Path of the source .docx file.
        dst: Path of the PDF file to write.

    The file is skipped (with a message) when not running on Windows or when
    docx2pdf is unavailable. The conversion runs on a temporary copy placed
    in BASE_DIR, which is always removed afterwards.
    """
    if not IS_WINDOWS or convert_docx is None:
        print(f"🚫 Ignorado (.docx): {src}")
        return
    tmp = BASE_DIR / f"__temp__{src.name}"
    try:
        copy2(src, tmp)
        # Pass the explicit output file: passing only the target folder would
        # name the PDF after the temporary copy ("__temp__<stem>.pdf").
        convert_docx(str(tmp), str(dst))
    finally:
        # Never leave the temporary copy behind, even when conversion fails.
        if tmp.exists():
            tmp.unlink()

def convert_pptx_to_pdf(src, dst):
    """Convert a .pptx file to PDF by automating PowerPoint through COM.

    Args:
        src: Path of the source .pptx file.
        dst: Path of the PDF file to write.

    The file is skipped (with a message) when not running on Windows or when
    comtypes is unavailable. The presentation and the PowerPoint application
    are always released, even if opening or saving raises.
    """
    if not IS_WINDOWS or comtypes is None:
        print(f"🚫 Ignorado (.pptx): {src}")
        return
    powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
    try:
        powerpoint.Visible = 1
        pres = powerpoint.Presentations.Open(str(src), WithWindow=False)
        try:
            pres.SaveAs(str(dst), 32)  # 32 = PDF
        finally:
            pres.Close()
    finally:
        # Without this, a failed conversion leaves the PowerPoint process running.
        powerpoint.Quit()

def convert_xlsx_to_pdf(src, dst):
    """Convert an .xlsx file to PDF via an intermediate HTML rendering.

    Args:
        src: Path of the source .xlsx file.
        dst: Path of the PDF file to write.

    The file is skipped (with a message) when pdfkit or xlsx2html is
    unavailable. The intermediate HTML lives in a private temporary
    directory that is removed afterwards, whether or not conversion succeeds.
    """
    import tempfile  # local import: only this converter needs it

    if pdfkit is None or xlsx2html is None:
        print(f"🚫 Ignorado (.xlsx): {src}")
        return
    # Do not write the HTML next to the source: that would overwrite, and then
    # delete, an unrelated "<stem>.html" file that happens to exist there.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = Path(tmp_dir) / (src.stem + ".html")
        with open(html_path, 'w', encoding='utf-8') as f:
            xlsx2html(src, f)
        pdfkit.from_file(str(html_path), str(dst))

# Processador

def process_file(file: Path):
    """Convert one file into a PDF inside TARGET_DIR, based on its extension.

    Files that are already PDFs, and files located under TARGET_DIR, are left
    alone. Unsupported extensions are reported and skipped. Any exception
    raised by a converter is caught and printed so one bad file does not stop
    the batch.

    NOTE(review): the output name uses only the file stem, so two sources
    sharing a stem (e.g. ``a.txt`` and ``a.md``, or same name in different
    folders) write to the same PDF — confirm whether that is acceptable.
    """
    suffix = file.suffix.lower()

    # Nothing to do for existing PDFs or for anything already in the output folder.
    if suffix == '.pdf' or TARGET_DIR in file.parents:
        return

    pdf_target = TARGET_DIR / (file.stem + ".pdf")

    try:
        # Pick the converter for this extension (None means unsupported).
        converter = None
        if suffix in ('.jpg', '.jpeg', '.png'):
            converter = convert_image_to_pdf
        elif suffix in ('.txt', '.md', '.csv', '.py'):
            converter = convert_text_to_pdf
        elif suffix == '.docx':
            converter = convert_docx_to_pdf
        elif suffix == '.pptx':
            converter = convert_pptx_to_pdf
        elif suffix == '.xlsx':
            converter = convert_xlsx_to_pdf

        if converter is None:
            print(f"❌ Ignorado (formato não suportado): {file}")
        else:
            converter(file, pdf_target)
    except Exception as e:
        print(f"⚠️ Erro ao converter {file.name}: {e}")

# Run the conversion with a progress bar.
# Only regular files are queued, so the progress total reflects the number of
# files actually handed to process_file (directories are not counted).
all_files = [p for p in BASE_DIR.rglob("*") if p.is_file()]
for path in tqdm(all_files, desc="Convertendo arquivos"):
    process_file(path)

print(f"\n✅ Conversão finalizada. Todos os PDFs estão em: {TARGET_DIR}")


⚠️ comtypes não disponível. Arquivos .pptx serão ignorados.


Convertendo arquivos: 100%|██████████| 1/1 [00:00<00:00, 5363.56it/s]


✅ Conversão finalizada. Todos os PDFs estão em: /mnt/c/Users/wilso/MBA_COGNITIVE_ENVIROMENTS/references



