In [1]:
"""
This notebook is for uploading and extracting text from CTSE lecture PDFs and PPTX files.
"""

'\nThis notebook is for uploading and extracting text from CTSE lecture PDFs and PPTX files.\n'

In [None]:
import os
from pathlib import Path
import fitz  # PyMuPDF for PDFs
from pptx import Presentation

In [None]:
# Directory for uploads; save_clean_text also writes its extracted-text files
# here. The path is relative, so it is resolved against the current working
# directory of the running kernel.
UPLOAD_DIR = Path("../data")
# Create the directory (and any missing parents) up front; exist_ok=True means
# re-running this cell does not raise if it already exists.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# --- Helper functions ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``page.get_text()`` output of each page, joined in page
        order with no extra separator. An empty string if the document
        yields no pages.
    """
    # Use the document as a context manager so its handle is released even if
    # extracting a page raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying the growing
        # string for every page.
        return "".join(page.get_text() for page in doc)

def extract_text_from_pptx(pptx_path):
    """Gather the text of each text-bearing shape in a PPTX presentation.

    Args:
        pptx_path: Location of the presentation; passed unchanged to
            ``Presentation``.

    Returns:
        str: For every shape in ``slide.shapes`` that exposes a ``text``
        attribute, that text followed by a newline, in slide and shape
        order. Shapes without a ``text`` attribute are skipped.
    """
    deck = Presentation(pptx_path)
    # One newline-terminated entry per shape that has a `text` attribute.
    shape_lines = [
        shape.text + "\n"
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "".join(shape_lines)

def save_clean_text(file_name, text):
    """Write ``text`` to ``<file_name>_extracted.txt`` inside ``UPLOAD_DIR``.

    Args:
        file_name: Base name for the output file (no extension).
        text: The string to write; stored as UTF-8.

    Returns:
        The path of the file that was written. An existing file of the same
        name is overwritten.
    """
    destination = UPLOAD_DIR / f"{file_name}_extracted.txt"
    # write_text opens the file in text write mode, writes, and closes it.
    destination.write_text(text, encoding="utf-8")
    return destination

In [None]:
# --- Main pipeline ---
def ingest_file(file_path):
    """Extract the text of one lecture file and save it with save_clean_text.

    Args:
        file_path: Path to a ``.pdf`` or ``.pptx`` file, given as a
            ``pathlib.Path`` or as a plain string / ``os.PathLike``. The
            extension is matched case-insensitively.

    Returns:
        The value returned by ``save_clean_text`` (the path of the written
        text file).

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.pptx``.
    """
    # Accept plain strings as well as Path objects; previously a str argument
    # failed with AttributeError on `.suffix`.
    file_path = Path(file_path)

    ext = file_path.suffix.lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file_path)
    elif ext == ".pptx":
        raw_text = extract_text_from_pptx(file_path)
    else:
        raise ValueError("Unsupported file type: only PDF and PPTX allowed")

    # NOTE(review): the output name is built from the stem only, so a PDF and
    # a PPTX sharing a stem are saved under the same name and the later one
    # overwrites the earlier.
    file_name = file_path.stem
    cleaned_path = save_clean_text(file_name, raw_text)
    print(f"Saved cleaned text to: {cleaned_path}")
    return cleaned_path

# --- Example Usage ---
# In Jupyter, use:
# from pathlib import Path
# ingest_file(Path("../data/ctse_lecture_notes.pdf"))