In [None]:
# Install required libraries if needed
!pip install python-pptx pypdf

In [None]:
import os
from pptx import Presentation
from pypdf import PdfReader

# Directory layout (relative paths resolve against the current working directory)
INPUT_DIR = "../course_materials"   # source decks: the PPTX/PDF slides to read
OUTPUT_DIR = "../processed_texts"   # destination for the extracted text files

# Create the destination up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_text_from_pptx(path):
    """Return the text of every text-bearing shape in a PPTX file.

    Slides are visited in the order ``prs.slides`` yields them and, within
    each slide, shapes in the order ``slide.shapes`` yields them. Shapes
    that expose no ``text`` attribute are skipped.

    Args:
        path: Location of the ``.pptx`` file; passed straight to
            ``Presentation``.

    Returns:
        One string in which each shape's text is followed by a newline.
        Empty when no shape carries a ``text`` attribute.
    """
    prs = Presentation(path)
    # Join once instead of growing a string with += inside nested loops.
    return "".join(
        shape.text + "\n"
        for slide in prs.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )

def extract_text_from_pdf(path):
    """Return the extracted text of every page in a PDF file.

    Pages are processed in the order ``reader.pages`` yields them.

    Args:
        path: Location of the ``.pdf`` file; passed straight to ``PdfReader``.

    Returns:
        One string in which each page's text is followed by a newline.
    """
    reader = PdfReader(path)
    # `or ""` is defensive: should extract_text() ever hand back None for a
    # page, that page contributes only its newline instead of raising
    # TypeError on the concatenation and aborting the whole document.
    return "".join(
        (page.extract_text() or "") + "\n"
        for page in reader.pages
    )


# Process all files
# Convert every .pptx/.pdf directly inside INPUT_DIR into a UTF-8 .txt file in
# OUTPUT_DIR. The listing is sorted because os.listdir returns entries in
# arbitrary order; sorting makes the processing order (and the log) reproducible.
#
# NOTE: the output name is the input name with its extension swapped for .txt,
# so "deck.pptx" and "deck.pdf" both map to "deck.txt" and the one processed
# later overwrites the earlier one.
for filename in sorted(os.listdir(INPUT_DIR)):
    src_path = os.path.join(INPUT_DIR, filename)

    # Skip anything that is not a regular file (e.g. a folder whose name
    # happens to end in .pdf) and the "~$..." owner/lock files that Office
    # applications leave next to a document while it is open: they carry the
    # same extension but are not readable presentations.
    if filename.startswith("~$") or not os.path.isfile(src_path):
        continue

    # Extension matching is case-insensitive (.PPTX, .Pdf, ...).
    lowered = filename.lower()

    if lowered.endswith(".pptx"):
        print("Extracting PPTX:", filename)
        text = extract_text_from_pptx(src_path)

    elif lowered.endswith(".pdf"):
        print("Extracting PDF:", filename)
        text = extract_text_from_pdf(src_path)

    else:
        # Unsupported file type: leave it untouched.
        continue

    # Save text to output folder
    out_name = os.path.splitext(filename)[0] + ".txt"
    out_path = os.path.join(OUTPUT_DIR, out_name)

    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Saved extracted text → {out_path}")

print("\n✅ Extraction complete!")


Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
     -------------------------------------- 472.8/472.8 kB 7.3 MB/s eta 0:00:00
Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl (329 kB)
     ------------------------------------- 329.5/329.5 kB 10.3 MB/s eta 0:00:00
Collecting XlsxWriter>=0.5.7
  Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
     ------------------------------------- 175.3/175.3 kB 10.3 MB/s eta 0:00:00
Installing collected packages: XlsxWriter, pypdf, python-pptx
Successfully installed XlsxWriter-3.2.9 pypdf-6.4.0 python-pptx-1.0.2



[notice] A new release of pip available: 22.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Extracting PPTX: 1_DL_Setting_the_Scene.pptx
Saved extracted text → ../processed_texts\1_DL_Setting_the_Scene.txt
Extracting PPTX: 2_DL_Adv_Deep_Learning.pptx
Saved extracted text → ../processed_texts\2_DL_Adv_Deep_Learning.txt
Extracting PPTX: 3_DL_ComputerVision_Classification.pptx
Saved extracted text → ../processed_texts\3_DL_ComputerVision_Classification.txt
Extracting PPTX: 4_DL_CV_Object_Detection.pptx
Saved extracted text → ../processed_texts\4_DL_CV_Object_Detection.txt

✅ Extraction complete!
