In [1]:
# Install required libraries if needed.
# %pip (rather than !pip) installs into the environment of the running kernel;
# !pip runs whichever pip is first on the shell PATH, which may belong to a
# different interpreter than the one executing this notebook.
%pip install python-pptx pypdf




[notice] A new release of pip available: 22.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from pptx import Presentation
from pypdf import PdfReader

# Folder locations
INPUT_DIR = "../course_materials"   # where your PPTX/PDF slides live
OUTPUT_DIR = "../processed_texts"   # where we store extracted text

os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_text_from_pptx(path):
    """Extract the text of a PowerPoint file as one newline-separated string.

    Slides are visited in order and, within each slide, shapes are visited in
    the order python-pptx yields them. For every shape:

    * a shape exposing a ``shapes`` collection (a group) is descended into,
      so text inside grouped shapes is included;
    * a shape whose ``has_table`` is true contributes the text of each table
      cell, row by row;
    * any other shape with a ``text`` attribute contributes that text;
    * shapes with none of the above (e.g. pictures) are skipped.

    Args:
        path: Path to the ``.pptx`` file, passed straight to ``Presentation``.

    Returns:
        A ``str`` in which every collected piece of text is followed by
        ``"\\n"``; the empty string when nothing was collected.
    """

    def _iter_shape_texts(shapes):
        # Yield the text pieces of a shape collection, depth-first.
        for shape in shapes:
            if hasattr(shape, "shapes"):
                # Group shape: it has no text of its own, its members do.
                yield from _iter_shape_texts(shape.shapes)
            elif getattr(shape, "has_table", False):
                # Table graphic frame: the text lives in the individual cells.
                for row in shape.table.rows:
                    for cell in row.cells:
                        yield cell.text
            elif hasattr(shape, "text"):
                yield shape.text

    prs = Presentation(path)
    # Build the result with a single join instead of repeated string +=.
    return "".join(
        piece + "\n"
        for slide in prs.slides
        for piece in _iter_shape_texts(slide.shapes)
    )

def extract_text_from_pdf(path):
    """Extract the text of a PDF file as one string, page by page.

    Args:
        path: Path to the PDF file, passed straight to ``PdfReader``.

    Returns:
        The result of ``extract_text()`` for each page, in page order, each
        followed by ``"\\n"`` and concatenated; the empty string when the
        reader exposes no pages.
    """
    pdf = PdfReader(path)
    # One newline-terminated chunk per page, kept in page order.
    page_chunks = [pg.extract_text() + "\n" for pg in pdf.pages]
    return "".join(page_chunks)


# Process all files: convert every .pptx/.pdf in INPUT_DIR to a .txt file of
# the same base name in OUTPUT_DIR.
#
# NOTE: the output name is the input name with its extension replaced by
# ".txt", so "a.pptx" and "a.pdf" both write "a.txt" and the later one wins.
#
# sorted() makes the processing (and overwrite) order deterministic;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(INPUT_DIR)):
    src_path = os.path.join(INPUT_DIR, filename)

    # Office writes "~$<name>.pptx" owner/lock files next to a presentation
    # that is currently open; they are not valid presentations and would make
    # the extractor raise, aborting the whole batch.
    if filename.startswith("~$"):
        continue

    # A directory can also carry a matching suffix; only regular files are
    # handed to the extractors.
    if not os.path.isfile(src_path):
        continue

    # Extension matching is case-insensitive (e.g. "Deck.PPTX").
    lowered = filename.lower()

    if lowered.endswith(".pptx"):
        print("Extracting PPTX:", filename)
        text = extract_text_from_pptx(src_path)

    elif lowered.endswith(".pdf"):
        print("Extracting PDF:", filename)
        text = extract_text_from_pdf(src_path)

    else:
        # Unsupported file type: ignore it.
        continue

    # Save text to output folder
    out_name = os.path.splitext(filename)[0] + ".txt"
    out_path = os.path.join(OUTPUT_DIR, out_name)

    # Explicit UTF-8 so non-ASCII slide text is written the same way on every
    # platform, regardless of the locale's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Saved extracted text → {out_path}")

print("\n✅ Extraction complete!")


Extracting PPTX: 10_1_CLOUD_AI_Data_Science_recap.pptx
Saved extracted text → ../processed_texts\10_1_CLOUD_AI_Data_Science_recap.txt
Extracting PPTX: 10_2_CLOUD_AI_Working_environment_overview.pptx
Saved extracted text → ../processed_texts\10_2_CLOUD_AI_Working_environment_overview.txt
Extracting PPTX: 10_3_CLOUD_AI_Weapons_of_math_destruction.pptx
Saved extracted text → ../processed_texts\10_3_CLOUD_AI_Weapons_of_math_destruction.txt
Extracting PPTX: 10_4_CLOUD_AI_Setting_up_your_working_environment.pptx
Saved extracted text → ../processed_texts\10_4_CLOUD_AI_Setting_up_your_working_environment.txt
Extracting PPTX: 11_CLOUD_AI_Our_first_model.pptx
Saved extracted text → ../processed_texts\11_CLOUD_AI_Our_first_model.txt
Extracting PPTX: 12_CLOUD_AI_Model_quality.pptx
Saved extracted text → ../processed_texts\12_CLOUD_AI_Model_quality.txt
Extracting PPTX: 13_CLOUD_AI_Models.pptx
Saved extracted text → ../processed_texts\13_CLOUD_AI_Models.txt
Extracting PPTX: 14_CLOUD_AI_Data_augmenta