## Working With PPTX

### 1. Module Import

In [4]:
import os
from pptx import Presentation
from PIL import Image
from io import BytesIO
from pathlib import Path

### 2. Handling Data

#### Path

In [5]:
# Source presentation and the root folder that receives everything extracted from it.
pptx_path = '../data_import/Networking.pptx'
target_path = Path('../data_export/file_pptx')

# Text and images go to separate sub-folders of the export root.
target_text_path = target_path.joinpath('extracted_text')
target_image_path = target_path.joinpath('extracted_images')

# Create both output folders, including missing parents; folders that already exist are left as they are.
for output_dir in (target_text_path, target_image_path):
    output_dir.mkdir(parents=True, exist_ok=True)

#### Extract Text

In [6]:
def extract_text(slide):
    """Return the text of every text-bearing shape on ``slide``.

    Shapes are visited in the order ``slide.shapes`` yields them, and shapes
    that expose no ``text`` attribute are skipped. Each shape's text is
    followed by a newline, so a slide without any text yields an empty string.
    """
    text_chunks = [
        shape.text + "\n"
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "".join(text_chunks)

# Load the deck and write one UTF-8 text file per slide, numbering slides from 1.
prs = Presentation(pptx_path)

for slide_number, slide in enumerate(prs.slides, start=1):
    text_file_path = os.path.join(target_text_path, f"slide_{slide_number}.txt")
    Path(text_file_path).write_text(extract_text(slide), encoding='utf-8')
    print(f"Text from slide {slide_number} has been saved to {text_file_path}")

print("Text extraction completed successfully.")


Text from slide 1 has been saved to ../data_export/file_pptx/extracted_text/slide_1.txt
Text from slide 2 has been saved to ../data_export/file_pptx/extracted_text/slide_2.txt
Text from slide 3 has been saved to ../data_export/file_pptx/extracted_text/slide_3.txt
Text from slide 4 has been saved to ../data_export/file_pptx/extracted_text/slide_4.txt
Text from slide 5 has been saved to ../data_export/file_pptx/extracted_text/slide_5.txt
Text from slide 6 has been saved to ../data_export/file_pptx/extracted_text/slide_6.txt
Text from slide 7 has been saved to ../data_export/file_pptx/extracted_text/slide_7.txt
Text from slide 8 has been saved to ../data_export/file_pptx/extracted_text/slide_8.txt
Text from slide 9 has been saved to ../data_export/file_pptx/extracted_text/slide_9.txt
Text from slide 10 has been saved to ../data_export/file_pptx/extracted_text/slide_10.txt
Text from slide 11 has been saved to ../data_export/file_pptx/extracted_text/slide_11.txt
Text from slide 12 has been 

#### Extract Images

In [10]:
def extract_images(slide, slide_num):
    """Save every embedded picture on ``slide`` into ``target_image_path``.

    Each picture is re-encoded by Pillow and written as
    ``slide_<slide_num>_image_<shape_id>.png``. Two situations that used to
    abort the whole run are now handled per picture:

    * a picture shape without embedded image data (e.g. a linked picture) is
      skipped with a message;
    * an embedded format Pillow cannot decode is written unchanged, using the
      extension python-pptx reports for it instead of ``.png``.

    Args:
        slide: python-pptx slide whose shapes are scanned.
        slide_num: Slide number embedded in the output file names.
    """
    picture_shape_type = 13  # Shape type 13 corresponds to pictures

    for shape in slide.shapes:
        if shape.shape_type != picture_shape_type:
            continue

        try:
            image = shape.image
        except ValueError:
            # python-pptx raises ValueError when the picture has no embedded image.
            print(f"Skipped picture without embedded image: slide {slide_num}, shape {shape.shape_id}")
            continue

        image_bytes = image.blob
        file_stem = f"slide_{slide_num}_image_{shape.shape_id}"
        image_path = os.path.join(target_image_path, f"{file_stem}.png")
        try:
            # The context manager releases Pillow's handle once the PNG is written.
            with Image.open(BytesIO(image_bytes)) as img:
                img.save(image_path)
        except OSError:
            # Pillow could not identify or decode the format (UnidentifiedImageError
            # is an OSError subclass); keep the bytes exactly as stored in the deck.
            image_path = os.path.join(target_image_path, f"{file_stem}.{image.ext}")
            with open(image_path, "wb") as raw_file:
                raw_file.write(image_bytes)
        print(f"Saved image: {image_path}")

# Re-open the deck and export the pictures of each slide, numbering slides from 1.
prs = Presentation(pptx_path)

for slide_number, slide in enumerate(prs.slides, start=1):
    extract_images(slide, slide_number)

print("Image extraction completed successfully.")

Saved image: ../data_export/file_pptx/extracted_images/slide_2_image_84.png
Saved image: ../data_export/file_pptx/extracted_images/slide_2_image_85.png
Saved image: ../data_export/file_pptx/extracted_images/slide_3_image_93.png
Saved image: ../data_export/file_pptx/extracted_images/slide_5_image_107.png
Saved image: ../data_export/file_pptx/extracted_images/slide_10_image_159.png
Saved image: ../data_export/file_pptx/extracted_images/slide_10_image_160.png
Saved image: ../data_export/file_pptx/extracted_images/slide_11_image_183.png
Saved image: ../data_export/file_pptx/extracted_images/slide_11_image_184.png
Saved image: ../data_export/file_pptx/extracted_images/slide_12_image_235.png
Saved image: ../data_export/file_pptx/extracted_images/slide_12_image_239.png
Saved image: ../data_export/file_pptx/extracted_images/slide_12_image_244.png
Saved image: ../data_export/file_pptx/extracted_images/slide_13_image_272.png
Saved image: ../data_export/file_pptx/extracted_images/slide_13_image_2