## Working With PPTX

### 1. Module Import

In [17]:
import os
from pptx import Presentation
from PIL import Image
from io import BytesIO
from pathlib import Path
from deep_translator import GoogleTranslator
import shutil

### 2. Handling Data

#### Path

In [18]:
# Input deck and the root folder that receives everything this notebook writes.
pptx_path = '../data_import/Networking.pptx'
target_path = Path('../data_export/file_pptx')

# One sub-folder per kind of artefact produced below.
target_text_path = target_path.joinpath('extracted_text')
target_image_path = target_path.joinpath('extracted_images')
target_translate_path = target_path.joinpath('translated_pptx')

# Create the output folders up front; folders that already exist are left alone.
for output_folder in (target_text_path, target_image_path, target_translate_path):
    os.makedirs(output_folder, exist_ok=True)

#### Extract Text

In [23]:
def extract_text_from_pptx(pptx_file):
    """Collect the text of every run in a presentation, in traversal order.

    Slides are visited in order. On each slide, only the shapes found
    directly in ``slide.shapes`` that report a text frame are read,
    paragraph by paragraph and run by run.

    Args:
        pptx_file: Presentation to open; passed straight to ``Presentation``.

    Returns:
        list: One string per run, in the order the runs were visited.
    """
    deck = Presentation(pptx_file)
    return [
        run.text
        for slide in deck.slides
        for shape in slide.shapes
        if shape.has_text_frame
        for paragraph in shape.text_frame.paragraphs
        for run in paragraph.runs
    ]

def save_extracted_texts(texts, output_path):
    """Write each text to ``output_path`` on its own line, encoded as UTF-8.

    Args:
        texts: Iterable of strings; every entry gets a trailing newline.
        output_path: Destination file; it is created or overwritten.
    """
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        out_file.writelines(text + '\n' for text in texts)

# Pull every text run out of the deck and persist them, one per line.
extracted_texts_file = target_text_path / 'extracted_texts.txt'
extracted_texts = extract_text_from_pptx(pptx_path)
save_extracted_texts(extracted_texts, extracted_texts_file)

print(f"Extracted texts saved to: {extracted_texts_file}")


Extracted texts saved to: ../data_export/file_pptx/extracted_text/extracted_texts.txt


#### Extract Images

In [10]:
def extract_images(slide, slide_num, output_dir=None):
    """Save every picture shape found directly on a slide as an image file.

    Only shapes in ``slide.shapes`` whose type is ``MSO_SHAPE_TYPE.PICTURE``
    are handled (shapes nested inside groups are not visited). Each picture
    is re-encoded with Pillow and written as
    ``slide_<slide_num>_image_<shape_id>.png``. If Pillow cannot decode or
    re-encode a picture, its original bytes are written instead, using the
    picture's own file extension, so one problematic image does not stop
    the rest of the extraction.

    Args:
        slide: The python-pptx slide to scan.
        slide_num: Number used in the output file names.
        output_dir: Folder to write into. Defaults to the notebook-level
            ``target_image_path``.
    """
    # Imported here so the cell does not depend on an extra top-level import;
    # PICTURE is the named constant for the shape-type value 13.
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    if output_dir is None:
        output_dir = target_image_path

    for shape in slide.shapes:
        if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
            continue

        picture = shape.image
        image_stem = f"slide_{slide_num}_image_{shape.shape_id}"
        image_path = os.path.join(output_dir, f"{image_stem}.png")
        try:
            with Image.open(BytesIO(picture.blob)) as img:
                # PNG cannot store CMYK or YCbCr pixel data; convert first so
                # save() does not raise for such pictures.
                if img.mode in ("CMYK", "YCbCr"):
                    img = img.convert("RGB")
                img.save(image_path)
        except OSError as exc:
            # Pillow could not read or write this picture (UnidentifiedImageError
            # is an OSError too): keep the raw bytes under the native extension.
            image_path = os.path.join(output_dir, f"{image_stem}.{picture.ext}")
            with open(image_path, "wb") as raw_file:
                raw_file.write(picture.blob)
            print(f"Could not convert image to PNG ({exc}); kept original format.")
        print(f"Saved image: {image_path}")

# Open the deck and export its pictures slide by slide (slides numbered from 1).
prs = Presentation(pptx_path)

for slide_number, slide in enumerate(prs.slides, start=1):
    extract_images(slide, slide_number)

print("Image extraction completed successfully.")

Saved image: ../data_export/file_pptx/extracted_images/slide_2_image_84.png
Saved image: ../data_export/file_pptx/extracted_images/slide_2_image_85.png
Saved image: ../data_export/file_pptx/extracted_images/slide_3_image_93.png
Saved image: ../data_export/file_pptx/extracted_images/slide_5_image_107.png
Saved image: ../data_export/file_pptx/extracted_images/slide_10_image_159.png
Saved image: ../data_export/file_pptx/extracted_images/slide_10_image_160.png
Saved image: ../data_export/file_pptx/extracted_images/slide_11_image_183.png
Saved image: ../data_export/file_pptx/extracted_images/slide_11_image_184.png
Saved image: ../data_export/file_pptx/extracted_images/slide_12_image_235.png
Saved image: ../data_export/file_pptx/extracted_images/slide_12_image_239.png
Saved image: ../data_export/file_pptx/extracted_images/slide_12_image_244.png
Saved image: ../data_export/file_pptx/extracted_images/slide_13_image_272.png
Saved image: ../data_export/file_pptx/extracted_images/slide_13_image_2

### Translate from English to German

In [26]:
# Work on a copy of the source deck, placed in the translation output folder
# under the same file name.
pptx_path = Path('../data_import/Networking.pptx')
copied_pptx_path = target_translate_path.joinpath(pptx_path.name)
shutil.copyfile(src=pptx_path, dst=copied_pptx_path)

def read_extracted_texts(input_path):
    """Load the saved texts back, one list entry per line of the file.

    Leading and trailing whitespace (including the line's newline) is
    removed from every entry; blank lines are kept as empty strings.

    Args:
        input_path: UTF-8 text file to read.

    Returns:
        list: The stripped lines, in file order.
    """
    with open(input_path, mode='r', encoding='utf-8') as in_file:
        return [line.strip() for line in in_file]

def translate_text(text, source_lang='auto', target_lang='german'):
    """Translate ``text`` using ``GoogleTranslator`` from ``deep_translator``.

    Blank strings (empty or whitespace only) are returned unchanged without
    creating a translator, since there is nothing to translate. If the
    translator yields ``None``, the original text is returned so callers
    always get a value they can write back into the presentation.

    Args:
        text: The text to translate.
        source_lang: Source language passed to ``GoogleTranslator``.
        target_lang: Target language passed to ``GoogleTranslator``.

    Returns:
        The translated text, or ``text`` itself when it is blank or no
        translation was produced.
    """
    # Skip the remote call for blank strings; non-string input still goes to
    # the translator so its own validation applies.
    if isinstance(text, str) and not text.strip():
        return text

    translator = GoogleTranslator(source=source_lang, target=target_lang)
    translated = translator.translate(text)
    # Fall back to the input rather than handing None to the caller.
    return text if translated is None else translated

def rebuild_pptx_with_translations(original_pptx, translated_texts, output_pptx):
    """Replace every run's text with the translation at the same position.

    Runs are visited slide by slide, shape by shape (only shapes directly in
    ``slide.shapes`` that have a text frame), paragraph by paragraph. The
    n-th run visited receives ``translated_texts[n]``, so the sequence must
    be in that same order and hold at least one entry per run.

    Args:
        original_pptx: Presentation to open; passed to ``Presentation``.
        translated_texts: Indexable sequence of replacement strings.
        output_pptx: Where the modified presentation is saved.

    Raises:
        IndexError: If there are fewer translations than runs.
    """
    deck = Presentation(original_pptx)

    def iter_runs():
        # Yield every run of the deck in the traversal order described above.
        for slide in deck.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        yield from paragraph.runs

    for position, run in enumerate(iter_runs()):
        run.text = translated_texts[position]

    deck.save(output_pptx)

# Reload the run texts written by the extraction step.
extracted_texts_file = target_text_path / 'extracted_texts.txt'
extracted_texts = read_extracted_texts(extracted_texts_file)

# One translate_text call per extracted line, with its default languages.
translated_texts = list(map(translate_text, extracted_texts))

# Write the translations into the copied deck and save it under a new name.
output_path = target_translate_path / 'translated_pptx.pptx'
rebuild_pptx_with_translations(copied_pptx_path, translated_texts, output_path)

print(f"Translated PPTX file saved to: {output_path}")

Translated PPTX file saved to: ../data_export/file_pptx/translated_pptx/translated_pptx.pptx
