# 必要なパッケージのインストール
- 初回のみ

In [6]:
!pip install gTTS
!pip install pydub

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.4
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


# フォントのアップロード

In [7]:
# font
import os
from google.colab import files

# Ask the user to upload a font file; the keys of the returned mapping are
# treated as the uploaded file names below.
uploaded_font = files.upload()
# Fail with a clear message when nothing was uploaded, instead of an opaque
# IndexError from the [0] lookup.
if not uploaded_font:
    raise RuntimeError('No font file was uploaded; re-run this cell and select a font file.')
# Only the first uploaded file is used; later cells read it through `font_path`.
font_path = list(uploaded_font.keys())[0]
print(f'font path: {font_path}')

Saving meiryo.ttc to meiryo.ttc
font: meiryo.ttc


# 設定ファイルの作成
- 基本的には初回のみ
- 設定値を変更した場合は再度実行

In [8]:
# config
import configparser

config = configparser.ConfigParser()

# Subtitle rendering settings. configparser stores each value as its string
# form, so the tuples end up in config.ini as text such as "(1920, 120)".
config["Subtitle"] = {
    "font_size": 50,                    # font size handed to the font loader
    "band_size": (1920, 120),           # (width, height) of the subtitle band
    "text_color": (255, 255, 0),        # colour of the subtitle text
    "background_color": (255, 0, 0),    # fill colour of the band
}

# Written as UTF-8 explicitly because the reader opens config.ini with
# encoding='utf-8'; relying on the platform default could disagree with it.
with open('config.ini', 'w', encoding='utf-8') as configfile:
    config.write(configfile)

# 字幕用テキストのアップロード
- txt 形式のファイルをアップロード
- 空行で区切られた段落が 1 ページ(1 本の動画)になり、段落内の各行が 1 枚の字幕として読み上げられます

In [12]:
# script
import os
from google.colab import files

# Ask the user to upload the subtitle text file; the keys of the returned
# mapping are treated as the uploaded file names below.
uploaded_script = files.upload()
# Fail with a clear message when nothing was uploaded, instead of an opaque
# IndexError from the [0] lookup.
if not uploaded_script:
    raise RuntimeError('No script file was uploaded; re-run this cell and select a txt file.')
# Only the first uploaded file is used; later cells read it through `script_path`.
script_path = list(uploaded_script.keys())[0]
print(f'script path: {script_path}')

Saving script.txt to script.txt
script path: script.txt


# 動画の生成
- フォント、config.ini、字幕用テキストファイルが存在することを確認して実行
- output フォルダ内にページごとの動画(page_0.mp4, page_1.mp4, …)が出力されます
- 実行時に output フォルダ内の既存ファイルは削除されるため、必要な動画は事前に退避してください

In [13]:
# VoiceCaptioner
import cv2
import numpy as np
from gtts import gTTS
from PIL import Image, ImageDraw, ImageFont
import os
from pydub.utils import mediainfo
from typing import List
import subprocess
import shutil
import configparser
import sys

class MixIn:
    """Small file-system helper shared by the classes in this notebook."""

    def remove_all_files(self, dir_name):
        """Delete every regular file located directly inside ``dir_name``.

        Entries for which ``os.path.isfile`` is false (for example
        sub-directories) are left in place; nothing is removed recursively.
        """
        candidates = (os.path.join(dir_name, entry) for entry in os.listdir(dir_name))
        for candidate in candidates:
            if not os.path.isfile(candidate):
                continue
            os.remove(candidate)

class GenerateSubtitle(MixIn):
    """Render subtitle text as narrated video clips and join them into one file.

    Every subtitle line becomes one clip: a solid-colour band with the line
    drawn in its centre, shown for as long as the gTTS narration of that line
    lasts. The per-line clips are then concatenated with ffmpeg.

    Intermediate files live in a ``temp`` directory under the current working
    directory (resolved to an absolute path when the object is created).
    """

    def __init__(
            self,
            font_path: str,
            font_size: int,
            band_size: List,
            background_color: List,
            text_color: List,
        ):
        """Store the rendering settings and make sure the temp directory exists.

        Args:
            font_path: Font file passed to ``ImageFont.truetype``.
            font_size: Font size passed to ``ImageFont.truetype``.
            band_size: ``(width, height)`` of the band image and video frame.
            background_color: Fill colour of the RGB band image.
            text_color: Fill colour used when drawing the text.
        """
        self.font_path = font_path
        self.font_size = font_size
        self.band_size = band_size
        self.background_color = background_color
        self.text_color = text_color
        self.temp_dir = os.path.abspath('temp')
        if not os.path.exists(self.temp_dir):
            os.mkdir(self.temp_dir)
        else:
            print(f'{self.temp_dir} already exists')

    @staticmethod
    def get_audio_duration(audio_file):
        """Return the duration pydub's ``mediainfo`` reports for ``audio_file``.

        The value is returned as a float; the caller multiplies it by the
        frame rate to get a frame count, i.e. it is treated as seconds.
        """
        info = mediainfo(audio_file)
        return float(info["duration"])

    def create_text_video(self, text, output_file):
        """Create one narrated clip that shows ``text`` and save it as ``output_file``.

        Args:
            text: The subtitle line to draw and to narrate (gTTS, ``lang='ja'``).
            output_file: Path of the finished clip (video plus audio).

        Raises:
            subprocess.CalledProcessError: If ffmpeg fails to combine the
                silent video with the narration.
        """
        # text
        font = ImageFont.truetype(self.font_path, self.font_size)
        width, height = self.band_size
        img = Image.new('RGB', (width, height), self.background_color)
        draw = ImageDraw.Draw(img)
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
        # The bounding box of text drawn at (0, 0) does not necessarily start at
        # (0, 0); subtract its origin so the drawn text itself is centred.
        text_x = (width - text_width) // 2 - text_bbox[0]
        text_y = (height - text_height) // 2 - text_bbox[1]
        draw.text((text_x, text_y), text, fill=self.text_color, font=font)

        audio_file = os.path.join(self.temp_dir, "temp_audio.mp3")
        temp_video = os.path.join(self.temp_dir, "temp_video.mp4")
        try:
            # sound
            tts = gTTS(text=text, lang='ja', slow=False)
            tts.save(audio_file)
            duration = self.get_audio_duration(audio_file)
            # movie: repeat the same frame for the length of the narration
            fps = 30
            frame_count = int(fps * duration)
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            video = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
            frame = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            for _ in range(frame_count):
                video.write(frame)
            video.release()
            # movie + sound: move the silent video aside, then let ffmpeg write
            # the combined result to the original output path.
            os.replace(output_file, temp_video)
            # Argument list instead of a shell string, so paths containing
            # spaces or shell metacharacters are passed through unchanged;
            # check=True surfaces ffmpeg failures instead of ignoring them.
            subprocess.run(
                [
                    "ffmpeg", "-i", temp_video, "-i", audio_file,
                    "-c:v", "copy", "-c:a", "aac", "-strict", "experimental",
                    output_file,
                ],
                check=True,
            )
        finally:
            # remove temp files, also when one of the steps above failed
            for leftover in (temp_video, audio_file):
                if os.path.exists(leftover):
                    os.remove(leftover)

    def concatenate_text_video(self, output_file):
        """Join all ``.mp4`` clips in the temp directory into ``output_file``.

        Clips are joined in sorted file-name order through ffmpeg's concat
        demuxer with stream copy. An ffmpeg failure is reported with a printed
        message rather than raised. All files in the temp directory are
        removed afterwards, whether or not the concatenation succeeded.
        """
        clip_names = sorted(f for f in os.listdir(self.temp_dir) if f.endswith('.mp4'))
        list_file_path = os.path.join(self.temp_dir, "file_list.txt")
        with open(list_file_path, "w", encoding="utf-8") as list_file:
            for clip_name in clip_names:
                full_path = os.path.join(self.temp_dir, clip_name)
                list_file.write(f"file '{full_path}'\n")
        try:
            # check=True is required for CalledProcessError to be raised at all;
            # without it a failed ffmpeg run would be reported as a success.
            subprocess.run(
                ["ffmpeg", "-f", "concat", "-safe", "0", "-i", list_file_path, "-c", "copy", output_file],
                check=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Error during concatenation: {e}")
        else:
            print(f"Output file created: {output_file}")
        finally:
            self.remove_all_files(self.temp_dir)

    def main(self, text, output_file):
        """Build one video from ``text``, one clip per non-blank line.

        Args:
            text: Subtitle text; each line becomes one narrated clip. Blank
                and whitespace-only lines are skipped.
            output_file: Path of the concatenated video.
        """
        # Make sure the temp directory exists and holds no clips left over
        # from an interrupted earlier run; they would otherwise be concatenated
        # into this video.
        os.makedirs(self.temp_dir, exist_ok=True)
        self.remove_all_files(self.temp_dir)
        # splited movie
        text_list = [line for line in text.splitlines() if line.strip()]
        if not text_list:
            print(f"No subtitle text for {output_file}; skipped")
            return
        # Zero-pad the clip index so that sorting file names keeps line order.
        digits = len(str(len(text_list)))
        for i, splited_text in enumerate(text_list):
            clip_path = os.path.join(self.temp_dir, f'{str(i).zfill(digits)}.mp4')
            self.create_text_video(splited_text, clip_path)
        self.concatenate_text_video(output_file)

import ast

# Start from an output folder that contains no files from earlier runs
# (only regular files directly inside it are removed).
os.makedirs('output', exist_ok=True)
mixin = MixIn()
mixin.remove_all_files('output')

# Load the subtitle settings from config.ini.
config_ini = configparser.ConfigParser()
config_ini.read('config.ini', encoding='utf-8')
subtitle = config_ini['Subtitle']
# font_path is not read from config.ini; it is the variable set by the font upload cell.
font_size = int(subtitle.get('font_size'))
# The tuples are stored as text such as "(1920, 120)". ast.literal_eval only
# accepts Python literals, so unlike eval it cannot execute code from the file.
band_size = ast.literal_eval(subtitle.get('band_size'))
text_color = ast.literal_eval(subtitle.get('text_color'))
background_color = ast.literal_eval(subtitle.get('background_color'))

gen = GenerateSubtitle(
    font_path, font_size, band_size, background_color, text_color,
)
try:
    # utf-8-sig reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise become part of the first subtitle line.
    with open(script_path, encoding='utf-8-sig') as f:
        all_text = f.read()
    # Paragraphs separated by a blank line become pages; paragraphs that are
    # empty after stripping (extra blank lines, empty file) are dropped.
    paragraphs = [paragraph.strip() for paragraph in all_text.strip().split("\n\n")]
    paragraphs = [paragraph for paragraph in paragraphs if paragraph]
    for i, text in enumerate(paragraphs):
        gen.main(text, f"output/page_{i}.mp4")
finally:
    # Remove the temp directory even when generation fails part-way.
    shutil.rmtree(gen.temp_dir)

/content/temp already exists
Output file created: output/page_0.mp4
