In [22]:
import os
from moviepy.editor import VideoFileClip, concatenate_videoclips, TextClip, ImageClip, CompositeVideoClip, AudioFileClip
import pyttsx3 as tts
import whisper

def speak(text, output_path='samp.mp3'):
    """Synthesize ``text`` to an audio file using pyttsx3.

    Args:
        text: The text to convert to speech.
        output_path: Destination audio file. Defaults to ``'samp.mp3'``,
            the file the rest of the notebook reads.

    Returns:
        None. The audio is written to ``output_path`` as a side effect.
    """
    engine = tts.init()
    engine.save_to_file(text, output_path)
    # save_to_file only queues the request; runAndWait processes the queue
    # and blocks until it has been handled.
    engine.runAndWait()
    

def get_files_in_directory(directory):
    """Return the names of the regular files located directly in ``directory``.

    Entries that are not files (e.g. subdirectories) are skipped. The names
    are returned without the directory prefix, in ``os.listdir`` order.
    """
    file_names = []
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if os.path.isfile(candidate):
            file_names.append(entry)
    return file_names


# Read the narration script, closing the file before synthesis starts so the
# handle is not held open while speak() runs.
with open('samp.txt', 'r') as file:
    text = file.read()

speak(text)


In [23]:
def speech_to_text(audio_file, model_name="base"):
    """Transcribe ``audio_file`` with Whisper and return word-level timings.

    Args:
        audio_file: Path of the audio file passed to ``model.transcribe``.
        model_name: Name passed to ``whisper.load_model``. Defaults to
            ``"base"``.

    Returns:
        A list ``[start, end, text]`` of three parallel lists: the start
        time, end time and text of every word, taken from each segment's
        ``'words'`` entries in segment order.
    """
    model = whisper.load_model(model_name)
    data = model.transcribe(audio_file, word_timestamps=True)

    # Flatten the per-segment word lists once, then split into columns.
    words = [word for segment in data['segments'] for word in segment['words']]

    start = [word['start'] for word in words]
    end = [word['end'] for word in words]
    text = [word['word'] for word in words]

    return [start, end, text]

# Transcribe the synthesized narration; the result is [starts, ends, words].
speech = speech_to_text("samp.mp3")
# Bare expression so the notebook displays the transcription result.
speech

[[0.0,
  0.22,
  0.3,
  0.48,
  0.68,
  0.9,
  1.08,
  1.26,
  1.52,
  1.66,
  1.96,
  2.78,
  2.86,
  3.0,
  3.26,
  3.56,
  3.84,
  4.0,
  4.78,
  4.88,
  5.12,
  5.68,
  5.8,
  5.94,
  6.16,
  7.28,
  7.52,
  7.62,
  7.76,
  8.36,
  8.48,
  8.7,
  9.06,
  9.22,
  9.34,
  9.56,
  10.48,
  10.58,
  10.86,
  11.1,
  11.2,
  11.44,
  12.54,
  12.72,
  12.96,
  13.08,
  13.26,
  13.42,
  14.2,
  14.4,
  14.6,
  14.84,
  15.0,
  15.28,
  15.5,
  15.6,
  15.8,
  16.98,
  17.22,
  17.3,
  17.48,
  17.84,
  17.94,
  18.68,
  18.84,
  19.1,
  19.32,
  19.6,
  19.76,
  19.88,
  20.56,
  20.7,
  20.94,
  21.42,
  21.54,
  21.92,
  21.98,
  22.16,
  22.34,
  22.4,
  22.8,
  23.16,
  23.36,
  24.58,
  24.88,
  25.04,
  25.14,
  25.28,
  25.58,
  25.82,
  25.94,
  26.2,
  26.66,
  26.8,
  27.04,
  27.62,
  27.84,
  28.16,
  28.42,
  28.64,
  28.9,
  29.12,
  29.32,
  29.58,
  30.36,
  30.76,
  30.96,
  31.24,
  31.98,
  32.06,
  32.2,
  32.48,
  32.92,
  33.12,
  33.38,
  34.16,
  34.3,
  34.38,
 

In [24]:
# Print every transcribed word next to its (start, end) timestamps.
for word_start, word_end, word in zip(speech[0], speech[1], speech[2]):
    timed_word = ((word_start, word_end), word)
    print(timed_word)

((0.0, 0.22), ' As')
((0.22, 0.3), ' the')
((0.3, 0.48), ' moon')
((0.48, 0.68), ' cast')
((0.68, 0.9), ' an')
((0.9, 1.08), ' eerie')
((1.08, 1.26), ' glow')
((1.26, 1.52), ' over')
((1.52, 1.66), ' the')
((1.66, 1.96), ' desolate')
((1.96, 2.28), ' streets,')
((2.78, 2.86), ' a')
((2.86, 3.0), ' lone')
((3.0, 3.26), ' figure')
((3.26, 3.56), ' emerges')
((3.56, 3.84), ' from')
((3.84, 4.0), ' the')
((4.0, 4.26), ' shadows,')
((4.78, 4.88), ' their')
((4.88, 5.12), ' footsteps')
((5.12, 5.68), ' echoing')
((5.68, 5.8), ' like')
((5.8, 5.94), ' a')
((5.94, 6.16), ' sinister')
((6.16, 6.56), ' heartbeat.')
((7.28, 7.52), ' In')
((7.52, 7.62), ' the')
((7.62, 7.76), ' dim')
((7.76, 7.94), ' light,')
((8.36, 8.48), ' we')
((8.48, 8.7), ' catch')
((8.7, 9.06), ' glimpses')
((9.06, 9.22), ' of')
((9.22, 9.34), ' their')
((9.34, 9.56), ' haunted')
((9.56, 10.02), ' expression,')
((10.48, 10.58), ' a')
((10.58, 10.86), ' reflection')
((10.86, 11.1), ' of')
((11.1, 11.2), ' the')
((11.2, 11.44

In [25]:
from moviepy.video.tools.subtitles import SubtitlesClip

def generate_video(speech_data=None, image_path='Z_1.jpg', audio_path='samp.mp3',
                   output_path='output0.mp4'):
    """Render a still image with narration audio and word-by-word subtitles.

    Args:
        speech_data: ``[starts, ends, words]`` parallel lists, as returned by
            ``speech_to_text``. When omitted, the notebook-level ``speech``
            variable is used.
        image_path: Background image shown for the whole video.
        audio_path: Narration audio file; its duration sets the video length.
        output_path: Destination video file.

    Returns:
        None. The video is written to ``output_path`` at 24 fps with the
        ``libx264`` codec.
    """
    if speech_data is None:
        # Fall back to the notebook global so a bare generate_video() works.
        speech_data = speech

    frame_size = (1920, 1080)

    def generator(txt):
        # NOTE(review): each caption is rendered on a frame-sized canvas, so
        # the ('center', 'bottom') position applied below probably has no
        # visible effect - confirm.
        return TextClip(txt,
                        fontsize=70,
                        color='white',
                        bg_color='none',
                        font='Arial-Bold',
                        method='caption',
                        size=frame_size)

    starts, ends, words = speech_data
    subs = [((start, end), word) for start, end, word in zip(starts, ends, words)]

    audio = AudioFileClip(audio_path)
    try:
        # One background clip spanning the narration, so the video length
        # follows the audio instead of a fixed 0.5 s per script word.
        background = ImageClip(image_path).set_duration(audio.duration).set_position('center')
        final = CompositeVideoClip([background], size=frame_size).set_audio(audio)

        layers = [final]
        if subs:
            # Only add the subtitle layer when there is something to show.
            subtitles = SubtitlesClip(subs, generator)
            layers.append(subtitles.set_position(('center', 'bottom')))

        final_complete = CompositeVideoClip(layers)
        final_complete.write_videofile(output_path, fps=24, codec='libx264')
    finally:
        # Release the audio reader even if rendering fails.
        audio.close()

In [26]:
# Render the narrated, subtitled video; writes output0.mp4.
generate_video()

Moviepy - Building video output0.mp4.
MoviePy - Writing audio in output0TEMP_MPY_wvf_snd.mp3


                                                                      

MoviePy - Done.
Moviepy - Writing video output0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready output0.mp4
