In [10]:
import os
from dotenv import load_dotenv
from moviepy.editor import VideoFileClip, concatenate_videoclips, TextClip, ImageClip, CompositeVideoClip, AudioFileClip
import pyttsx3 as tts
import whisper
import requests
from elevenlabs.client import ElevenLabs
from elevenlabs import generate, save
import google.generativeai as genai
load_dotenv()

client = ElevenLabs(api_key=os.getenv('ELEVENLABS_API_KEY'))
gemini_key = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=gemini_key)

In [11]:
def generate_script(prompt, stories):
    model = genai.GenerativeModel('gemini-pro')
    default_prompt = """from now on return the output as regular text and the story should look
                    like it being narrated by someone, For an Example : There was once a boy fishing in the ocean
                    and he caught a fish, the fish was very big he was so happy, 
                    You have to write maximum 1000 characters I want a plain text no symbols just the script, 
                    also the script content must be in 50 seconds duration, dont include any symbols or special characters,"""
    for i in range(stories):
        story = model.generate_content(f"{default_prompt},{prompt}")
        with open(f'stories/story_{i}.txt', 'w') as f:
            f.write(story.text)

In [12]:
def get_images(text):
    BASE = 'https://api.unsplash.com/photos/'
    PAYLOAD = {
        'query': text,
        'client_id': os.getenv('UNSPLASH_ACCESS_KEY')
    }
    response = requests.get(BASE, params=PAYLOAD)
    data = response.json()
    return data

In [13]:
prompt = "Scary Mystery Puzzle"
stories = 5
generate_script(prompt, stories)

In [14]:
# def speak(text):
#     audio = generate(text,
#                      voice='Rachel',
#                      model = "eleven_multilingual_v2")
#     save(audio, 'samp.mp3')
    
def speak(text, file_name):
    engine = tts.init()
    engine.setProperty('rate', 150)
    engine.save_to_file(text, f'audio/{file_name}.mp3')
    engine.runAndWait()

for i in range(stories):
    with open(f'stories/story_{i}.txt', 'r') as f:
        text = f.readlines()
        speak(text, f'story_{i}')

In [15]:
def speech_to_text(audio_file):
    model = whisper.load_model("base")
    data = model.transcribe(audio_file, word_timestamps=True)

    start = [data['segments'][i]['words'][j]['start'] for i in range(len(data['segments'])) for j in range(len(data['segments'][i]['words']))]
    end = [data['segments'][i]['words'][j]['end'] for i in range(len(data['segments'])) for j in range(len(data['segments'][i]['words']))]
    text = [data['segments'][i]['words'][j]['word'] for i in range(len(data['segments'])) for j in range(len(data['segments'][i]['words']))]

    return [start, end, text]

In [7]:
for l in range(len(speech[0])):
    print(((speech[0][l], speech[1][l]), speech[2][l]))

NameError: name 'speech' is not defined

In [24]:
from moviepy.video.tools.subtitles import SubtitlesClip

def generate_video():
    for i in range(stories):
        with open(f'stories/story_{i}.txt', 'r') as file:
            print(f'Generating video for story {i}')
            text = file.read()
        
        lst = text.split(' ')

        #clips = []

        generator = lambda txt: TextClip(txt, 
                                    fontsize=70, 
                                    color='white', 
                                    bg_color='none', 
                                    font='Arial-Bold',
                                    method = 'caption',
                                    size = (1920, 1080))

        clips = [CompositeVideoClip([ImageClip('Z_1.jpg').set_duration(0.5).set_position('center')], size=(720, 1280)) for _ in lst]
        speech = speech_to_text(f'audio/story_{i}.mp3')
        print(f'Accessing speech for story {i}')
        subs = [((speech[0][k], speech[1][k]), speech[2][k]) for k in range(len(speech[0]))]

        subtitles = SubtitlesClip(subs, generator)

        final = concatenate_videoclips(clips, method="compose")
        audio = AudioFileClip(f"audio/story_{i}.mp3")
        final = final.set_audio(audio)
        final_complete = CompositeVideoClip([final, subtitles.set_position(('center', 'bottom'))])
        
        final_complete.write_videofile(f"output_{i}.mp4", fps=24, bitrate = '1000', audio_codec = 'aac', preset = 'fast', codec = 'h264_nvenc', verbose=False)

In [25]:
generate_video()

Generating video for story 0


In [None]:
import tortoise as t
from tortoise import api_fast

help(t)

Help on package tortoise:

NAME
    tortoise

PACKAGE CONTENTS
    api
    api_fast
    do_tts
    eval
    get_conditioning_latents
    is_this_from_tortoise
    models (package)
    read
    read_fast
    tts_stream
    utils (package)

FILE
    d:\projects\video-generator\myenv\lib\site-packages\tortoise_tts-3.0.0-py3.11.egg\tortoise\__init__.py




In [None]:
tts = api_fast.TextToSpeech(use_deepspeed=True, kv_cache=True, half=True)
pcm_audio = tts.tts_with_preset("your text here", preset='fast')

Some weights of the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.o

ModuleNotFoundError: No module named 'deepspeed'