<a href="https://colab.research.google.com/github/RashidNoor42/AI-Story-to-Video-Generator-using-Hugging-Face-Models/blob/main/StoryScape_A_Journey_in_Motion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
AI Video Generator - Creates videos from AI-generated stories

INSTALLATION REQUIREMENTS:
pip install torch transformers diffusers torchaudio moviepy pillow numpy gtts huggingface_hub speechbrain

For the pyttsx3 fallback TTS, used only when gTTS is not installed (optional):
- Ubuntu/Debian: sudo apt-get install espeak espeak-data
- macOS: brew install espeak
- Windows: Download from http://espeak.sourceforge.net/

Then: pip install pyttsx3
"""
# Import necessary libraries
from huggingface_hub import login
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from diffusers import StableDiffusionPipeline
from transformers import pipeline
from PIL import Image
import requests
from io import BytesIO
import moviepy.editor as mpy
import torchaudio
from speechbrain.lobes.models.FastSpeech2 import FastSpeech2
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram
from speechbrain.inference.vocoders import HIFIGAN
import moviepy.editor as mpy

import torchaudio
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from diffusers import StableDiffusionPipeline
import torch
import moviepy.editor as mpy
from PIL import Image
import numpy as np
import os

# For TTS, we'll use multiple fallback options. These defaults mean
# "no backend selected"; they are overwritten once a TTS library imports.
TTS_METHOD = None
TTS_AVAILABLE = False

# Step 1: Authenticate with Hugging Face.
# The token is read from the HUGGING_FACE_TOKEN environment variable so that
# no credential has to be written into (or committed with) this file.
_hf_token = os.environ.get("HUGGING_FACE_TOKEN")
if _hf_token:
    login(token=_hf_token)
else:
    # Without a token nothing is sent to the Hub here; any download that
    # needs authentication will report its own error when it is attempted.
    print("HUGGING_FACE_TOKEN is not set; skipping Hugging Face login")

In [None]:


# Pick a text-to-speech backend: gTTS is preferred, pyttsx3 is the fallback.
# The outcome is recorded in TTS_METHOD / TTS_AVAILABLE for later use.
try:
    from gtts import gTTS
except ImportError:
    # gTTS is missing, so try the second choice.
    try:
        import pyttsx3
    except ImportError:
        # Neither library could be imported: narration will be skipped.
        print("Warning: No TTS library available. Install gtts or pyttsx3")
        TTS_AVAILABLE = False
    else:
        TTS_METHOD = "pyttsx3"
        TTS_AVAILABLE = True
        print("Using pyttsx3 for text-to-speech")
else:
    TTS_METHOD = "gtts"
    TTS_AVAILABLE = True
    print("Using gTTS for text-to-speech")

# Load GPT2 for text generation
def load_gpt2():
    """Load the pretrained 'gpt2' tokenizer and language model.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer's pad token is set to
        its end-of-sequence token so padding-related warnings are avoided.
    """
    checkpoint = 'gpt2'

    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    # Reuse EOS as the padding token (the tokenizer is used without a dedicated one).
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint)
    return gpt2_tokenizer, gpt2_model

def generate_text(prompt, tokenizer, model, max_length=200):
    """Continue ``prompt`` with sampled text from ``model``.

    Args:
        prompt: Text the generation starts from; it is encoded with
            truncation at 512 tokens.
        tokenizer: Object providing ``encode``, ``decode`` and
            ``eos_token_id``.
        model: Object providing ``generate``.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string with special
        tokens skipped.
    """
    token_ids = tokenizer.encode(
        prompt, return_tensors="pt", truncation=True, max_length=512
    )

    sampling_options = dict(
        # Every encoded position is real input, so the mask is all ones.
        attention_mask=torch.ones(token_ids.shape, dtype=torch.long),
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Inference only: gradients are not needed.
    with torch.no_grad():
        generated = model.generate(token_ids, **sampling_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Image Generation: Use Stable Diffusion
def load_stable_diffusion():
    """Load the Stable Diffusion v1.5 pipeline, on CUDA when available.

    Returns:
        The loaded pipeline, or ``None`` if loading raised any exception
        (callers then fall back to placeholder images).
    """
    model_id = "runwayml/stable-diffusion-v1-5"
    try:
        pipe = StableDiffusionPipeline.from_pretrained(model_id)
        if torch.cuda.is_available():
            pipe.to("cuda")
    except Exception as load_error:
        print(f"Error loading Stable Diffusion: {load_error}")
        print("Using placeholder images instead")
        return None
    return pipe

def generate_image_from_text(text_prompt, stable_diff_pipe):
    """Produce one image for ``text_prompt``.

    Args:
        text_prompt: Prompt passed to the pipeline.
        stable_diff_pipe: Callable pipeline whose result exposes
            ``.images``, or ``None`` when no pipeline could be loaded.

    Returns:
        The first image returned by the pipeline. Without a pipeline a
        512x512 image in a random solid colour is returned; if the pipeline
        raises, a 512x512 mid-gray image is returned instead.
    """
    placeholder_size = (512, 512)

    if stable_diff_pipe is not None:
        try:
            return stable_diff_pipe(text_prompt).images[0]
        except Exception as generation_error:
            print(f"Error generating image: {generation_error}")
            # Neutral gray marks a frame whose generation failed.
            return Image.new('RGB', placeholder_size, color=(128, 128, 128))

    # No pipeline: draw three colour channels for a random solid placeholder.
    random_rgb = tuple(np.random.randint(0, 255) for _ in range(3))
    return Image.new('RGB', placeholder_size, color=random_rgb)

# TTS: Using multiple fallback approaches
def generate_audio_from_text(text, filename="audio.wav"):
    """Synthesize ``text`` to an audio file with the detected TTS backend.

    The backend is chosen by the module-level ``TTS_METHOD`` flag ("gtts" or
    "pyttsx3"); nothing is attempted when ``TTS_AVAILABLE`` is false.

    Args:
        text: Narration text. For gTTS it is cut to 1000 characters plus
            "..." when longer.
        filename: Path the audio is written to.

    Returns:
        True when ``filename`` exists and is non-empty after synthesis,
        False otherwise (no backend, empty text, or any synthesis error).
        Errors are reported with ``print`` and never raised.
    """
    if not TTS_AVAILABLE:
        print("TTS not available. Creating silent video.")
        return False

    # Nothing to speak: report it up front instead of relying on the backend
    # to fail (or to write an empty file).
    if not text or not text.strip():
        print("Error generating audio: no text to speak")
        print("Continuing without audio...")
        return False

    try:
        if TTS_METHOD == "gtts":
            # Using gTTS (Google Text-to-Speech) - more reliable
            from gtts import gTTS
            # Truncate text if too long for gTTS
            if len(text) > 1000:
                text = text[:1000] + "..."
            # NOTE(review): gTTS emits MP3-encoded data whatever the file
            # extension is - confirm downstream readers sniff the content.
            tts = gTTS(text=text, lang='en', slow=False)
            tts.save(filename)

        elif TTS_METHOD == "pyttsx3":
            # Using pyttsx3 - requires eSpeak installation
            import pyttsx3
            engine = pyttsx3.init()
            # Voice tuning is best-effort: a driver that rejects a property
            # should not prevent synthesis with its defaults.
            try:
                engine.setProperty('rate', 150)    # Speed of speech
                engine.setProperty('volume', 0.9)  # Volume level (0.0 to 1.0)
            except Exception as prop_error:
                print(f"Could not set speech properties: {prop_error}")
            engine.save_to_file(text, filename)
            engine.runAndWait()

        # Verify file was created
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            print(f"Audio saved as {filename}")
            return True

        print("Audio file was not created successfully")
        return False

    except Exception as e:
        # Narration is optional, so any backend failure degrades to "no audio".
        print(f"Error generating audio: {e}")
        print("Continuing without audio...")
        return False

# Create a better story prompt
def create_better_story(tokenizer, model):
    """Generate a short knight story and tidy it into clean sentences.

    The fixed opening prompt is continued via ``generate_text`` (up to 300
    tokens). The result is split on '.', only the first 8 pieces are
    considered, and pieces of 10 characters or fewer (after stripping) are
    discarded.

    Returns:
        The kept pieces joined with '. ' and terminated with a final '.'.
    """
    story_prompt = "Once upon a time, in a mystical kingdom, there lived a brave knight named Sir Arthur. One day, he embarked on a dangerous quest to"
    raw_story = generate_text(story_prompt, tokenizer, model, max_length=300)

    # Keep only the leading fragments that are long enough to be meaningful.
    kept = [
        fragment.strip()
        for fragment in raw_story.split('.')[:8]
        if len(fragment.strip()) > 10
    ]
    return '. '.join(kept) + '.'

# Create dynamic prompts based on the story
def create_dynamic_prompts(story):
    """Build exactly six image prompts from ``story``.

    The story is split on '.'; each of the first six pieces longer than
    5 characters (after stripping) becomes a "Fantasy art style" prompt.
    A shorter piece, or a missing one, is replaced by the stock prompt for
    that position.

    Args:
        story: Story text.

    Returns:
        A list of six prompt strings.
    """
    stock_prompts = (
        "A brave medieval knight in shining armor standing in a castle courtyard",
        "A mystical fantasy kingdom with mountains and forests in the background",
        "A knight on horseback riding through an enchanted forest",
        "A dark cave entrance with mysterious glowing crystals",
        "A knight facing a fierce dragon in an epic battle",
        "A triumphant knight returning to his castle at sunset",
    )
    slot_count = len(stock_prompts)

    result = []
    for position, fragment in enumerate(story.split('.')[:slot_count]):
        cleaned = fragment.strip()
        if len(cleaned) > 5:
            result.append(
                f"Fantasy art style: {cleaned}, medieval setting, cinematic lighting"
            )
        else:
            # Fragment too short to describe a scene: use the stock prompt.
            result.append(stock_prompts[position])

    # Pad with the stock prompts for any positions the story did not reach.
    result.extend(stock_prompts[len(result):])
    return result

# Create video from images and audio
def create_video(images, audio_filename, output_filename="output_video.mp4"):
    """Render a slideshow video from ``images``, optionally with narration.

    Each image is saved to a temporary ``temp_img_<i>.png`` in the current
    working directory and shown for 5 seconds. If ``audio_filename`` names
    an existing non-empty file it is attached as the audio track; audio
    longer than the video (capped at 30 s) is trimmed, and a video longer
    than the audio is trimmed to the audio. The final video never exceeds
    30 seconds. Temporary images are removed and clips are closed whether or
    not rendering succeeds.

    Args:
        images: Sequence of image objects providing ``save(path)``.
        audio_filename: Path of the narration file, or ``None`` for a
            silent video.
        output_filename: Path of the video file to write.

    Returns:
        True if the video was written, False on any error (including an
        empty ``images`` sequence). Errors are printed, not raised.
    """
    if not images:
        print("Error creating video: no images provided")
        return False

    temp_image_files = []
    audio_source = None  # clip that owns the audio reader, kept for closing
    video = None
    try:
        clips = []

        # Save images temporarily and create clips
        for i, img in enumerate(images):
            temp_filename = f"temp_img_{i}.png"
            # Track the name before saving so that a partially written file
            # is still removed if save() fails.
            temp_image_files.append(temp_filename)
            img.save(temp_filename)

            img_clip = mpy.ImageClip(temp_filename)
            clips.append(img_clip.set_duration(5))  # Each image for 5 seconds

        # Concatenate all image clips
        print("Creating video sequence...")
        video = mpy.concatenate_videoclips(clips, method="compose")

        # Add audio if available
        if audio_filename and os.path.exists(audio_filename) and os.path.getsize(audio_filename) > 0:
            try:
                print("Adding audio to video...")
                audio_source = mpy.AudioFileClip(audio_filename)
                audio = audio_source
                # Match video length to audio or vice versa
                video_duration = min(30, video.duration)
                if audio.duration > video_duration:
                    audio = audio.subclip(0, video_duration)
                elif video.duration > audio.duration:
                    video = video.subclip(0, audio.duration)
                video = video.set_audio(audio)
                print("Audio added successfully")
            except Exception as e:
                # Narration is optional: fall back to a silent video.
                print(f"Error adding audio: {e}")
                print("Creating video without audio...")
        else:
            print("No audio file available, creating silent video...")

        # Ensure video doesn't exceed 30 seconds
        if video.duration > 30:
            video = video.subclip(0, 30)

        # Write video file with more error handling
        print(f"Writing video file: {output_filename}")
        video.write_videofile(
            output_filename,
            fps=24,
            verbose=False,
            logger=None,
            audio_codec='aac' if video.audio else None
        )

        print(f"Video created successfully: {output_filename}")
        return True

    except Exception as e:
        print(f"Error creating video: {e}")
        return False

    finally:
        # Release clip resources (the audio clip holds a file reader).
        # Closing is best-effort and must not mask the result above.
        for clip in (video, audio_source):
            if clip is None:
                continue
            try:
                clip.close()
            except Exception as close_error:
                print(f"Warning: could not close clip: {close_error}")

        # Clean up temporary files on both the success and the failure path.
        for temp_file in temp_image_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass  # never written, or already removed
            except OSError as cleanup_error:
                print(f"Warning: could not remove {temp_file}: {cleanup_error}")

# Main execution
def main():
    """Run the whole pipeline: story, prompts, images, narration, video.

    Progress and results are reported with ``print``; nothing is returned.
    """
    divider = "\n" + "="*50 + "\n"

    print("Loading models...")
    tokenizer, model = load_gpt2()

    # Story text drives both the image prompts and the narration.
    print("Generating story...")
    story = create_better_story(tokenizer, model)
    print("Generated Story:")
    print(story)
    print(divider)

    prompts = create_dynamic_prompts(story)
    print("Generated Prompts for Image Generation:")
    for number, prompt in enumerate(prompts, 1):
        print(f"{number}. {prompt}")
    print(divider)

    print("Loading Stable Diffusion...")
    stable_diff_pipe = load_stable_diffusion()

    print("Generating images...")
    images = []
    for number, prompt in enumerate(prompts, 1):
        print(f"Generating image {number}/6...")
        images.append(generate_image_from_text(prompt, stable_diff_pipe))

    print("Generating audio...")
    audio_filename = "story_audio.wav"
    has_audio = generate_audio_from_text(story, audio_filename)

    # Pass no audio path when narration failed so the video is rendered silent.
    print("Creating video...")
    narration_path = audio_filename if has_audio else None
    if create_video(images, narration_path):
        print("Process completed successfully!")
    else:
        print("Process completed with errors. Check the output above.")


if __name__ == "__main__":
    main()

In [None]:
import torch
# Diagnostic cell: report whether PyTorch can use a CUDA device.
print(torch.cuda.is_available())  # Prints True when a CUDA GPU is usable, False otherwise


In [None]:
# pip install speechbrain

# pip install transformers torch

# pip install speechbrain transformers torchaudio moviepy diffusers

# pip install diffusers torch

# pip install torch transformers diffusers torchaudio moviepy pillow numpy


# pip install gtts     # Google TTS (requires internet)

# pip install pyttsx3