In [1]:
!pip install google-cloud-texttospeech pydub numpy



In [2]:
!pip install transformers pydub google-cloud-texttospeech



In [3]:
# prompt: no module named fitz

!pip install pymupdf openai moviepy




In [4]:
!pip install openai moviepy
!pip install requests



In [5]:
import fitz  # PDF handling
import re
import os
import numpy as np
import IPython.display as ipd
from pydub import AudioSegment
from google.cloud import texttospeech
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import openai
import moviepy.editor as mpy
import requests  # For downloading images


In [6]:
def read_text_file(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        content = handle.read()
    return content

def read_pdf_file(file_path):
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        One string holding each page's ``get_text()`` output, separated by "\\n".
    """
    # Open the document as a context manager so it is closed even if text
    # extraction fails part-way through (the original never closed it).
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)


In [7]:
def preprocess_text(text):
    """Normalize whitespace and split ``text`` into sentences.

    Every run of whitespace (including newlines) is collapsed to one space, then
    the text is split on the spaces that follow ``.``, ``!`` or ``?``.

    Args:
        text: Raw text to clean.

    Returns:
        A list of sentence strings. Empty or whitespace-only input yields an
        empty list rather than ``['']``, so callers never receive a blank
        sentence to synthesize or to divide by.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return []
    return re.split(r'(?<=[.!?]) +', text)


In [8]:
!pip install transformers torch




In [9]:
from transformers import pipeline

# Pre-trained DistilBERT emotion classifier from the Hugging Face Hub,
# configured with top_k=1.
emotion_classifier = pipeline(
    task="text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    top_k=1,
)

def detect_emotion(text):
    """Return the lower-cased top emotion label the classifier predicts for ``text``.

    Args:
        text: The passage to classify. Only its first 512 characters are used.

    Returns:
        The label of the highest-scoring class, lower-cased.
    """
    # The slice caps *characters*, not tokens: 512 characters can still expand to
    # more tokens than the model accepts (e.g. punctuation-heavy text), so the
    # tokenizer is also asked to truncate to the model's maximum length.
    snippet = text[:512]
    result = emotion_classifier(snippet, truncation=True)
    emotion = result[0][0]['label']  # first (only) input -> first (top) prediction
    return emotion.lower()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Device set to use cpu


In [10]:
from google.cloud import texttospeech
from google.oauth2 import service_account

# Location of the Google Cloud service-account key. A value already exported as
# GOOGLE_APPLICATION_CREDENTIALS takes precedence, so the notebook can run in
# another environment without editing this cell; otherwise the Colab upload
# path below is used (and exported for other Google client libraries).
GOOGLE_CREDENTIALS_PATH = os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS", "/content/kukufm-f6e59d5c8710.json"
)
credentials = service_account.Credentials.from_service_account_file(GOOGLE_CREDENTIALS_PATH)
client = texttospeech.TextToSpeechClient(credentials=credentials)


def narrator_tts(text, output_file="speech.mp3"):
    """Synthesize narrator speech for ``text`` and write it to ``output_file`` as MP3.

    The text is sent as plain text (no SSML) to the module-level Text-to-Speech
    ``client`` using the ``en-IN-Chirp3-HD-Zephyr`` voice.

    Args:
        text: Plain text to speak.
        output_file: Destination path for the MP3 bytes.

    Returns:
        ``output_file``, unchanged.
    """
    narrator_voice = texttospeech.VoiceSelectionParams(
        language_code="en-IN",
        name="en-IN-Chirp3-HD-Zephyr",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
    )
    mp3_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

    response = client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text),
        voice=narrator_voice,
        audio_config=mp3_config,
    )

    with open(output_file, "wb") as audio_out:
        audio_out.write(response.audio_content)

    print(f"✅ Narration saved as {output_file}")
    return output_file


def charactor_tts(text, output_file="speech.mp3", emotion="neutral"):
    """Synthesize character speech for ``text`` with emotion-dependent prosody.

    The text is wrapped in an SSML ``<prosody>`` element whose rate and pitch
    depend on ``emotion``; the emotion also selects the output volume gain.
    The MP3 bytes returned by the module-level ``client`` are written to
    ``output_file``.

    Args:
        text: Plain text to speak. It is XML-escaped before being embedded in SSML.
        output_file: Destination path for the MP3 bytes.
        emotion: One of "joy", "sadness", "anger", "fear", "neutral"; any other
            value falls back to the "neutral" settings.

    Returns:
        ``output_file``, unchanged.
    """
    # Local import keeps this cell self-contained.
    from xml.sax.saxutils import escape

    # Per-emotion prosody. "speaking_rate" is carried in the table but is not
    # currently forwarded to AudioConfig; pace is controlled by the SSML rate.
    emotion_settings = {
        "joy": {"rate": "medium", "pitch": "+15%", "volume_gain_db": 2, "speaking_rate": 1.2},
        "sadness": {"rate": "slow", "pitch": "-10%", "volume_gain_db": -2, "speaking_rate": 1},
        # The closing brace used to sit before "speaking_rate", which turned that
        # key into a bogus top-level "emotion" entry; it belongs inside this dict.
        "anger": {"rate": "medium", "pitch": "+20%", "volume_gain_db": 4, "speaking_rate": 1.2},
        "fear": {"rate": "medium", "pitch": "-15%", "volume_gain_db": 0, "speaking_rate": 0.9},
        # "medium" is the SSML keyword for the default rate ("normal" is not one).
        "neutral": {"rate": "medium", "pitch": "0%", "volume_gain_db": 0, "speaking_rate": 1}
    }

    settings = emotion_settings.get(emotion, emotion_settings["neutral"])

    # Escape &, < and > so story text cannot produce malformed SSML.
    ssml_text = f"""
    <speak>
        <prosody rate="{settings['rate']}" pitch="{settings['pitch']}">
            {escape(text)}
        </prosody>
    </speak>
    """

    synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)
    # NOTE(review): this is the same voice name narrator_tts requests as FEMALE,
    # but here it is requested as MALE — confirm the intended character voice.
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-IN",
        name="en-IN-Chirp3-HD-Zephyr",  # Indian English Accent
        ssml_gender=texttospeech.SsmlVoiceGender.MALE
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        volume_gain_db=settings["volume_gain_db"]
    )

    response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)

    with open(output_file, "wb") as out:
        out.write(response.audio_content)

    print(f"✅ Speech saved as {output_file} with emotion {emotion}")
    return output_file

In [11]:
from pydub import AudioSegment

def emotion_to_speech(sentence, emotion, prev_emotion=None, bg_music=None, is_narration=False):
    """Synthesize ``sentence`` and overlay it on emotion-matched background music.

    Note: a later cell defines another ``emotion_to_speech`` that adds a
    ``save_path`` parameter; once that cell runs it replaces this definition.

    Args:
        sentence: Text to speak.
        emotion: Emotion label; selects the prosody (for character speech) and
            the background track. Unknown labels use the neutral track.
        prev_emotion: Emotion used for the previous call, if any.
        bg_music: Background track returned by the previous call, if any. It is
            reused when ``prev_emotion == emotion``.
        is_narration: When true the narrator voice is used instead of the
            character voice.

    Returns:
        ``(final_audio, emotion, bg_music)`` where ``final_audio`` is the mixed
        ``AudioSegment`` and ``bg_music`` is the full background track (not the
        copy trimmed to this sentence) for reuse by the next call.
    """
    # Generate the raw speech; both TTS helpers write to "speech.mp3".
    if is_narration:
        speech_file = narrator_tts(sentence, "speech.mp3")
    else:
        speech_file = charactor_tts(sentence, "speech.mp3", emotion)
    speech_segment = AudioSegment.from_file(speech_file)

    bg_music_files = {
        "joy": "/content/bg_joy.mp3",
        "sadness": "/content/bg_sadness.mp3",
        "anger": "/content/bg_anger.mp3",
        "fear": "/content/bg_fear.mp3",
        "neutral": "/content/bg_neutral.mp3"
    }

    # Keep the caller's track when the emotion is unchanged; otherwise load the
    # track for the new emotion (or fall back to silence if the file is missing).
    if prev_emotion == emotion and bg_music is not None:
        print(f"🎵 Continuing background music for {emotion}")
    else:
        bg_file = bg_music_files.get(emotion, "/content/bg_neutral.mp3")
        if os.path.exists(bg_file):
            bg_music = AudioSegment.from_file(bg_file).set_frame_rate(speech_segment.frame_rate)
            bg_music = bg_music - 15  # Lower background music volume
        else:
            bg_music = AudioSegment.silent(duration=len(speech_segment))

    # A zero-length track cannot be tiled (division by zero below); use silence.
    if len(bg_music) == 0:
        bg_music = AudioSegment.silent(duration=len(speech_segment))

    # Fit a *copy* to the speech length. The untrimmed track is what gets
    # returned; returning the trimmed copy would make the next call loop a
    # fragment as short as this sentence.
    backing = bg_music
    if len(backing) < len(speech_segment):
        backing = backing * (len(speech_segment) // len(backing) + 1)
    backing = backing[:len(speech_segment)]

    final_audio = backing.overlay(speech_segment)

    return final_audio, emotion, bg_music

In [12]:
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
from pathlib import Path
import openai
import os
import uuid

In [13]:
import openai
import requests
from PIL import Image
from io import BytesIO
import os

# SECURITY: never store an API key in the notebook. The key is read from the
# OPENAI_API_KEY environment variable, which must be set before this cell runs.
# Any key that was ever committed in this cell must be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")



def generate_image_dalle(prompt_text, image_save_path):
    """Generate one 1024x1024 DALL·E 3 image for ``prompt_text`` and save it.

    Args:
        prompt_text: Prompt sent to the image model.
        image_save_path: Path the downloaded image is saved to.

    Returns:
        ``image_save_path``, unchanged.

    Raises:
        requests.HTTPError: If downloading the generated image returns an error status.
    """
    response = openai.images.generate(
        model="dall-e-3",
        prompt=prompt_text,
        size="1024x1024",
        quality="standard",
        n=1,
    )

    image_url = response.data[0].url
    # A timeout stops a stalled download from hanging the notebook, and the
    # status check surfaces HTTP errors instead of handing an error page to PIL.
    download = requests.get(image_url, timeout=60)
    download.raise_for_status()
    with Image.open(BytesIO(download.content)) as image:
        image.save(image_save_path)
    return image_save_path



In [14]:
from pydub import AudioSegment
import os

def emotion_to_speech(sentence, emotion, prev_emotion=None, bg_music=None, is_narration=False, save_path="speech.mp3"):
    """Synthesize ``sentence``, mix it over background music and export it as MP3.

    The raw speech is first written to ``save_path`` by the TTS helper, then the
    mixed result overwrites the same file.

    Args:
        sentence: Text to speak.
        emotion: Emotion label; selects the prosody (for character speech) and
            the background track. Unknown labels use the neutral track.
        prev_emotion: Emotion used for the previous call, if any.
        bg_music: Background track returned by the previous call, if any. It is
            reused when ``prev_emotion == emotion``.
        is_narration: When true the narrator voice is used instead of the
            character voice.
        save_path: Path of the MP3 file to write.

    Returns:
        ``(save_path, emotion, bg_music)`` where ``bg_music`` is the full
        background track (not the copy trimmed to this sentence) for reuse by
        the next call.
    """
    if is_narration:
        speech_file = narrator_tts(sentence, save_path)
    else:
        speech_file = charactor_tts(sentence, save_path, emotion)
    speech_segment = AudioSegment.from_file(speech_file)

    bg_music_files = {
        "joy": "/content/bg_joy.mp3",
        "sadness": "/content/bg_sadness.mp3",
        "anger": "/content/bg_anger.mp3",
        "fear": "/content/bg_fear.mp3",
        "neutral": "/content/bg_neutral.mp3"
    }

    # Keep the caller's track when the emotion is unchanged; otherwise load the
    # track for the new emotion, lowered by 15 dB, or fall back to silence.
    if prev_emotion == emotion and bg_music is not None:
        print(f"🎵 Continuing background music for {emotion}")
    else:
        bg_file = bg_music_files.get(emotion, "/content/bg_neutral.mp3")
        if os.path.exists(bg_file):
            bg_music = AudioSegment.from_file(bg_file).set_frame_rate(speech_segment.frame_rate) - 15
        else:
            bg_music = AudioSegment.silent(duration=len(speech_segment))

    # A zero-length track cannot be tiled (division by zero below); use silence.
    if len(bg_music) == 0:
        bg_music = AudioSegment.silent(duration=len(speech_segment))

    # Fit a *copy* to the speech length. The untrimmed track is what gets
    # returned; returning the trimmed copy would make the next call loop a
    # fragment as short as this sentence.
    backing = bg_music
    if len(backing) < len(speech_segment):
        backing = backing * (len(speech_segment) // len(backing) + 1)
    backing = backing[:len(speech_segment)]

    final_audio = backing.overlay(speech_segment)
    final_audio.export(save_path, format="mp3")
    return save_path, emotion, bg_music


In [15]:
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips

def merge_audio_with_images(image_paths, audio_paths, output_video="final_story.mp4"):
    """Build a slideshow video: each image is shown for the length of its audio.

    Args:
        image_paths: Image file paths, one per segment.
        audio_paths: Audio file paths, paired positionally with ``image_paths``.
        output_video: Path of the video file to write (24 fps).

    Returns:
        ``output_video``.

    Raises:
        ValueError: If the two lists differ in length or are empty.
    """
    image_paths = list(image_paths)
    audio_paths = list(audio_paths)
    # zip() would silently drop the unmatched tail and desynchronize the story.
    if len(image_paths) != len(audio_paths):
        raise ValueError(
            f"Got {len(image_paths)} images but {len(audio_paths)} audio files; they must pair up."
        )
    if not image_paths:
        raise ValueError("No image/audio pairs to merge.")

    audio_clips = []
    video_clips = []
    final_video = None
    try:
        for img_path, audio_path in zip(image_paths, audio_paths):
            audio = AudioFileClip(audio_path)
            audio_clips.append(audio)
            img_clip = ImageClip(img_path).set_duration(audio.duration)
            img_clip = img_clip.set_audio(audio)
            video_clips.append(img_clip)

        final_video = concatenate_videoclips(video_clips, method="compose")
        final_video.write_videofile(output_video, fps=24)
    finally:
        # Close the clips whether or not rendering succeeded.
        if final_video is not None:
            final_video.close()
        for audio in audio_clips:
            audio.close()

    return output_video


In [None]:
from pydub import AudioSegment
from IPython.display import Video
import os

file_path = "/content/st4.txt"  # Change this to your file

# STEP 1: Load and preprocess
# Match the extension case-insensitively so names such as "STORY.TXT" or
# "book.PDF" are accepted as well.
file_path_lower = file_path.lower()
if file_path_lower.endswith(".txt"):
    input_text = read_text_file(file_path)
elif file_path_lower.endswith(".pdf"):
    input_text = read_pdf_file(file_path)
else:
    raise ValueError("Unsupported file format. Use .txt or .pdf")

sentences = preprocess_text(input_text)

image_paths = []
audio_paths = []
subtitle_entries = []

prev_emotion = None
bg_music = None
current_time = 0.0  # Start subtitle timing from 0

# STEP 2: Process every 3-sentence group
for idx in range(0, len(sentences), 3):
    # Drop blank sentences so they are neither synthesized nor counted below.
    sentence_group = [s for s in sentences[idx:idx+3] if s.strip()]
    combined_sentence = " ".join(sentence_group).strip()
    if not combined_sentence:
        # Nothing to illustrate or speak; also avoids dividing by zero characters.
        continue

    print(f"📖 Processing Sentence Group {idx//3}: {combined_sentence}")

    # 🎨 Image generation
    image_path = f"/content/image_{idx//3}.png"
    generate_image_dalle(combined_sentence, image_path)
    image_paths.append(image_path)

    # 🔊 Audio generation
    emotion = detect_emotion(combined_sentence)
    audio_path = f"/content/audio_{idx//3}.mp3"
    # A group wrapped in double quotes is treated as character dialogue;
    # everything else is narration.
    speech_result = emotion_to_speech(
        combined_sentence, emotion, prev_emotion, bg_music,
        is_narration=not (combined_sentence.startswith('"') and combined_sentence.endswith('"')),
        save_path=audio_path
    )
    # Keep the returned background track: without this bg_music stays None and
    # the "same emotion -> reuse the music" branch can never run.
    bg_music = speech_result[2]
    audio_paths.append(audio_path)
    prev_emotion = emotion

    # STEP 3: Subtitles (per sentence timing)
    # The group's audio duration is shared between its sentences in proportion
    # to their character counts.
    audio = AudioSegment.from_file(audio_path)
    total_duration = audio.duration_seconds

    total_chars = sum(len(s) for s in sentence_group)
    for sentence in sentence_group:
        proportion = len(sentence) / total_chars
        sentence_duration = proportion * total_duration
        subtitle_entries.append({
            "start": current_time,
            "end": current_time + sentence_duration,
            "text": sentence
        })
        current_time += sentence_duration

# STEP 4: Generate subtitle file (.ass format)
def ass_time(t):
    """Format ``t`` seconds as an ASS timestamp ``H:MM:SS.cc`` (centiseconds)."""
    # Round to whole centiseconds first: truncating the fractional part turns
    # values such as 0.29 into .28 because of floating-point error.
    total_cs = int(round(t * 100))
    total_seconds, cs = divmod(total_cs, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:01}:{m:02}:{s:02}.{cs:02}"


# Each ASS record must sit on ONE physical line; the Format and Style records
# were previously wrapped onto two lines, which cut the style off after StrikeOut.
ASS_HEADER = """[Script Info]
ScriptType: v4.00+
PlayResX: 1280
PlayResY: 720

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,32,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,0,2,10,10,40,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

ass_path = "/content/subtitles.ass"
# Explicit UTF-8 so non-ASCII story text is written the same on every platform.
with open(ass_path, "w", encoding="utf-8") as f:
    f.write(ASS_HEADER)
    for entry in subtitle_entries:
        f.write(f"Dialogue: 0,{ass_time(entry['start'])},{ass_time(entry['end'])},Default,,0,0,0,,{entry['text']}\n")

# STEP 5: Merge images and audios into video (function you already have)
merge_audio_with_images(image_paths, audio_paths, output_video="/content/final_emotion_story.mp4")

# STEP 6: Burn subtitles onto video
!ffmpeg -y -i "/content/final_emotion_story.mp4" -vf "ass={ass_path}" "/content/final_emotion_story_subtitled.mp4"

# STEP 7: Show final video with subtitles
Video("/content/final_emotion_story_subtitled.mp4", embed=True)


In [None]:
from IPython.display import Video

# 🎥 Preview the merged video (the file written before subtitles are burned in)
unsubtitled_video_path = "/content/final_emotion_story.mp4"
Video(unsubtitled_video_path, embed=True)
