In [None]:
!pip install -q google-generativeai
!pip install -q git+https://github.com/huggingface/parler-tts.git
!pip install -q soundfile transformers accelerate

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m130.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0

In [None]:
# Attach Google Drive to the Colab filesystem; the generated audio is written
# underneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# `os` is imported here because this cell runs before the cell that contains the
# main import block; without it a fresh-kernel "Run All" fails with NameError.
import os

# Folder on the mounted Drive that receives the narrated story audio.
OUTPUT_DIR = "/content/drive/MyDrive/MagicNarrate_Audio"
# exist_ok=True keeps the cell idempotent when it is re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
import os
import google.generativeai as genai

# ---- Gemini API Key ----
# Secrets must not be stored in the notebook. The key is read from the
# GEMINI_API_KEY environment variable when it is already set; otherwise the user
# is prompted for it (the typed value is not echoed and not saved in the file).
# NOTE(review): any key that was ever committed in this cell should be treated
# as leaked and revoked in the Google AI Studio / Cloud console.
import getpass

if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# ---- Load Gemini model ----
llm = genai.GenerativeModel("models/gemini-2.5-flash")

def generate_story(emotion, genre, sentence):
    """Ask the Gemini model for a short children's story.

    Args:
        emotion: Emotion the story should convey; interpolated into the prompt.
        genre: Story genre; interpolated into the prompt.
        sentence: Starting idea, placed in quotes inside the prompt.

    Returns:
        The ``text`` attribute of the object returned by
        ``llm.generate_content``.
    """
    # The prompt body is deliberately flush-left: any indentation inside the
    # triple-quoted string would become part of the text sent to the model.
    story_prompt = f"""
You are a creative children's storyteller.

Emotion: {emotion}
Genre: {genre}

Task:
Create a short story (80–100 words) suitable for children.
The story must strongly reflect the given emotion through:
- word choice
- sentence rhythm
- atmosphere
- character reactions

IMPORTANT RULES:
- Do NOT use sound effects or onomatopoeia
- Avoid exaggerated punctuation (!!!, ???)
- Use calm, natural narrative sentences suitable for text-to-speech systems

Starting idea:
"{sentence}"

End the story on an emotionally meaningful note.
"""
    reply = llm.generate_content(story_prompt)
    return reply.text


import torch
import soundfile as sf
import re
import numpy as np

from transformers import AutoTokenizer, set_seed
from parler_tts import ParlerTTSForConditionalGeneration
from IPython.display import Audio

# ---- Device ----
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# ---- Load TTS model ----
# The model and its tokenizer are both loaded from the same checkpoint name.
model_name = "parler-tts/parler-tts-mini-expresso"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
tts_model = tts_model.to(device)
tts_tokenizer = AutoTokenizer.from_pretrained(model_name)


def split_story(text, max_sentences=2):
    """Group the sentences of ``text`` into space-joined chunks.

    The text is stripped, split on whitespace that follows ``.``, ``!`` or
    ``?``, and the resulting sentences are joined ``max_sentences`` at a time.
    Empty chunks are dropped, so an empty input yields an empty list.

    Args:
        text: Story text to split.
        max_sentences: Number of sentences per chunk (the last chunk may be
            shorter).

    Returns:
        List of chunk strings in their original order.
    """
    sentence_list = re.split(r'(?<=[.!?])\s+', text.strip())
    grouped = (
        " ".join(sentence_list[start:start + max_sentences])
        for start in range(0, len(sentence_list), max_sentences)
    )
    return [piece for piece in grouped if piece]



def speak_story(story_text, emotion, speaker="Lea", filename="Story.wav",
                pause_seconds=0.4, seed=42):
    """Narrate a story with the TTS model and save it as a WAV file.

    The story is split into two-sentence chunks with ``split_story``; each
    chunk is synthesised separately, the pieces are joined with a short
    silence between them, and the result is written into ``OUTPUT_DIR``.

    Args:
        story_text: Text to narrate.
        emotion: Tone word inserted into the voice description.
        speaker: Speaker name inserted into the voice description.
        filename: Name of the WAV file created inside ``OUTPUT_DIR``.
        pause_seconds: Length of the silence inserted between chunks.
        seed: Value passed to ``set_seed`` before generation starts.

    Returns:
        An ``IPython.display.Audio`` object for the written file.

    Raises:
        ValueError: If ``story_text`` yields no chunks to narrate.
    """
    story_chunks = split_story(story_text, max_sentences=2)
    if not story_chunks:
        # Fail early with a clear message; otherwise np.concatenate below
        # raises a much less helpful ValueError on the empty list.
        raise ValueError("story_text contains no sentences to narrate")

    # Single-line description: a triple-quoted block would carry the source
    # indentation and newlines into the text handed to the tokenizer.
    description = (
        f"{speaker} speaks slowly in a {emotion} tone "
        "with emphasis and high quality audio."
    )

    input_ids = tts_tokenizer(
        description,
        return_tensors="pt",
        truncation=True
    ).input_ids.to(device)

    sampling_rate = tts_model.config.sampling_rate
    # float32 silence so concatenation does not upcast the waveform to float64.
    pause = np.zeros(int(pause_seconds * sampling_rate), dtype=np.float32)

    set_seed(seed)

    all_audio = []
    for idx, chunk in enumerate(story_chunks):
        print(f"🔊 Generating audio for part {idx+1}/{len(story_chunks)}")

        # Silence goes only *between* chunks, so nothing trails the last one.
        if idx > 0:
            all_audio.append(pause)

        prompt_input_ids = tts_tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True
        ).input_ids.to(device)

        with torch.no_grad():
            audio = tts_model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_input_ids
            )

        # .float() makes the tensor float32 before the numpy conversion;
        # squeeze drops singleton dimensions (presumably the batch axis — TODO confirm).
        all_audio.append(audio.cpu().float().numpy().squeeze())

    # Concatenate all audio
    final_audio = np.concatenate(all_audio)

    # ---- SAVE TO GOOGLE DRIVE ----
    output_path = os.path.join(OUTPUT_DIR, filename)
    sf.write(output_path, final_audio, sampling_rate)

    print(f"✅ Audio saved to Google Drive: {output_path}")

    return Audio(output_path)



# ---- User Inputs ----
# Edit these three values to change the story that gets generated.
emotion = "happy"
genre = "adventure"
sentence = "a dog running towards a river"

# ---- Generate story ----
story = generate_story(emotion=emotion, genre=genre, sentence=sentence)

# Header, blank line, then the story text.
print("=== GENERATED STORY ===\n", story, sep="\n")

# ---- Convert story to speech ----
audio_output = speak_story(story_text=story, emotion=emotion, speaker="Lea")
# A bare last expression makes the notebook render the audio player.
audio_output

Using device: cuda


  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length"

=== GENERATED STORY ===

Waffles, a bouncy brown dog, wiggled with pure joy. His happy tail wagged like a cheerful flag as he zoomed across the green meadow. Sunlight sparkled on his fur, lighting his way as he raced towards the shimmering river ahead. He loved the feel of the soft grass beneath his paws. When he reached the water's edge, crystal clear and inviting, Waffles plunged right in! He paddled with gleeful energy, sending delightful little splashes high into the air. The cool water felt wonderfully refreshing. He knew this river adventure was a cherished moment, filling his little doggy heart with perfect happiness.
🔊 Generating audio for part 1/4
🔊 Generating audio for part 2/4
🔊 Generating audio for part 3/4
🔊 Generating audio for part 4/4
✅ Audio saved to Google Drive: /content/drive/MyDrive/MagicNarrate_Audio/Story.wav
