# Write a Generative AI program that performs the following tasks:
 1. Take a problem statement from the user.
 2. Create a short story / news / writer / research article, etc. (based on the task type mentioned by the user).
 3. Also create a set of images to support step 2.
 4. Create a visual (audio + video) using the outputs of steps 2 and 3.

In [None]:
!pip install torch transformers diffusers accelerate moviepy soundfile
!pip install pywhatkit python-dotenv beautifulsoup4 rich groq requests keyboard wikipedia

Collecting pywhatkit
  Downloading pywhatkit-5.4-py3-none-any.whl.metadata (5.5 kB)
Collecting groq
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Collecting keyboard
  Downloading keyboard-0.13.5-py3-none-any.whl.metadata (4.0 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyautogui (from pywhatkit)
  Downloading PyAutoGUI-0.9.54.tar.gz (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting python3-Xlib (from pyautogui->pywhatkit)
  Downloading python3-xlib-0.15.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.

In [None]:
from bs4 import BeautifulSoup
from rich import print
from groq import Groq
import os
import webbrowser
import subprocess
import requests
import keyboard
import asyncio
import wikipedia
import torch
from diffusers import StableDiffusionPipeline
from transformers import pipeline, BarkModel, AutoProcessor
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips
from scipy.io.wavfile import write as write_wav
import numpy as np

# --- Configuration ---
# The Groq API key is read from the GROQ_API_KEY environment variable so the
# secret does not have to be stored in the notebook itself. The placeholder is
# kept only as a fallback; set the variable (or replace the placeholder) before
# running, otherwise requests made through groq_client will not authenticate.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_API_Key")
groq_client = Groq(api_key=GROQ_API_KEY)

# --- Hugging Face API Key for Image/Audio Models (if needed) ---
# For some models, you might need a Hugging Face API key.
# The public models used in this file are assumed not to require one, and the
# value below is not referenced by the code visible here; it is read from the
# HUGGINGFACE_API_KEY environment variable for the same reason as above.
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY", "your_API_Key")

# Run the models on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Every generated image, audio file and video is written below this directory.
OUTPUT_DIR = "generated_content"
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --- Text Generation Function ---
def contentWrite(query):
    """Send ``query`` to the Groq chat model and return the reply text.

    The stripped reply is printed and then returned. When the reply is
    empty, the string "No content generated." is returned instead. Any
    exception raised along the way is not propagated: it is converted
    into a string of the form "Error: <message>".
    """
    request_options = {
        "model": "llama3-8b-8192",
        "messages": [{"role": "user", "content": query}],
        "temperature": 0.7,
        "max_tokens": 1000,
        "top_p": 0.95,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
    }

    try:
        completion = groq_client.chat.completions.create(**request_options)
        reply = completion.choices[0].message.content.strip()

        # Printing stays inside the try so a failure while printing is also
        # reported through the "Error: ..." return value.
        if reply:
            print(reply)
            return reply

        return "No content generated."

    except Exception as err:
        return f"Error: {err}"


# --- Wikipedia Summary Function ---
def WikiSummary(topic):
    """Look up ``topic`` on Wikipedia and print a two-sentence summary.

    Returns the summary text. If the lookup or the printing of the summary
    raises, the error is printed and the string "Error: <message>" is
    returned instead of raising.
    """
    try:
        wiki_text = wikipedia.summary(topic, sentences=2)
        print(f"[bold magenta]Wikipedia Summary:[/bold magenta] {wiki_text}")
    except Exception as err:
        print(f"[bold red]Wikipedia Error:[/bold red] {err}")
        return f"Error: {err}"
    return wiki_text


# --- New Functions for Image, Audio, and Video Generation ---
def generate_images_from_text(text: str, max_images: int = 4) -> list:
    """Generate one image per leading sentence of ``text``.

    The text is split on '.' and blank fragments are dropped. Each of the
    first ``max_images`` sentences is wrapped in a fixed prompt template,
    rendered with Stable Diffusion, and saved as ``image_<index>.png`` in
    OUTPUT_DIR.

    Args:
        text: Source text whose sentences are used as image prompts.
        max_images: Upper bound on the number of images to generate.
            Defaults to 4 to keep a demo run short; values <= 0 produce
            no images.

    Returns:
        The list of saved image paths, in sentence order. Empty when the
        text contains no sentences or ``max_images`` is not positive.
    """
    sentences = [s.strip() for s in text.split('.') if s.strip()]

    # Clamp at zero so a negative limit cannot turn into a "drop the last N
    # sentences" slice.
    selected = sentences[:max(max_images, 0)]
    if not selected:
        # Nothing to illustrate: return before loading the pipeline weights.
        return []

    # Initialize the Stable Diffusion pipeline (half precision on the GPU,
    # full precision on the CPU).
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16 if device == 'cuda' else torch.float32
    ).to(device)

    image_paths = []
    print("\n--- Generating Images ---")
    for i, sentence in enumerate(selected):
        # Create a detailed prompt for better image quality
        prompt = f"Cinematic still of a scene from a story: {sentence}, highly detailed, professional photography, 8k"

        # Generate the image
        image = pipe(prompt).images[0]

        # Save the image to the output directory
        image_path = os.path.join(OUTPUT_DIR, f"image_{i}.png")
        image.save(image_path)
        image_paths.append(image_path)
        print(f"Saved image {i+1} to {image_path}")

    return image_paths

def generate_audio_from_text(text: str) -> str:
    """Synthesize speech for ``text`` with the Bark model and save it as a WAV.

    The audio is written to ``audio.wav`` inside OUTPUT_DIR and that path is
    returned. If any step raises, the error is printed and None is returned.
    """
    try:
        tts_processor = AutoProcessor.from_pretrained("suno/bark")
        tts_model = BarkModel.from_pretrained("suno/bark")
        if device == 'cuda':
            tts_model = tts_model.to(device)

        encoded = tts_processor(text, voice_preset="v2/en_speaker_6")

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            waveform = tts_model.generate(**encoded, do_sample=True)

        # Drop singleton dimensions so a plain single-channel array is written.
        samples = waveform.cpu().numpy().squeeze()
        sample_rate = tts_model.generation_config.sample_rate
        wav_path = os.path.join(OUTPUT_DIR, "audio.wav")
        write_wav(wav_path, sample_rate, samples)

        print("\n--- Audio Generated ---")
        print(f"Saved audio to {wav_path}")
        return wav_path
    except Exception as err:
        print(f"Audio generation failed: {err}")
        return None

def create_video_from_assets(image_paths: list, audio_path: str) -> str:
    """Combine still images and a narration track into one MP4 video.

    Each image is shown for an equal share of the audio's duration. The
    result is written to ``final_video.mp4`` inside OUTPUT_DIR.

    Args:
        image_paths: Paths of the images to show, in display order.
        audio_path: Path of the audio file used as the soundtrack.

    Returns:
        The path of the written video on success; "" when there are no
        images or no audio path (nothing is attempted); None when video
        creation raised (the error is printed, not propagated).
    """
    if not image_paths or not audio_path:
        print("Video creation skipped due to missing images or audio.")
        return ""

    # Tracked outside the try so the finally clause can release it even when
    # a later step fails.
    audio_clip = None
    try:
        audio_clip = AudioFileClip(audio_path)
        duration_per_image = audio_clip.duration / len(image_paths)

        # Create image clips, each with a duration that matches a segment of the audio
        image_clips = [ImageSequenceClip([path], durations=[duration_per_image]) for path in image_paths]

        # Concatenate image clips into a single video clip
        video_clip = concatenate_videoclips(image_clips)

        # Set the audio of the video clip
        final_video_clip = video_clip.set_audio(audio_clip)

        video_path = os.path.join(OUTPUT_DIR, "final_video.mp4")
        final_video_clip.write_videofile(video_path, fps=24, codec='libx264')

        print("\n--- Video Created ---")
        print(f"Final video saved to {video_path}")
        return video_path
    except Exception as e:
        print(f"Video creation failed: {e}")
        return None
    finally:
        # Close the audio reader so it is not left open after the video has
        # been written (or after a failure).
        if audio_clip is not None:
            audio_clip.close()


# --- Main Program Execution ---
if __name__ == "__main__":
    # Strip stray whitespace so " research " still selects the research path
    # and the request sent to the model is clean.
    prompt = input("Problem statement: ").strip()
    prompt_type = input("Enter a type (story/ news/ writer / research /article etc): ").strip()

    # 1. Generate text content
    content = contentWrite(f"write a {prompt_type} on {prompt}")

    # 2. Add Wikipedia Summary if it's a 'research' type (case-insensitive)
    if prompt_type.lower() == "research":
        WikiSummary(prompt)

    # 3. Check if content was generated before proceeding.
    # contentWrite reports failures as strings ("No content generated." or
    # "Error: ..."), so both must be excluded here; otherwise the error
    # message itself would be turned into images, audio and video.
    generation_failed = (
        not content
        or content == "No content generated."
        or content.startswith("Error:")
    )

    if generation_failed:
        # `print` is rich's markup-aware print, so the message is escaped to
        # keep any square brackets in it from being parsed as markup.
        from rich.markup import escape

        print(f"[bold red]Content generation failed:[/bold red] {escape(str(content))}")
    else:
        # 4. Generate images from the content
        image_files = generate_images_from_text(content)

        # 5. Generate audio from the content
        audio_file = generate_audio_from_text(content)

        # 6. Create the final video
        create_video_from_assets(image_files, audio_file)

Problem statement: bloody mary
Enter a type (story/ news/ writer / research /article etc): story


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

speaker_embeddings_path.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.49G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

en_speaker_6_semantic_prompt.npy:   0%|          | 0.00/2.60k [00:00<?, ?B/s]

en_speaker_6_coarse_prompt.npy:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

en_speaker_6_fine_prompt.npy:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Moviepy - Building video generated_content/final_video.mp4.
MoviePy - Writing audio in final_videoTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video generated_content/final_video.mp4





Moviepy - Done !
Moviepy - video ready generated_content/final_video.mp4
