# Dynamic Style Visualizer Code
The presented code works best on Google Colab. To make it work in a local environment, we may need to set up the NVIDIA CUDA drivers separately and add additional checks to make sure that our code recognizes the NVIDIA GPUs.


First, install all the dependencies by running the cell below. It might ask you to restart the session; if it does, please do so. After restarting the session, skip the package-installation cell and run the remaining code cells to get the desired output.

In [None]:
# Install dependencies. Only TensorFlow is pinned (2.12.0); every other
# package listed here is installed without a version constraint.
!pip install tensorflow==2.12.0 tensorflow-hub gradio diffusers transformers accelerate nltk sentence-transformers

To get the NLP part of the code working, the NLTK data must be set up properly, and we must ensure that all of it downloads correctly: while implementing the NLP part, we frequently ran into missing-package errors that prevented the emotion recognition from working.

In [None]:
# Configure NLTK properly
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
import os
# Keep the NLTK resources in one dedicated directory.
nltk_data_path = '/content/nltk_data'
os.makedirs(nltk_data_path, exist_ok=True)

# Fetch each required resource into that directory, in this order.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, download_dir=nltk_data_path)

# Register the directory on NLTK's data search path.
nltk.data.path.append(nltk_data_path)

# Sentence splitter that is meant to avoid depending on punkt_tab
def custom_sent_tokenize(text):
    """Split ``text`` into sentences.

    A PunktSentenceTokenizer is constructed with no arguments on every
    call (instead of being loaded from punkt_tab) and its ``tokenize``
    result is returned unchanged.
    """
    return PunktSentenceTokenizer().tokenize(text)


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from diffusers import StableDiffusionPipeline
import torch
import numpy as np
from PIL import Image
import gradio as gr
import re
import functools

# Import new NLP components
from sentence_transformers import SentenceTransformer
from transformers import pipeline


In [None]:
# Configure GPU
# Use CUDA when PyTorch reports it as available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # cuDNN benchmark mode is only switched on for the CUDA path.
    torch.backends.cudnn.benchmark = True


In [None]:
# Load models
@functools.lru_cache(maxsize=None)
def load_models():
    """Load every model used by the notebook and return them together.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    models are loaded on the first call and the same tuple is returned on
    later calls.

    Returns:
        tuple: ``(sd_pipe, stylize_fn, style_encoder, emotion_classifier)``
            - sd_pipe: Stable Diffusion pipeline moved to ``device``.
            - stylize_fn: ``serving_default`` signature of the TF-Hub
              arbitrary image stylization module.
            - style_encoder: SentenceTransformer used for text embeddings.
            - emotion_classifier: transformers text-classification pipeline.
    """
    # Stable Diffusion
    # Half precision is only requested when running on CUDA. The notebook
    # falls back to a CPU device, and a float16 pipeline is not usable
    # there, so float32 is used in that case.
    sd_dtype = torch.float16 if device.type == "cuda" else torch.float32
    sd_pipe = StableDiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-2-1",
        torch_dtype=sd_dtype,
        safety_checker=None
    ).to(device)

    # Style Transfer
    hub_module = hub.load('https://tfhub.dev/google/magenta/arbitrary-image-stylization-v1-256/2')
    stylize_fn = hub_module.signatures['serving_default']

    # NLP Models for Enhanced Style Detection
    style_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    # return_all_scores=True is kept deliberately: the mood analysis takes
    # element [0] of the classifier result and then picks the entry with the
    # highest 'score', which relies on this nested per-label output.
    emotion_classifier = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        return_all_scores=True
    )

    return sd_pipe, stylize_fn, style_encoder, emotion_classifier

# Load everything once at cell execution and expose the models as globals.
sd_pipe, stylize_fn, style_encoder, emotion_classifier = load_models()


In [None]:
# Style configuration: every mood carries the URL of its reference painting,
# the keywords that count as a textual match, and a short description that
# is used for semantic comparison.
STYLE_MAPPING = dict(
    dreamy=dict(
        url='https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg/1024px-Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg',
        keywords=['peaceful', 'golden', 'serene'],
        description="Serene, ethereal scenes with soft lighting",
    ),
    dark=dict(
        url='https://upload.wikimedia.org/wikipedia/commons/c/c5/Edvard_Munch%2C_1893%2C_The_Scream%2C_oil%2C_tempera_and_pastel_on_cardboard%2C_91_x_73_cm%2C_National_Gallery_of_Norway.jpg',
        keywords=['dark', 'stormy', 'shadow'],
        description="Dramatic, ominous scenes with shadows",
    ),
    vibrant=dict(
        url='https://upload.wikimedia.org/wikipedia/commons/b/b4/Vassily_Kandinsky%2C_1913_-_Composition_7.jpg',
        keywords=['bright', 'colorful', 'lively'],
        description="Colorful, energetic scenes with vivid details",
    ),
)

# Which style each classifier emotion label selects
EMOTION_PRIORITY = dict(
    neutral='dreamy',
    fear='dark',
    sadness='dark',
    joy='vibrant',
    surprise='dreamy',
    anger='dark',
    disgust='dark',
)


In [None]:
@functools.lru_cache(maxsize=None)
def load_style_image(style_url):
    """Fetch the style image at ``style_url`` as a smoothed 256x256 batch.

    Results are memoised per URL through ``lru_cache``.

    Args:
        style_url: URL of the style image to fetch.

    Returns:
        The decoded 3-channel float32 image with a leading batch axis,
        resized to 256x256 and passed through a 3x3 average pool with
        stride 1 and 'SAME' padding.
    """
    # Local file name: the last 128 characters of the URL's final path part.
    cache_name = os.path.basename(style_url)[-128:]
    local_path = tf.keras.utils.get_file(cache_name, style_url)

    raw_bytes = tf.io.read_file(local_path)
    decoded = tf.io.decode_image(raw_bytes, channels=3, dtype=tf.float32)
    batched = decoded[tf.newaxis, ...]
    resized = tf.image.resize(batched, (256, 256))

    return tf.nn.avg_pool(resized, ksize=[3,3], strides=[1,1], padding='SAME')


In [None]:
def analyze_mood_enhanced(text):
    """Pick the style for ``text`` from keyword, emotion and semantic signals.

    Three signals are computed and blended per style in ``STYLE_MAPPING``:
    0.2 * keyword hit count, 0.4 * emotion confidence (only for the style
    the dominant emotion maps to) and 0.4 * embedding dot product between
    the text and the style description.

    The emotion and semantic steps are best-effort: if either raises, the
    error is printed and that signal contributes nothing to the scores.

    Args:
        text: The scene text to analyse.

    Returns:
        tuple: ``(best_mood, matched_keywords, analysis_details)`` where
        ``best_mood`` is a key of ``STYLE_MAPPING``, ``matched_keywords`` is
        the list of that style's keywords found in the text, and
        ``analysis_details`` is a dict with the keys ``'keyword_match'``,
        ``'emotion_analysis'``, ``'semantic_analysis'`` and
        ``'combined_scores'``.
    """
    # Legacy keyword-based analysis (case-insensitive substring matches)
    text_lower = text.lower()
    keyword_scores = {mood: sum(1 for kw in data['keywords'] if kw in text_lower)
                     for mood, data in STYLE_MAPPING.items()}

    # Emotion-based analysis
    try:
        # Element [0] holds the per-label {'label', 'score'} entries.
        emotion_results = emotion_classifier(text)[0]
        dominant_emotion = max(emotion_results, key=lambda x: x['score'])
        emotion_style = EMOTION_PRIORITY.get(dominant_emotion['label'], 'vibrant')
        emotion_confidence = dominant_emotion['score']
    except Exception as e:
        print(f"Emotion analysis error: {e}")
        emotion_style = 'vibrant'
        emotion_confidence = 0.0

    # Semantic similarity analysis
    # Bound before the try block so the score combination below can still
    # read it (as an empty mapping) when the semantic analysis fails.
    similarities = {}
    try:
        text_embedding = style_encoder.encode(text)
        style_embeddings = {
            style: style_encoder.encode(data['description'])
            for style, data in STYLE_MAPPING.items()
        }
        similarities = {
            style: float(np.dot(text_embedding, style_emb))
            for style, style_emb in style_embeddings.items()
        }
        semantic_style = max(similarities, key=similarities.get)
        semantic_confidence = similarities[semantic_style]
    except Exception as e:
        print(f"Semantic analysis error: {e}")
        similarities = {}  # discard any partial result
        semantic_style = 'vibrant'
        semantic_confidence = 0.0

    # Combine analyses with weights - prioritize emotion and semantic similarity over keywords
    combined_scores = {
        style: (
            (0.2 * keyword_scores.get(style, 0)) +
            (0.4 * (1.0 if style == emotion_style else 0.0) * emotion_confidence) +
            (0.4 * similarities.get(style, 0.0))
        )
        for style in STYLE_MAPPING
    }

    best_mood = max(combined_scores, key=combined_scores.get)

    # Extract matched keywords for debugging
    matched_keywords = [kw for kw in STYLE_MAPPING[best_mood]['keywords'] if kw in text_lower]

    # Return additional analysis details for debugging
    analysis_details = {
        'keyword_match': keyword_scores,
        'emotion_analysis': {'style': emotion_style, 'confidence': emotion_confidence},
        'semantic_analysis': {'style': semantic_style, 'confidence': semantic_confidence},
        'combined_scores': combined_scores
    }

    return best_mood, matched_keywords, analysis_details


In [None]:
def generate_content_image(prompt):
    """Render the base image for ``prompt`` with the Stable Diffusion pipeline.

    The prompt is extended with fixed detail/lighting hints before being
    passed to ``sd_pipe``. If anything in the generation raises, the error
    is printed and a plain gray 512x512 RGB image is returned instead.

    Args:
        prompt: Text describing the scene to draw.

    Returns:
        The first image of the pipeline result, or the gray placeholder.
    """
    # Fixed sampling settings for every generated image.
    sd_options = {
        "guidance_scale": 7.5,
        "height": 512,
        "width": 512,
        "num_inference_steps": 50,
    }

    try:
        full_prompt = f"{prompt}, highly detailed, realistic, cinematic lighting"
        with torch.autocast(device.type):
            generation = sd_pipe(full_prompt, **sd_options)
        return generation.images[0]
    except Exception as e:
        print(f"Image generation failed: {e}")
        return Image.new('RGB', (512, 512), color='gray')


In [None]:
# ========== IMAGE PROCESSING FUNCTIONS ==========
def process_scene(scene_text, style_image, mood, keywords, analysis_details=None):
    """Generate a scene image, stylize it, and return both with debug details.

    Args:
        scene_text: Text of the scene; used as the image generation prompt.
        style_image: Style tensor passed to the stylization function as
            ``placeholder_1``.
        mood: Name of the style that was chosen; reported back unchanged.
        keywords: Keywords that matched for the style; reported back unchanged.
        analysis_details: Optional analysis dict; reported back unchanged
            (``None`` when not supplied).

    Returns:
        dict: with the keys ``"source_image"`` (generated PIL image),
        ``"styled_image"`` (stylized PIL image), ``"style_applied"``,
        ``"keywords_used"``, ``"scene_text"`` (at most the first 50
        characters, followed by ``"..."`` only when the text was cut) and
        ``"analysis_details"``.
    """
    # Generate content image
    content_image = generate_content_image(scene_text)

    # Convert content image to a batched TensorFlow tensor scaled by 1/255
    # and resized to 256x256
    content_tensor = tf.image.resize(
        tf.keras.preprocessing.image.img_to_array(content_image)[tf.newaxis, ...] / 255.0,
        (256, 256)
    )

    # Apply Neural Style Transfer
    outputs = stylize_fn(
        placeholder=content_tensor,
        placeholder_1=style_image
    )

    # Convert styled output to PIL Image (clip to [0, 1], then scale to uint8)
    styled_array = (np.clip(outputs['output_0'].numpy()[0], 0, 1) * 255).astype(np.uint8)
    styled_image = Image.fromarray(styled_array)

    # Append the ellipsis only when the text really was shortened, so short
    # scenes are not shown as if they had been truncated.
    if len(scene_text) > 50:
        scene_label = scene_text[:50] + "..."
    else:
        scene_label = scene_text

    # Return all debugging details
    return {
        "source_image": content_image,
        "styled_image": styled_image,
        "style_applied": mood,
        "keywords_used": keywords,
        "scene_text": scene_label,  # Shortened scene text for clarity
        "analysis_details": analysis_details  # Include the full analysis details
    }


In [None]:
def process_story(story_text):
    """Turn a story into one processed scene per usable sentence.

    The story is split with ``custom_sent_tokenize``. Sentences whose
    stripped length is below 5 are ignored; every other sentence is analysed
    for mood, paired with that mood's style image and run through
    ``process_scene``.

    Args:
        story_text: The full story text.

    Returns:
        list: The ``process_scene`` result for each kept sentence, in order.
    """
    results = []

    for sentence in custom_sent_tokenize(story_text):
        if len(sentence.strip()) >= 5:
            # Mood decides which style image is applied to this sentence.
            mood, keywords, analysis_details = analyze_mood_enhanced(sentence)
            style_image = load_style_image(STYLE_MAPPING[mood]['url'])

            results.append(
                process_scene(sentence, style_image, mood, keywords, analysis_details)
            )

    return results


In [None]:
# Gradio Interface
# Layout: story input + button, two galleries (source / stylized images),
# a text box with the per-scene analysis, and a status line.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# 📖 Dynamic Story Visualizer with Enhanced NLP")

    with gr.Row():
        story_input = gr.Textbox(label="Your Story", placeholder="Once upon a time...", lines=5)
        generate_btn = gr.Button("Generate Visual Story 🎨", variant="primary")

    with gr.Row():
        source_gallery = gr.Gallery(label="Source Images", columns=3, object_fit="contain")
        styled_gallery = gr.Gallery(label="Stylized Images", columns=3, object_fit="contain")

    with gr.Row():
        style_info = gr.Textbox(label="Style Analysis Details", lines=10)

    status = gr.Textbox(label="Processing Status", visible=True)

    def wrapper_fn(story_text):
        """Run the story pipeline and yield UI updates.

        Each yielded list is
        ``[source gallery items, styled gallery items, analysis text, status]``,
        matching the ``outputs`` wired to the button below. A first update
        clears the galleries; a second carries the results. On failure an
        error status is yielded and the exception is re-raised.
        """
        try:
            yield [[], [], "", "Starting processing..."]  # Empty galleries initially

            scenes_details = process_story(story_text)

            source_images = []
            styled_images = []
            style_details = []

            for detail in scenes_details:
                source_images.append((detail["source_image"], f"Source: {detail['scene_text']}"))
                styled_images.append((detail["styled_image"], f"Styled: {detail['scene_text']}"))

                # Enhanced style details with NLP analysis.
                # `or {}` also covers the key being present with a None
                # value, which a plain .get() default would not replace.
                analysis = detail.get('analysis_details') or {}
                style_details.append(
                    f"Scene: {detail['scene_text']}\n"
                    f"Style Applied: {detail['style_applied']}\n"
                    f"Keywords Used: {', '.join(detail['keywords_used'])}\n"
                    f"Emotion Analysis: {analysis.get('emotion_analysis', {})}\n"
                    f"Semantic Score: {analysis.get('semantic_analysis', {})}\n"
                    f"---"
                )

            yield [source_images, styled_images, "\n\n".join(style_details), "Processing complete!"]

        except Exception as e:
            # Show the failure in the status box, then let the error propagate.
            yield [[], [], "", f"❌ Error: {str(e)}"]
            raise

    generate_btn.click(
        fn=wrapper_fn,
        inputs=story_input,
        outputs=[source_gallery, styled_gallery, style_info, status]
    )

# NOTE(review): server_name="0.0.0.0" together with share=True presumably
# exposes the app beyond the local machine - confirm that is intended
# outside of Colab.
app.launch(server_name="0.0.0.0", share=True, debug=True)
