Here, I built a MobileNetV2 style transfer pipeline from scratch, implementing the multi-resolution training schedule, the optimisation steps, and custom loss functions. For stability, pretrained ImageNet weights were used to initialise the backbone feature extractor; apart from that, I designed and built the pipeline on my own. To test MobileNet's effectiveness and guide its inclusion in the finished integrated 3-in-1 prototype, I also applied this model to a dataset to produce a portfolio of stylised outcomes.

In [None]:
# Install gdown so the dataset folder (COCO for content + WikiArt for style images) can be fetched from Google Drive
!pip install -q gdown

# Link to the shared Google Drive folder that holds the dataset
url = "https://drive.google.com/drive/folders/1xJy4FXcBIHKnjO5t_m8PJsevBJQ2_6G-?usp=sharing"

# Download the whole shared folder into a local directory named "dataset"
!gdown --folder "$url" -O dataset

# Print the downloaded directory tree so the download can be checked by eye
import os
for root, dirs, files in os.walk("dataset"):
    # Folder depth = number of path separators left once "dataset" is removed from the path
    level = root.replace("dataset", "").count(os.sep)
    # Two spaces of indentation per depth level
    indent = " " * 2 * level
    # Print the current folder name with a trailing slash
    print(f"{indent}{os.path.basename(root)}/")
    # Files are indented one level deeper than the folder that holds them
    subindent = " " * 2 * (level + 1)
    # Print every file that sits directly inside the current folder
    for f in files:
        print(f"{subindent}{f}")


In [None]:
# Confirm and check the contents of the downloaded folder
import os

# Absolute path to the dataset folder
folder_path = "/content/dataset"


# Walk through the folder and its subfolders (verify contents once again)
for root, dirs, files in os.walk(folder_path):
    # Folder depth = number of path separators left once folder_path is removed from the path
    level = root.replace(folder_path, "").count(os.sep)
    # Two spaces of indentation per depth level
    indent = " " * 2 * level
    # Print the current folder name with a trailing slash
    print(f"{indent}{os.path.basename(root)}/")
    # Files are indented one level deeper than the folder that holds them
    subindent = " " * 2 * (level + 1)
    # Print every file that sits directly inside the current folder
    for f in files:
        print(f"{subindent}{f}")


In [None]:
# Import core libraries for deep learning and plotting
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os, random, time
from tensorflow.keras.applications import mobilenet_v2
from tensorflow.keras.models import Model

# Load and resize an image for display (hi-res, keeps aspect ratio)
def load_img_display(path, target_max_side=768):
    """Load an image for display, scaled so its longest side is target_max_side.

    The file is decoded to 3 channels (animations are not expanded), converted
    to float32, and resized with antialiased bicubic interpolation while
    keeping the aspect ratio.

    Args:
        path: Path of the image file to read.
        target_max_side: Desired length, in pixels, of the longer image side.

    Returns:
        A float32 tensor of shape (new_h, new_w, 3).
    """
    encoded = tf.io.read_file(path)
    decoded = tf.image.decode_image(encoded, channels=3, expand_animations=False)
    pixels = tf.image.convert_image_dtype(decoded, tf.float32)

    # (height, width) as floats so the scale factor can be applied to both at once
    hw = tf.cast(tf.shape(pixels)[:2], tf.float32)
    scale = target_max_side / tf.maximum(hw[0], hw[1])
    new_hw = tf.cast(tf.round(hw * scale), tf.int32)

    return tf.image.resize(pixels, new_hw, method='bicubic', antialias=True)

# Display an image using matplotlib
def show_display(img, title=None, figsize=(7,7), dpi=150):
    """Show an image with matplotlib, axes hidden.

    Args:
        img: Image tensor, either rank 3 or rank 4 with a leading batch axis
            of size 1; values are clipped to [0, 1] before plotting.
        title: Optional figure title.
        figsize: Matplotlib figure size in inches.
        dpi: Matplotlib figure resolution.
    """
    # A rank-4 input carries a batch axis that imshow cannot handle
    frame = tf.squeeze(img, 0) if len(img.shape) == 4 else img

    plt.figure(figsize=figsize, dpi=dpi)
    plt.imshow(tf.clip_by_value(frame, 0, 1))
    if title:
        plt.title(title)
    plt.axis('off')
    plt.show()

# Display image after preprocessing reversal (from [-1,1] to [0,1])
def show_preprocessed(img, title=None, upscale_to=768):
    """Display an image stored in the [-1, 1] range.

    Args:
        img: Image tensor in [-1, 1], optionally with a leading batch axis.
        title: Optional figure title.
        upscale_to: If truthy, the image is resized to a square of this side
            length (bicubic, antialiased) before being shown.
    """
    frame = tf.squeeze(img, 0) if len(img.shape) == 4 else img

    # Map [-1, 1] back to [0, 1] for display
    vis = tf.clip_by_value((frame + 1.0) / 2.0, 0, 1)

    if upscale_to:
        square = (upscale_to, upscale_to)
        vis = tf.image.resize(vis, square, method='bicubic', antialias=True)

    show_display(vis, title)

# Build MobileNetV2 feature extractor given layer names
def mbnet_layers(layer_names, img_size):
    """Build a frozen MobileNetV2 that outputs the requested layer activations.

    Args:
        layer_names: Names of MobileNetV2 layers whose outputs are wanted.
        img_size: Side length of the square RGB input.

    Returns:
        A Keras Model mapping the backbone input to a list of activations,
        in the same order as layer_names.
    """
    backbone = mobilenet_v2.MobileNetV2(
        input_shape=(img_size, img_size, 3),
        include_top=False,
        weights='imagenet',
    )
    backbone.trainable = False

    taps = []
    for layer_name in layer_names:
        taps.append(backbone.get_layer(layer_name).output)
    return Model([backbone.input], taps)

# Compute Gram matrix (style representation) for feature map
def gram_matrix(x):
    """Return the Gram matrix of a (batch, h, w, c) feature map.

    The spatial axes are flattened, the channel-by-channel inner products are
    taken, and the result is divided by the number of spatial positions (h*w).

    Returns:
        A tensor of shape (batch, c, c).
    """
    dims = tf.shape(x)
    batch, height, width, channels = dims[0], dims[1], dims[2], dims[3]
    num_positions = height * width

    flat = tf.reshape(x, [batch, num_positions, channels])
    gram = tf.matmul(flat, flat, transpose_a=True)
    return gram / tf.cast(num_positions, tf.float32)

# Model wrapper that returns both style and content features
class StyleContentModel(tf.keras.models.Model):
    """Frozen MobileNetV2 wrapper returning style Gram matrices and content features."""

    def __init__(self, style_layers, content_layers, img_size):
        """Build the encoder over the style layers followed by the content layers."""
        super().__init__()
        self.num_style = len(style_layers)
        self.style_layers = style_layers
        self.content_layers = content_layers
        # The encoder outputs are ordered: style layers first, then content layers
        self.encoder = mbnet_layers(style_layers + content_layers, img_size)
        self.encoder.trainable = False

    def call(self, x):
        """Return {'style': {layer: gram}, 'content': {layer: activation}} for x."""
        activations = self.encoder(x)
        split = self.num_style

        # Leading outputs belong to the style layers and become Gram matrices
        grams = [gram_matrix(act) for act in activations[:split]]
        # Trailing outputs are the raw content activations
        content_acts = activations[split:]

        return {
            'style': dict(zip(self.style_layers, grams)),
            'content': dict(zip(self.content_layers, content_acts)),
        }

# Total variation loss encourages smoothness in output
def total_variation_loss(x):
    """Return tf.image.total_variation of x (a smoothness penalty)."""
    variation = tf.image.total_variation(x)
    return variation

# Laplacian operator for edge preservation (sharpness)
def laplacian(img_minus1_to1):
    """Apply a 3x3 Laplacian filter to a batched RGB image given in [-1, 1].

    The image is first mapped to [0, 1]. The 3x3 kernel is replicated over the
    three input channels of a (3, 3, 3, 1) filter, so the convolution yields a
    single output channel that sums the per-channel Laplacian responses.
    """
    unit_range = (img_minus1_to1 + 1.0) / 2.0

    stencil = tf.constant([[0.,-1.,0.],[-1.,4.,-1.],[0.,-1.,0.]], tf.float32)
    # (3, 3) -> (3, 3, 1, 1) -> (3, 3, 3, 1): same stencil for every input channel
    filt = tf.tile(stencil[:, :, tf.newaxis, tf.newaxis], [1, 1, 3, 1])

    return tf.nn.conv2d(unit_range, filt, strides=1, padding='SAME')

# Perform one optimization stage of style transfer
def run_stage(content_path, style_path, img_size,
              style_layers, style_wts, content_layers,
              steps, lr_start, lr_end,
              style_weight=3.0, content_weight=1e2, tv_weight=5e-7, edge_weight=5e-3,
              init_from=None, preview_every=100, title_prefix=""):
    """Run one resolution stage of optimisation-based style transfer.

    Both images are loaded at img_size x img_size, target style Gram matrices
    and content features are extracted with a freshly built StyleContentModel,
    and a pixel variable is optimised with Adam under a cosine-decayed
    learning rate.

    Args:
        content_path: Path of the content image file.
        style_path: Path of the style image file.
        img_size: Side length of the square working resolution.
        style_layers: MobileNetV2 layer names used for the style loss.
        style_wts: Dict mapping each style layer name to its relative weight.
        content_layers: MobileNetV2 layer names used for the content loss.
        steps: Number of optimisation steps (also the cosine decay length).
        lr_start: Initial learning rate.
        lr_end: Final learning rate; passed to CosineDecay as lr_end/lr_start.
        style_weight: Multiplier of the style loss.
        content_weight: Multiplier of the content loss.
        tv_weight: Multiplier of the total variation loss.
        edge_weight: Multiplier of the Laplacian edge loss.
        init_from: Optional image from a previous stage; it is resized to
            img_size and used as the starting point. When None, a 60/40
            content/style blend is used instead.
        preview_every: A preview is shown whenever (step+1) is a multiple of
            this value.
        title_prefix: Text placed before the step number in preview titles.

    Returns:
        The optimised tf.Variable (batch of one image), clipped to [-1, 1]
        after every step.
    """

    # Load an image file and convert it to a batched MobileNet input
    def load_for_model(path):
        img = tf.io.read_file(path)
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.convert_image_dtype(img, tf.float32)    # [0,1]
        img = tf.image.resize(img, (img_size, img_size), antialias=True)
        # Preprocess to MobileNet input format ([-1,1])
        img = mobilenet_v2.preprocess_input(img * 255.0)
        return img[tf.newaxis, ...]

    # Load content and style images
    content = load_for_model(content_path)
    style   = load_for_model(style_path)

    # Extract target style and content features (a new extractor is built for every stage)
    extractor = StyleContentModel(style_layers, content_layers, img_size)
    style_tgt   = extractor(style)['style']
    content_tgt = extractor(content)['content']

    # Initialize optimization variable (either blended or from previous stage)
    if init_from is None:
        # Blend style and content in [0,1] as starting point (60% content, 40% style)
        style_for_blend   = tf.image.resize((style+1)/2, (img_size, img_size))
        content_for_blend = (content+1)/2
        alpha = 0.4
        init = tf.clip_by_value((1-alpha)*content_for_blend + alpha*style_for_blend, 0, 1)
        # Back to the [-1,1] MobileNet range
        init = mobilenet_v2.preprocess_input(init * 255.0)
        image = tf.Variable(init)
    else:
        # Resize previous output as initialization
        # NOTE(review): bicubic resizing may overshoot [-1,1]; the variable is only clipped after the first step — confirm this is acceptable
        init = tf.image.resize(init_from, (img_size, img_size), method='bicubic', antialias=True)
        image = tf.Variable(init)

    # Define cosine decay learning rate schedule (alpha is the final LR as a fraction of the initial LR)
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=lr_start, decay_steps=steps, alpha=lr_end/lr_start
    )
    opt = tf.optimizers.Adam(learning_rate=lr_schedule)

    # Sum of the per-layer style weights, used to normalise the style loss
    denom = sum(style_wts.values())

    # Define single optimization step
    @tf.function
    def train_step(img):
        with tf.GradientTape() as tape:
            outs = extractor(img)
            s, c = outs['style'], outs['content']
            # Style loss: weighted Gram matrix differences
            s_loss = tf.add_n([style_wts[k]*tf.reduce_mean((s[k]-style_tgt[k])**2) for k in s]) * (style_weight/denom)
            # Content loss: MSE between features, averaged over the content layers
            c_loss = tf.add_n([tf.reduce_mean((c[k]-content_tgt[k])**2) for k in c]) * (content_weight/len(content_layers))
            # Total variation loss
            tv = tf.reduce_mean(total_variation_loss(img)) * tv_weight
            # Edge-preserving loss: match the Laplacian response of the content image
            edge = tf.reduce_mean((laplacian(img) - laplacian(content))**2) * edge_weight
            # Combine all losses
            loss = s_loss + c_loss + tv + edge
        # Compute gradients with respect to the image pixels
        grad = tape.gradient(loss, img)
        # Clip gradients for stability
        grad = tf.clip_by_norm(grad, 10.0)
        # Apply gradient update
        opt.apply_gradients([(grad, img)])
        # Keep image within valid range
        img.assign(tf.clip_by_value(img, -1, 1))
        return loss

    # Run iterative optimization loop
    for step in range(steps):
        loss = train_step(image)
        # Periodically preview progress (preview_every=0 would raise ZeroDivisionError here)
        if (step+1) % preview_every == 0:
            show_preprocessed(image, title=f"{title_prefix} step {step+1}", upscale_to=min(768, img_size))
    return image

# Define MobileNetV2 style layers and their relative weights
STYLE_LAYERS = [
    'block_1_expand_relu',
    'block_3_expand_relu',
    'block_6_expand_relu',
    'block_10_expand_relu',
    'block_13_expand_relu',
]
# Relative style weights, paired positionally with STYLE_LAYERS
STYLE_LAYER_WEIGHTS = dict(zip(STYLE_LAYERS, (1.0, 0.8, 0.6, 0.5, 0.4)))

# Layers whose activations are matched for structural (content) preservation
CONTENT_LAYERS = ['block_13_expand_relu', 'block_16_project']

# Global loss multipliers; the content weight grows from stage 1 to stage 3
STYLE_WEIGHT = 5.0
CONTENT_WEIGHT_S1 = 100.0
CONTENT_WEIGHT_S2 = 200.0
CONTENT_WEIGHT_S3 = 300.0
TV_WEIGHT = 1e-6
EDGE_WEIGHT = 5e-3

# Define dataset directories for content and style images
content_dir = '/content/dataset/content'
style_dir   = '/content/dataset/style'

# Collect every file with an image extension (case-insensitive) from each directory
content_files = [os.path.join(content_dir, f) for f in os.listdir(content_dir)
                 if f.lower().endswith(('.jpg','.jpeg','.png','.webp'))]
style_files   = [os.path.join(style_dir, f) for f in os.listdir(style_dir)
                 if f.lower().endswith(('.jpg','.jpeg','.png','.webp'))]

# Stop early with a clear message if either directory holds no images
assert content_files, f"No images found in {content_dir}"
assert style_files,   f"No images found in {style_dir}"

# Randomly select one content and one style image (unseeded, so the pair differs between runs)
content_path = random.choice(content_files)
style_path   = random.choice(style_files)

# Print chosen file names
print(f"\n Content: {os.path.basename(content_path)}")
print(f" Style:   {os.path.basename(style_path)}")

# Show chosen images with their longest side scaled to 768 px
show_display(load_img_display(content_path, 768), "Content (hi-res)")
show_display(load_img_display(style_path, 768),   "Style (hi-res)")

# Stage 1: Optimization at low resolution (224x224), starting from the content/style blend
print("\nStage 1: 224×224")
out_224 = run_stage(content_path, style_path, 224,
    STYLE_LAYERS, STYLE_LAYER_WEIGHTS, CONTENT_LAYERS,
    steps=400, lr_start=0.05, lr_end=0.01,
    style_weight=STYLE_WEIGHT, content_weight=CONTENT_WEIGHT_S1, tv_weight=TV_WEIGHT, edge_weight=EDGE_WEIGHT,
    init_from=None, preview_every=200, title_prefix="224x224")

# Stage 2: Refinement at medium resolution (512x512), initialised from the stage 1 output
print("\nStage 2: 512×512 (refine)")
out_512 = run_stage(content_path, style_path, 512,
    STYLE_LAYERS, STYLE_LAYER_WEIGHTS, CONTENT_LAYERS,
    steps=350, lr_start=0.03, lr_end=0.006,
    style_weight=STYLE_WEIGHT, content_weight=CONTENT_WEIGHT_S2, tv_weight=TV_WEIGHT, edge_weight=EDGE_WEIGHT,
    init_from=out_224, preview_every=100, title_prefix="512x512")

# Stage 3: Sharpening at high resolution (768x768), initialised from the stage 2 output
print("\nStage 3: 768×768 (sharpen)")
out_768 = run_stage(content_path, style_path, 768,
    STYLE_LAYERS, STYLE_LAYER_WEIGHTS, CONTENT_LAYERS,
    steps=300, lr_start=0.02, lr_end=0.004,
    style_weight=STYLE_WEIGHT, content_weight=CONTENT_WEIGHT_S3, tv_weight=TV_WEIGHT, edge_weight=EDGE_WEIGHT,
    init_from=out_512, preview_every=100, title_prefix="768x768")

# Show final stylised output
print("\n Final output:")
show_preprocessed(out_768, title="Final Stylized Output (MobileNetV2)", upscale_to=768)


In [None]:
# Import core libraries
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os, random, time
from tensorflow.keras.applications import mobilenet_v2
from tensorflow.keras.models import Model


# Load and resize an image for display
def load_img_display(path, target_size=512):
    """Load an image for display as a target_size x target_size square.

    The file is decoded to 3 channels, converted to float32 and resized with
    antialiased bicubic interpolation; the aspect ratio is not preserved.
    """
    encoded = tf.io.read_file(path)
    decoded = tf.image.decode_image(encoded, channels=3, expand_animations=False)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    square = (target_size, target_size)
    return tf.image.resize(as_float, square, method='bicubic', antialias=True)

# Show a 3-panel visual comparison (Content vs Style vs Stylised)
def show_mnet_portfolio(content_img01, style_img01, stylized_minus1_to1, title=None):
    """Plot content, style and stylised images side by side.

    Args:
        content_img01: Content image in [0, 1].
        style_img01: Style image in [0, 1].
        stylized_minus1_to1: Stylised output in [-1, 1] with a leading batch
            axis of size 1.
        title: Optional overall figure title.
    """
    # Drop the batch axis and map the stylised output from [-1, 1] to [0, 1]
    stylized01 = tf.clip_by_value((tf.squeeze(stylized_minus1_to1, 0) + 1.0) / 2.0, 0, 1)

    panels = (
        ("Content", content_img01),
        ("Style", style_img01),
        ("Stylized", stylized01),
    )
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for idx, (label, picture) in enumerate(panels):
        axis = axes[idx]
        axis.imshow(tf.clip_by_value(picture, 0, 1))
        axis.set_title(label)
        # Hide the axes for a cleaner panel
        axis.axis('off')

    if title:
        fig.suptitle(title, fontsize=14)
    plt.tight_layout()
    plt.show()


# Build MobileNetV2 encoder with chosen layers
def mbnet_layers(layer_names, img_size):
    """Return a frozen ImageNet MobileNetV2 that emits the named layer outputs.

    The outputs are listed in the same order as layer_names; the input is a
    square RGB image of side img_size.
    """
    backbone = mobilenet_v2.MobileNetV2(
        input_shape=(img_size, img_size, 3),
        include_top=False,
        weights='imagenet',
    )
    backbone.trainable = False

    taps = []
    for layer_name in layer_names:
        taps.append(backbone.get_layer(layer_name).output)
    return Model([backbone.input], taps)

# Compute Gram matrix for style representation
def gram_matrix(x):
    """Compute the (batch, c, c) Gram matrix of a (batch, h, w, c) feature map.

    Rows of the flattened map are spatial positions and columns are channels;
    the channel inner products are divided by the number of positions (h*w).
    """
    dims = tf.shape(x)
    batch, height, width, channels = dims[0], dims[1], dims[2], dims[3]
    num_positions = height * width

    flat = tf.reshape(x, [batch, num_positions, channels])
    gram = tf.matmul(flat, flat, transpose_a=True)
    return gram / tf.cast(num_positions, tf.float32)

# Extractor model for both style and content features
class StyleContentModel(tf.keras.models.Model):
    """Extractor that returns style Gram matrices and content activations."""

    def __init__(self, style_layers, content_layers, img_size):
        """Build one frozen encoder covering the style layers, then the content layers."""
        super().__init__()
        self.num_style = len(style_layers)
        self.style_layers = style_layers
        self.content_layers = content_layers
        self.encoder = mbnet_layers(style_layers + content_layers, img_size)
        self.encoder.trainable = False

    def call(self, x):
        """Return a dict with per-layer 'style' Gram matrices and 'content' features."""
        activations = self.encoder(x)
        split = self.num_style

        # The first `split` outputs are style activations; turn them into Gram matrices
        grams = [gram_matrix(act) for act in activations[:split]]
        # Everything after that belongs to the content layers
        content_acts = activations[split:]

        return {
            'style': dict(zip(self.style_layers, grams)),
            'content': dict(zip(self.content_layers, content_acts)),
        }

# Compute total variation loss to generate smoother images
def total_variation_loss(x):
    """Return tf.image.total_variation of x, used as a smoothness penalty."""
    variation = tf.image.total_variation(x)
    return variation

# Laplacian filter for edge preservation
def laplacian(img_minus1_to1):
    """Laplacian-filter a batched RGB image given in [-1, 1].

    The image is rescaled to [0, 1] and convolved with a (3, 3, 3, 1) filter
    holding the same 3x3 Laplacian stencil for each input channel, so the
    result has one channel (the sum of the per-channel responses).
    """
    unit_range = (img_minus1_to1 + 1.0) / 2.0

    stencil = tf.constant([[0.,-1.,0.],[-1.,4.,-1.],[0.,-1.,0.]], tf.float32)
    filt = tf.tile(stencil[:, :, tf.newaxis, tf.newaxis], [1, 1, 3, 1])

    return tf.nn.conv2d(unit_range, filt, strides=1, padding='SAME')


# Run one stage of iterative optimization at a chosen resolution
def run_stage(content_path, style_path, img_size,
              style_layers, style_wts, content_layers,
              steps, lr_start, lr_end,
              style_weight=5.0, content_weight=1e2, tv_weight=1e-6, edge_weight=5e-3,
              init_from=None, preview_every=999999, title_prefix=""):
    """Run one resolution stage of optimisation-based style transfer, without previews.

    Both images are loaded at img_size x img_size, target style Gram matrices
    and content features are extracted with a freshly built StyleContentModel,
    and a pixel variable is optimised with Adam under a cosine-decayed
    learning rate.

    Args:
        content_path: Path of the content image file.
        style_path: Path of the style image file.
        img_size: Side length of the square working resolution.
        style_layers: MobileNetV2 layer names used for the style loss.
        style_wts: Dict mapping each style layer name to its relative weight.
        content_layers: MobileNetV2 layer names used for the content loss.
        steps: Number of optimisation steps (also the cosine decay length).
        lr_start: Initial learning rate.
        lr_end: Final learning rate; passed to CosineDecay as lr_end/lr_start.
        style_weight: Multiplier of the style loss.
        content_weight: Multiplier of the content loss.
        tv_weight: Multiplier of the total variation loss.
        edge_weight: Multiplier of the Laplacian edge loss.
        init_from: Optional image from a previous stage; it is resized to
            img_size and used as the starting point. When None, a 60/40
            content/style blend is used instead.
        preview_every: Accepted but not used in this version of the function.
        title_prefix: Accepted but not used in this version of the function.

    Returns:
        The optimised tf.Variable (batch of one image), clipped to [-1, 1]
        after every step.
    """

    # Load an image file and convert it to a batched MobileNet input
    def load_for_model(path):
        img = tf.io.read_file(path)
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.convert_image_dtype(img, tf.float32)    # [0,1]
        img = tf.image.resize(img, (img_size, img_size), antialias=True)
        img = mobilenet_v2.preprocess_input(img * 255.0)       # [-1,1]
        return img[tf.newaxis, ...]

    # Load content and style images
    content = load_for_model(content_path)
    style   = load_for_model(style_path)

    # Extract target style and content features (a new extractor is built for every stage)
    extractor = StyleContentModel(style_layers, content_layers, img_size)
    style_tgt   = extractor(style)['style']
    content_tgt = extractor(content)['content']

    # Initialize optimization variable
    if init_from is None:
        # Blend style + content in [0,1] for a smoother start (60% content, 40% style)
        style_for_blend   = tf.image.resize((style+1)/2, (img_size, img_size))
        content_for_blend = (content+1)/2
        alpha = 0.4
        init = tf.clip_by_value((1-alpha)*content_for_blend + alpha*style_for_blend, 0, 1)
        # Back to the [-1,1] MobileNet range
        init = mobilenet_v2.preprocess_input(init * 255.0)
        image = tf.Variable(init)
    else:
        # Resize previous stage output as init
        # NOTE(review): bicubic resizing may overshoot [-1,1]; the variable is only clipped after the first step — confirm this is acceptable
        init = tf.image.resize(init_from, (img_size, img_size), method='bicubic', antialias=True)
        image = tf.Variable(init)

    # Cosine decay learning rate (alpha is the final LR as a fraction of the initial LR)
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=lr_start, decay_steps=steps, alpha=lr_end/lr_start
    )
    opt = tf.optimizers.Adam(learning_rate=lr_schedule)
    # Sum of the per-layer style weights, used to normalise the style loss
    denom = sum(style_wts.values())

    # Training step (gradient descent on the image pixels)
    @tf.function
    def train_step(img):
        with tf.GradientTape() as tape:
            outs = extractor(img)
            s, c = outs['style'], outs['content']
            # Style loss: weighted average of Gram matrix differences
            s_loss = tf.add_n([style_wts[k]*tf.reduce_mean((s[k]-style_tgt[k])**2) for k in s]) * (style_weight/denom)
            # Content loss: MSE (mean squared error) between content features, averaged over layers
            c_loss = tf.add_n([tf.reduce_mean((c[k]-content_tgt[k])**2) for k in c]) * (content_weight/len(content_layers))
            # Total variation loss
            tv = tf.reduce_mean(total_variation_loss(img)) * tv_weight
            # Edge preservation: match the Laplacian response of the content image
            edge = tf.reduce_mean((laplacian(img) - laplacian(content))**2) * edge_weight
            # Combined loss
            loss = s_loss + c_loss + tv + edge
        # Compute gradients with respect to the image pixels
        grad = tape.gradient(loss, img)
        # Clip gradients for stability
        grad = tf.clip_by_norm(grad, 10.0)
        # Apply gradient update
        opt.apply_gradients([(grad, img)])
        # Keep the image within the valid [-1,1] range
        img.assign(tf.clip_by_value(img, -1, 1))
        return loss

    # Run training loop for given steps (the per-step loss is discarded)
    for step in range(steps):
        _ = train_step(image)

    # Return final optimized image
    return image


# Style and content layers from MobileNet
STYLE_LAYERS = [
    'block_1_expand_relu',
    'block_3_expand_relu',
    'block_6_expand_relu',
    'block_10_expand_relu',
    'block_13_expand_relu',
]
# Relative style weights, paired positionally with STYLE_LAYERS
STYLE_LAYER_WEIGHTS = dict(zip(STYLE_LAYERS, (1.0, 0.8, 0.6, 0.5, 0.4)))
CONTENT_LAYERS = ['block_13_expand_relu', 'block_16_project']

# Global loss multipliers; the content weight grows from stage 1 to stage 3
STYLE_WEIGHT = 5.0
CONTENT_WEIGHT_S1 = 100.0
CONTENT_WEIGHT_S2 = 200.0
CONTENT_WEIGHT_S3 = 300.0
TV_WEIGHT = 1e-6
EDGE_WEIGHT = 5e-3


# Resolve dataset paths depending on folder structure
def resolve_dirs():
    """Return the first (content_dir, style_dir) pair where both folders exist.

    Known layouts are tried in order, relative to the current working
    directory.

    Raises:
        FileNotFoundError: If none of the known layouts is present.
    """
    layouts = (
        ("dataset/dataset/content", "dataset/dataset/style"),
        ("datasets/datasets/content", "datasets/datasets/style"),
        ("dataset/content", "dataset/style"),
    )
    found = next((pair for pair in layouts if all(map(os.path.isdir, pair))), None)
    if found is None:
        raise FileNotFoundError("Content/style folders not found.")
    return found

content_dir, style_dir = resolve_dirs()

# Collect every file with an image extension (case-insensitive) from each directory
content_files = [os.path.join(content_dir, f) for f in os.listdir(content_dir)
                 if f.lower().endswith(('.jpg','.jpeg','.png','.webp'))]
style_files   = [os.path.join(style_dir, f) for f in os.listdir(style_dir)
                 if f.lower().endswith(('.jpg','.jpeg','.png','.webp'))]

# Stop early with a clear message if either directory holds no images
assert content_files, f"No images in {content_dir}"
assert style_files,   f"No images in {style_dir}"

# Portfolio loop
# Number of example triplets (content | style | stylised) to generate
NUM_SAMPLES = 6

for i in range(NUM_SAMPLES):
    # Randomly choose one content and one style image (unseeded; the same pair may be drawn more than once)
    content_path = random.choice(content_files)
    style_path   = random.choice(style_files)
    print(f"\nPair {i+1}: {os.path.basename(content_path)} + {os.path.basename(style_path)}")

    # Load images for display as 512x512 squares
    content_disp = load_img_display(content_path, 512)
    style_disp   = load_img_display(style_path,   512)

    # Stage 1: low-res optimization (224×224), starting from the content/style blend
    out_224 = run_stage(content_path, style_path, 224,
        STYLE_LAYERS, STYLE_LAYER_WEIGHTS, CONTENT_LAYERS,
        steps=400, lr_start=0.05, lr_end=0.01,
        style_weight=STYLE_WEIGHT, content_weight=CONTENT_WEIGHT_S1,
        tv_weight=TV_WEIGHT, edge_weight=EDGE_WEIGHT)

    # Stage 2: refine at 512×512, initialised from the stage 1 output
    out_512 = run_stage(content_path, style_path, 512,
        STYLE_LAYERS, STYLE_LAYER_WEIGHTS, CONTENT_LAYERS,
        steps=350, lr_start=0.03, lr_end=0.006,
        style_weight=STYLE_WEIGHT, content_weight=CONTENT_WEIGHT_S2,
        tv_weight=TV_WEIGHT, edge_weight=EDGE_WEIGHT,
        init_from=out_224)

    # Stage 3: sharpen at 768×768, initialised from the stage 2 output
    out_768 = run_stage(content_path, style_path, 768,
        STYLE_LAYERS, STYLE_LAYER_WEIGHTS, CONTENT_LAYERS,
        steps=300, lr_start=0.02, lr_end=0.004,
        style_weight=STYLE_WEIGHT, content_weight=CONTENT_WEIGHT_S3,
        tv_weight=TV_WEIGHT, edge_weight=EDGE_WEIGHT,
        init_from=out_512)

    # Show triplet panel (content | style | stylised)
    show_mnet_portfolio(content_disp, style_disp, out_768, title=f"Stylization {i+1}")


In [None]:
import gradio as gr
import tensorflow as tf
import numpy as np
from tensorflow.keras.applications import mobilenet_v2
from tensorflow.keras.models import Model
from PIL import Image, ImageFilter
import cv2

# Define which MobileNetV2 layers will provide style information
STYLE_LAYERS = [
    'block_1_expand_relu',
    'block_3_expand_relu',
    'block_6_expand_relu',
    'block_10_expand_relu',
    'block_13_expand_relu',
]

# Relative importance of each style layer, paired positionally with STYLE_LAYERS
STYLE_LAYER_WEIGHTS = dict(zip(STYLE_LAYERS, (1.0, 0.8, 0.6, 0.5, 0.4)))

# Layers whose activations are matched for the content loss
CONTENT_LAYERS = ['block_13_expand_relu', 'block_16_project']


# Build a MobileNetV2 encoder that outputs chosen feature maps
def mbnet_layers(layer_names, img_size):
    """Build a frozen, ImageNet-pretrained MobileNetV2 feature extractor.

    Args:
        layer_names: Names of the intermediate layers to expose.
        img_size: Side length of the square RGB input.

    Returns:
        A Keras Model whose outputs are the named layers' activations, in the
        order given.
    """
    backbone = mobilenet_v2.MobileNetV2(
        input_shape=(img_size, img_size, 3),
        include_top=False,
        weights='imagenet',
    )
    # The backbone is only used for feature extraction, never trained
    backbone.trainable = False

    taps = []
    for layer_name in layer_names:
        taps.append(backbone.get_layer(layer_name).output)
    return Model([backbone.input], taps)

# Compute Gram matrix, which encodes correlations between feature maps (style patterns)
def gram_matrix(x):
    """Return channel-to-channel correlations of a (batch, h, w, c) feature map.

    The result has shape (batch, c, c) and is normalised by the number of
    spatial positions (h*w).
    """
    dims = tf.shape(x)
    batch, height, width, channels = dims[0], dims[1], dims[2], dims[3]
    num_positions = height * width

    # One row per spatial position, one column per channel
    flat = tf.reshape(x, [batch, num_positions, channels])
    gram = tf.matmul(flat, flat, transpose_a=True)
    return gram / tf.cast(num_positions, tf.float32)

# Model wrapper to return both style and content features
class StyleContentModel(tf.keras.models.Model):
    """Wraps a frozen MobileNetV2 to return style Gram matrices and content features."""

    def __init__(self, style_layers, content_layers, img_size):
        """Create a single encoder over the style layers followed by the content layers."""
        super().__init__()
        self.num_style = len(style_layers)
        self.style_layers = style_layers
        self.content_layers = content_layers
        self.encoder = mbnet_layers(style_layers + content_layers, img_size)
        self.encoder.trainable = False

    def call(self, x):
        """Return {'style': {layer: gram}, 'content': {layer: activation}} for input x."""
        activations = self.encoder(x)
        split = self.num_style

        # Style activations come first and are summarised as Gram matrices
        grams = [gram_matrix(act) for act in activations[:split]]
        # The rest are the content-layer activations, used as-is
        content_acts = activations[split:]

        return {
            'style': dict(zip(self.style_layers, grams)),
            'content': dict(zip(self.content_layers, content_acts)),
        }

# Regularization loss to encourage smoothness in final image
def total_variation_loss(x):
    """Return tf.image.total_variation of x, a regulariser that favours smooth images."""
    variation = tf.image.total_variation(x)
    return variation

# Laplacian filter to preserve edges from content image
def laplacian(img_minus1_to1):
    """Return the Laplacian edge response of a batched RGB image in [-1, 1].

    The image is mapped to [0, 1] and convolved with a (3, 3, 3, 1) filter
    that repeats one 3x3 Laplacian stencil for each input channel; the output
    therefore has a single channel summing the three per-channel responses.
    """
    unit_range = (img_minus1_to1 + 1.0) / 2.0

    stencil = tf.constant([[0.,-1.,0.],[-1.,4.,-1.],[0.,-1.,0.]], tf.float32)
    filt = tf.tile(stencil[:, :, tf.newaxis, tf.newaxis], [1, 1, 3, 1])

    return tf.nn.conv2d(unit_range, filt, strides=1, padding='SAME')


# Convert PIL image to MobileNetV2 preprocessed tensor
def preprocess_for_model(pil_img, img_size):
    """Turn an image into a batched MobileNetV2 input tensor.

    Args:
        pil_img: Image convertible with np.array; pixel values are divided by
            255, so 8-bit data is presumably expected — confirm with callers.
        img_size: Side length of the square the image is resized to.

    Returns:
        A tensor with a leading batch axis of size 1, scaled by
        mobilenet_v2.preprocess_input.
    """
    pixels01 = np.array(pil_img).astype(np.float32) / 255.0
    resized = tf.image.resize(pixels01, (img_size, img_size), antialias=True)
    # preprocess_input expects 0-255 pixel values
    scaled = mobilenet_v2.preprocess_input(resized * 255.0)
    return tf.expand_dims(scaled, axis=0)

# Convert model tensor back to displayable image [0,1]
def deprocess_from_model(x_minus1_to1):
    """Convert a batched [-1, 1] tensor into a float32 numpy image in [0, 1]."""
    unbatched = tf.squeeze(x_minus1_to1, 0)
    rescaled = (unbatched + 1.0) / 2.0
    clipped = tf.clip_by_value(rescaled, 0.0, 1.0)
    return clipped.numpy().astype(np.float32)


# Gather style images and normalize their blending weights
def collect_styles_and_weights(styles, weights):
    """Pair up the provided style images with normalised blend weights.

    Slots whose image is None are dropped. A weight of None (for example a
    cleared numeric input) counts as 0 instead of raising a TypeError.

    Args:
        styles: Sequence of style images; None marks an empty slot.
        weights: Sequence of blend weights, aligned with styles.

    Returns:
        (images, weights): the non-empty images as a list and their weights
        as floats summing to 1. If the weights of the kept images sum to
        (almost) zero or less, all weight is given to the first image.

    Raises:
        ValueError: If no style image was provided.
    """
    pairs = [
        (img, 0.0 if w is None else w)
        for img, w in zip(styles, weights)
        if img is not None
    ]
    if not pairs:
        raise ValueError("At least one style image must be provided.")

    imgs, ws = zip(*pairs)
    total = float(sum(ws))
    if total <= 1e-8:
        # No usable weights: fall back to using only the first style
        ws = [1.0] + [0.0] * (len(imgs) - 1)
    else:
        # Normalize so weights sum to 1
        ws = [float(w) / total for w in ws]
    return list(imgs), ws

# Blend multiple style targets by weighted averaging Gram matrices
def blended_style_targets(extractor, style_pils, blend_ws, img_size):
    """Return the weighted sum of the style Gram matrices of several images.

    Args:
        extractor: Callable returning a dict with a 'style' entry that maps
            layer names to Gram matrices.
        style_pils: Style images.
        blend_ws: Blend weight for each style image.
        img_size: Resolution the images are preprocessed at.

    Returns:
        Dict mapping layer name to blended Gram matrix, or None when
        style_pils is empty.
    """
    blended = None
    for style_img, weight in zip(style_pils, blend_ws):
        grams = extractor(preprocess_for_model(style_img, img_size))['style']
        if blended is None:
            # The first image initialises the accumulator
            blended = {layer: gram * weight for layer, gram in grams.items()}
            continue
        for layer in blended:
            blended[layer] = blended[layer] + grams[layer] * weight
    return blended


# Run one stage of optimization at a given resolution
def run_stage(content_pil, style_pils, blend_ws, img_size,
              content_layers, style_layers, style_layer_weights,
              steps, lr_start, lr_end,
              alpha_content, beta_style, tv_weight=1e-6, edge_weight=5e-3,
              init_from=None):
    """Run one resolution stage of style transfer with blended style targets.

    The content image and all style images are preprocessed at
    img_size x img_size, the style Gram matrices are blended with blend_ws,
    and a pixel variable is optimised with Adam under a cosine-decayed
    learning rate.

    Args:
        content_pil: Content image, passed to preprocess_for_model.
        style_pils: Style images; the first one also seeds the initial image.
        blend_ws: Blend weight for each style image.
        img_size: Side length of the square working resolution.
        content_layers: MobileNetV2 layer names used for the content loss.
        style_layers: MobileNetV2 layer names used for the style loss.
        style_layer_weights: Dict mapping each style layer name to its weight.
        steps: Number of optimisation steps (also the cosine decay length).
        lr_start: Initial learning rate.
        lr_end: Final learning rate; passed to CosineDecay as lr_end/lr_start.
        alpha_content: Multiplier of the content loss.
        beta_style: Multiplier of the style loss.
        tv_weight: Multiplier of the total variation loss.
        edge_weight: Multiplier of the Laplacian edge loss.
        init_from: Optional image from a previous stage; it is resized to
            img_size and used as the starting point. When None, a 60/40 blend
            of the content image and the first style image is used.

    Returns:
        The optimised tf.Variable (batch of one image), clipped to [-1, 1]
        after every step.
    """

    # Preprocess content image
    content = preprocess_for_model(content_pil, img_size)
    # Extractor for style and content (a new one is built for every stage)
    extractor = StyleContentModel(style_layers, content_layers, img_size)
    style_tgt   = blended_style_targets(extractor, style_pils, blend_ws, img_size)
    content_tgt = extractor(content)['content']

    # Initialize optimization image
    if init_from is None:
        # Blend content (60%) and the first style image (40%) in [0,1] for a smoother start
        style_seed = preprocess_for_model(style_pils[0], img_size)
        seed = tf.clip_by_value(((content+1)/2.0)*0.6 + ((style_seed+1)/2.0)*0.4, 0, 1)
        image = tf.Variable(mobilenet_v2.preprocess_input(seed * 255.0))
    else:
        # Upsample from previous stage output
        # NOTE(review): bicubic resizing may overshoot [-1,1]; the variable is only clipped after the first step — confirm this is acceptable
        up = tf.image.resize(init_from, (img_size, img_size), method='bicubic', antialias=True)
        image = tf.Variable(up)

    # Learning rate schedule for stability (alpha is the final LR as a fraction of the initial LR)
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=lr_start, decay_steps=steps, alpha=lr_end/lr_start
    )
    opt = tf.optimizers.Adam(learning_rate=lr_schedule)
    # Sum of the per-layer style weights, used to normalise the style loss
    denom = sum(style_layer_weights.values())

    # Training step computing all losses
    @tf.function
    def train_step(img):
        with tf.GradientTape() as tape:
            outs = extractor(img)
            s, c = outs['style'], outs['content']
            # Style loss: Gram differences weighted by layer importance
            s_loss = tf.add_n([style_layer_weights[k]*tf.reduce_mean((s[k]-style_tgt[k])**2) for k in s]) * (beta_style/denom)
            # Content loss: mean squared error between features, averaged over layers
            c_loss = tf.add_n([tf.reduce_mean((c[k]-content_tgt[k])**2) for k in c]) * (alpha_content/len(content_layers))
            # Total variation for smoothness
            tv = tf.reduce_mean(total_variation_loss(img)) * tv_weight
            # Laplacian term: match the edge response of the content image
            edge = tf.reduce_mean((laplacian(img) - laplacian(content))**2) * edge_weight
            # Total loss
            loss = s_loss + c_loss + tv + edge
        # Compute gradients with respect to the image pixels
        grad = tape.gradient(loss, img)
        # Clip gradients to avoid exploding updates
        grad = tf.clip_by_norm(grad, 10.0)
        # Apply updates
        opt.apply_gradients([(grad, img)])
        # Clamp pixel values to valid range
        img.assign(tf.clip_by_value(img, -1.0, 1.0))
        return loss

    # Run multiple optimization steps (the per-step loss is discarded)
    for _ in range(steps):
        _ = train_step(image)

    return image


# Convert numpy array to float [0,1]
def to_float01(np_img):
    """Return `np_img` as a float32 array with values clipped to [0, 1].

    uint8 input is rescaled by dividing by 255. Any other dtype is only cast
    to float32 (no rescaling) before the clip is applied.
    """
    as_float = np_img.astype(np.float32)
    # Only 8-bit images are assumed to live on the 0..255 scale
    if np_img.dtype == np.uint8:
        as_float = as_float / 255.0
    return np.clip(as_float, 0.0, 1.0)

# Create binary mask where black pixels = foreground region
def make_black_mask_from_np(content_np_resized, threshold=0.05):
    """Build a float32 mask that is 1.0 where a pixel is (near-)black.

    A pixel counts as black only when every value along the last axis is
    strictly below `threshold`. The result keeps the leading dimensions of
    the input and has a trailing axis of size 1 so it broadcasts against
    the image.
    """
    below_threshold = content_np_resized < threshold
    # A pixel is "black" only if all of its channels are dark
    pixel_is_black = below_threshold.all(axis=-1)
    return np.expand_dims(pixel_is_black.astype(np.float32), axis=-1)

# Preserve original colors by transferring only luminance channel
def apply_colour_preservation(stylized_f01, content_resized_f01):
    """Combine the stylised luminance with the content image's colour.

    Both inputs are clipped to [0, 1], scaled to uint8 and converted from RGB
    to YUV. Channel 0 (Y) of the content image is overwritten with channel 0
    of the stylised image, the other two channels stay those of the content,
    and the result is converted back to RGB.

    Returns the combined RGB image as float32 divided by 255.
    """
    def _rgb_f01_to_yuv_u8(rgb_f01):
        # OpenCV colour conversion is fed 8-bit data, so quantise to [0,255] first
        rgb_u8 = (np.clip(rgb_f01, 0, 1) * 255).astype(np.uint8)
        return cv2.cvtColor(rgb_u8, cv2.COLOR_RGB2YUV)

    stylized_luma = _rgb_f01_to_yuv_u8(stylized_f01)[..., 0]
    # Start from the content's YUV so its two chroma channels are kept as-is
    merged_yuv = _rgb_f01_to_yuv_u8(content_resized_f01).copy()
    merged_yuv[..., 0] = stylized_luma

    merged_rgb = cv2.cvtColor(merged_yuv, cv2.COLOR_YUV2RGB)
    return merged_rgb.astype(np.float32) / 255.0

# Apply sharpening using UnsharpMask filter
def apply_sharpness(stylized_f01, sharpness_value):
    """Sharpen an image with PIL's UnsharpMask filter.

    The input is clipped to [0, 1] and converted to uint8 before filtering.
    The filter uses a fixed radius of 1.5 and a `percent` strength of
    `int(sharpness_value * 200)`.

    Returns the filtered image as float32 divided by 255.
    """
    pixels_u8 = (np.clip(stylized_f01, 0, 1) * 255).astype(np.uint8)
    # Map the slider-style value onto UnsharpMask's percent strength
    unsharp = ImageFilter.UnsharpMask(radius=1.5, percent=int(sharpness_value * 200))
    sharpened = Image.fromarray(pixels_u8).filter(unsharp)
    return np.asarray(sharpened).astype(np.float32) / 255.0


# Main Gradio callback function
def stylize_ui(content,
               s1, s2, s3, s4, s5,
               w1, w2, w3, w4, w5,
               alpha, beta, preserve_colour, fg_only, sharpness):
    """Run the 3-stage style transfer and optional post-processing for the UI.

    Args:
        content: Content image supplied by the Gradio image input (configured
            with type="pil"); None when the user has not uploaded anything.
        s1, s2, s3, s4, s5: Style images, forwarded to
            `collect_styles_and_weights`.
        w1, w2, w3, w4, w5: Blend strength for the corresponding style image.
        alpha: Content weight. Used as-is in stage 1, doubled in stage 2 and
            tripled in stage 3.
        beta: Style weight, identical in all three stages.
        preserve_colour: If truthy, keep the content image's colour and take
            only the luminance from the stylised result.
        fg_only: If truthy, show the stylised result only where the resized
            content image is near-black (all channels below 0.05) and the
            resized content everywhere else.
        sharpness: Unsharp-mask strength; skipped when falsy or not positive.

    Returns:
        The stylised image resized to 512x512, clipped to [0, 1].

    Raises:
        gr.Error: If no content image was provided.
    """
    # Without a content image the pipeline can only fail with an opaque
    # exception further down, so report the problem to the user directly.
    if content is None:
        raise gr.Error("Please upload a content image before stylising.")

    # Collect style images and normalize weights
    style_imgs, blend_ws = collect_styles_and_weights(
        [s1, s2, s3, s4, s5], [w1, w2, w3, w4, w5]
    )

    # Stage 1 optimization (low resolution 224x224)
    out_224 = run_stage(content, style_imgs, blend_ws, 224,
        CONTENT_LAYERS, STYLE_LAYERS, STYLE_LAYER_WEIGHTS,
        steps=400, lr_start=0.05, lr_end=0.01,
        alpha_content=alpha, beta_style=beta)

    # Stage 2 optimization (refine at 512x512), seeded with the stage 1 result
    out_512 = run_stage(content, style_imgs, blend_ws, 512,
        CONTENT_LAYERS, STYLE_LAYERS, STYLE_LAYER_WEIGHTS,
        steps=350, lr_start=0.03, lr_end=0.006,
        alpha_content=alpha*2.0, beta_style=beta,
        init_from=out_224)

    # Stage 3 optimization (sharpen at 768x768) to give clearer stylised outputs
    out_768 = run_stage(content, style_imgs, blend_ws, 768,
        CONTENT_LAYERS, STYLE_LAYERS, STYLE_LAYER_WEIGHTS,
        steps=300, lr_start=0.02, lr_end=0.004,
        alpha_content=alpha*3.0, beta_style=beta,
        init_from=out_512)

    # Convert final output back to [0,1] and resize output to 512 for display
    stylized = deprocess_from_model(out_768)
    stylized = tf.image.resize(stylized, (512, 512), antialias=True, method='bicubic').numpy()

    # Prepare resized content for optional colour or foreground operations
    # (same 512x512 size as the stylised output so the two can be combined)
    content_np = to_float01(np.array(content))
    content_resized = np.array(
        Image.fromarray((content_np * 255).astype(np.uint8)).resize((512, 512), Image.BILINEAR)
    ).astype(np.float32) / 255.0

    # Apply optional colour preservation
    if preserve_colour:
        stylized = apply_colour_preservation(stylized, content_resized)

    # Apply optional sharpness
    if sharpness and sharpness > 0:
        stylized = apply_sharpness(stylized, sharpness)

    # Apply foreground-only masking: mask is 1 on near-black content pixels,
    # so stylisation is kept there and the original content is shown elsewhere
    if fg_only:
        mask = make_black_mask_from_np(content_resized, threshold=0.05)
        stylized = mask * stylized + (1.0 - mask) * content_resized

    # Final clamp to [0,1] before the image is returned to the UI
    return np.clip(stylized, 0.0, 1.0)


# Gradio UI definition: builds the Blocks layout, wires the "Stylise" button
# to stylize_ui, and launches the app.
with gr.Blocks(css="""
  body { background-color: #fafafa; font-family: 'Helvetica Neue', sans-serif; }
  h1, h2 { color: #2a4d8f; }
  .gr-button { background: #2a4d8f !important; color: #ffffff !important; border-radius: 10px !important; }
  .gr-button:hover { background: #3c6bd9 !important; }
  .gr-image { border-radius: 14px; box-shadow: 0 5px 12px rgba(0,0,0,0.12); }
""") as demo:

    # App title and description
    gr.Markdown("## MobileNetNST Multi-Style Transfer App — MobileNetV2")
    gr.Markdown("Upload a content image and up to five style images. Adjust blending weights, α/β, color preservation, sharpness, and foreground-only styling. Runs MobileNetV2 NST with Gram matrices, TV loss, Laplacian edge term, and a 3-stage coarse→fine schedule.")

    # Content image input (delivered to the callback as a PIL image)
    with gr.Row():
        content_input = gr.Image(label="Content Image", image_mode="RGB", height=256, width=256, type="pil")

    # Style image inputs (only the first is not labelled optional)
    with gr.Row():
        style_input1 = gr.Image(label="Style 1", image_mode="RGB", height=180, width=180, type="pil")
        style_input2 = gr.Image(label="Style 2 (Optional)", image_mode="RGB", height=180, width=180, type="pil")
        style_input3 = gr.Image(label="Style 3 (Optional)", image_mode="RGB", height=180, width=180, type="pil")
        style_input4 = gr.Image(label="Style 4 (Optional)", image_mode="RGB", height=180, width=180, type="pil")
        style_input5 = gr.Image(label="Style 5 (Optional)", image_mode="RGB", height=180, width=180, type="pil")

    # Sliders for style blending strengths; by default only style 1 contributes
    with gr.Row():
        blend1 = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Style 1 Strength")
        blend2 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="Style 2 Strength")
        blend3 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="Style 3 Strength")
        blend4 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="Style 4 Strength")
        blend5 = gr.Slider(0.0, 1.0, value=0.0, step=0.01, label="Style 5 Strength")

    # Sliders and checkboxes for hyperparameters and post-processing options
    with gr.Row():
        alpha_slider = gr.Slider(minimum=1e3, maximum=1e5, value=1e4, step=100, label="α (Content Weight)")
        beta_slider = gr.Slider(minimum=1e-3, maximum=1.0, value=1e-2, step=1e-3, label="β (Style Weight)")
        preserve_colour = gr.Checkbox(label="Apply Colour Preservation", value=False)
        fg_only = gr.Checkbox(label="Apply Foreground Styling", value=False)
        sharpness = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Style Sharpness")

    # Stylise button and output image
    run_button = gr.Button("Stylise")
    output_image = gr.Image(label="Stylised Output", image_mode="RGB", height=512, width=512)

    # Connect button to callback; the input order must match the
    # positional parameter order of stylize_ui
    run_button.click(
        fn=stylize_ui,
        inputs=[
            content_input,
            style_input1, style_input2, style_input3, style_input4, style_input5,
            blend1, blend2, blend3, blend4, blend5,
            alpha_slider, beta_slider, preserve_colour, fg_only, sharpness
        ],
        outputs=output_image
    )

    gr.Markdown("Built with TensorFlow and Gradio for MobileNet NST ")

# Start the Gradio app
demo.launch()
