## **Full InST Pipeline with Integrated Fine-Tuned Textual Inversion**

##  **Setup Dependencies**

In [None]:
# Install OpenAI CLIP straight from its GitHub repository (it is pulled from git, not from a package index here).
# NOTE(review): no versions are pinned in this cell, so a re-run may resolve different releases.
!pip install git+https://github.com/openai/CLIP.git
# Install the PyTorch, torchvision, diffusers and transformers packages used by the cells below.
!pip install torch torchvision diffusers transformers


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-f1_7xi_h
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-f1_7xi_h
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=eb1bf9567aed02ce61fb669fc2cde59d942b136d9377dfa1ac2be9e609c6c891
  Stored in directory: /tmp/pip-ephem-wheel-cache-7imesweu/wheels/da/2b/4c/d6691fa9597aac8bb

## **Import Libraries**

In [None]:
import torch
import torch.nn as nn
import clip
from torchvision import transforms
from PIL import Image
from diffusers import StableDiffusionPipeline, PNDMScheduler

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:


# CLIP Image Encoder
class CLIPImageEncoder(nn.Module):
    """Embed an image file with the CLIP ``ViT-B/32`` model.

    Attributes:
        device: Device handed to ``clip.load`` and used for the input tensor.
        clip_model: Model returned by ``clip.load("ViT-B/32", ...)``.
        preprocess: Image transform returned alongside the model.
    """

    def __init__(self, device="cuda"):
        super().__init__()
        self.device = device
        loaded_model, loaded_transform = clip.load("ViT-B/32", device=device)
        self.clip_model = loaded_model
        self.preprocess = loaded_transform

    def forward(self, image_path):
        """Return CLIP image features for the image stored at ``image_path``.

        The file is opened, converted to RGB, run through ``self.preprocess``,
        given a leading batch dimension and encoded with gradients disabled.
        """
        rgb_image = Image.open(image_path).convert("RGB")
        batch = self.preprocess(rgb_image)
        batch = batch.unsqueeze(0).to(self.device)
        with torch.no_grad():
            return self.clip_model.encode_image(batch)

# Attention-Based Inversion
class AttentionModule(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values each come from their own
    ``nn.Linear(embed_dim, embed_dim)``; the scores are multiplied by
    ``embed_dim ** -0.5`` before the softmax over the last dimension.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # Layers are created in query/key/value order so seeded initialisation is unchanged.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.scale = embed_dim ** -0.5

    def forward(self, v):
        """Attend ``v`` to itself; ``v`` needs at least two dimensions (..., tokens, embed_dim)."""
        queries, keys, values = self.query(v), self.key(v), self.value(v)
        scores = torch.matmul(queries, keys.transpose(-2, -1)) * self.scale
        weights = scores.softmax(dim=-1)
        return torch.matmul(weights, values)

# Textual Inversion Module
class TextualInversion(nn.Module):
    """Turn a reference image into a prompt-style embedding.

    Stages: CLIP image features -> ``AttentionModule`` -> linear projection
    from ``embed_dim`` to ``target_dim``.
    """

    def __init__(self, embed_dim=512, target_dim=768, device="cuda"):
        super().__init__()
        self.device = device
        self.clip_encoder = CLIPImageEncoder(device)
        attention = AttentionModule(embed_dim)
        self.attention = attention.to(device)
        projection = nn.Linear(embed_dim, target_dim)
        self.linear_proj = projection.to(device)

    def forward(self, image_path):
        """Return the projected embedding for ``image_path`` with an extra axis inserted at dim 1."""
        # CLIP features are cast to float32 before entering the trainable layers.
        features = self.clip_encoder(image_path).to(torch.float32)
        token = self.linear_proj(self.attention(features))
        # Intended result layout (per the original author): (batch_size, 1, target_dim).
        return token.unsqueeze(1)

# Stochastic Inversion Module
class StochasticInversion:
    """Noise a latent and run a scheduler-driven denoising loop.

    Args:
        unet_model: Callable used as ``unet(latent, timestep, encoder_hidden_states=...)``;
            its result must expose ``.sample``.
        scheduler: Object whose ``step(noise_pred, t, latent)`` result exposes ``.prev_sample``.
    """

    def __init__(self, unet_model, scheduler):
        self.unet, self.scheduler = unet_model, scheduler

    def add_noise(self, latent, noise_level):
        """Return ``latent`` plus standard-normal noise scaled by ``noise_level``."""
        return latent + noise_level * torch.randn_like(latent)

    def denoise(self, latent_noisy, time_steps, conditioning):
        """Walk ``time_steps`` in reverse, applying one U-Net prediction and scheduler step each.

        Returns the latent produced by the final scheduler step.
        """
        current = latent_noisy
        for step in reversed(time_steps):
            # The U-Net receives the timestep as a 1-element long tensor on the latent's device.
            step_tensor = torch.tensor([step], dtype=torch.long, device=current.device)
            prediction = self.unet(current, step_tensor, encoder_hidden_states=conditioning)
            current = self.scheduler.step(prediction.sample, step, current).prev_sample
        return current

# Final InST Pipeline
class InSTPipeline:
    """Style-transfer wrapper around a Stable Diffusion text-to-image pipeline.

    A ``TextualInversion`` module turns the reference image into a prompt
    embedding, and the Stable Diffusion pipeline is started from the noised
    VAE latents of the content image.

    Attributes:
        device: ``torch.device`` the models are moved to.
        sd_pipeline: The loaded ``StableDiffusionPipeline``.
        scheduler: ``PNDMScheduler`` installed on ``sd_pipeline``.
        unet: The pipeline's U-Net (also handed to ``stochastic_inversion``).
        textual_inversion: Module producing the style embedding.
        stochastic_inversion: Helper used to add noise to the content latents.
    """

    def __init__(self, model_path="CompVis/stable-diffusion-v1-4", device="cuda"):
        """Load the Stable Diffusion checkpoint at ``model_path`` and build the helper modules.

        Args:
            model_path (str): Hub id or local directory of the Stable Diffusion checkpoint.
            device (str): Device string the models are placed on.
        """
        self.device = torch.device(device)

        # Load Stable Diffusion pipeline and U-Net model
        self.sd_pipeline = StableDiffusionPipeline.from_pretrained(model_path)

        # Build the PNDM scheduler from the checkpoint's own scheduler config.
        # A bare ``PNDMScheduler(steps_offset=1)`` would fall back to the library's
        # default beta schedule, which need not match the one the checkpoint ships with.
        self.scheduler = PNDMScheduler.from_config(
            self.sd_pipeline.scheduler.config, steps_offset=1
        )
        self.sd_pipeline.scheduler = self.scheduler
        self.sd_pipeline = self.sd_pipeline.to(self.device)
        self.unet = self.sd_pipeline.unet

        # Initialize modules
        self.textual_inversion = TextualInversion(device=device)
        self.stochastic_inversion = StochasticInversion(self.unet, self.scheduler)

    def _encode_content(self, content_image_path):
        """Load the content image and return its scaled VAE latents."""
        preprocess = transforms.Compose([
            transforms.Resize((512, 512)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])
        # Close the file handle deterministically; ``convert`` returns an independent image.
        with Image.open(content_image_path) as raw_image:
            content_image = raw_image.convert("RGB")

        vae = self.sd_pipeline.vae
        content_tensor = preprocess(content_image).unsqueeze(0).to(self.device, dtype=vae.dtype)

        # Use the scaling factor stored with the VAE; 0.18215 (the value the original
        # code hard-coded) is kept as the fallback.
        scaling_factor = getattr(vae.config, "scaling_factor", 0.18215)
        with torch.no_grad():
            content_latent = vae.encode(content_tensor).latent_dist.sample()
        return content_latent * scaling_factor

    def generate(self, content_image_path, reference_image_path, guidance_scale=7.5):
        """
        Generate a stylized image using content and reference images.
        Args:
            content_image_path (str): Path to the content image.
            reference_image_path (str): Path to the reference image.
            guidance_scale (float): Controls classifier-free guidance.

        Returns:
            PIL.Image: Generated stylized image.
        """
        # Embed the reference image (style embedding). This is inference only,
        # so no autograd graph is built through the inversion layers.
        with torch.no_grad():
            style_embedding = self.textual_inversion(reference_image_path)

        # Generate negative_prompt_embeds as zero tensors matching the style embedding
        negative_prompt_embeds = torch.zeros_like(style_embedding)

        # Encode the content image into latent space
        content_latent = self._encode_content(content_image_path)

        # Add noise to the latent representation of the content
        noisy_latents = self.stochastic_inversion.add_noise(content_latent, noise_level=0.1)

        # NOTE(review): the pipeline's ``latents`` argument is documented as pre-generated
        # Gaussian noise to start denoising from. Passing lightly-noised content latents is
        # kept from the original; confirm whether an img2img-style start was intended.
        print("Generating stylized image...")
        with torch.no_grad():
            output = self.sd_pipeline(
                prompt_embeds=style_embedding,                # Positive embeddings
                negative_prompt_embeds=negative_prompt_embeds,  # Negative embeddings
                guidance_scale=guidance_scale,
                latents=noisy_latents
            ).images[0]

        return output




In [None]:
# Inputs: the photo to stylize and the image that supplies the style
content_image_path = "/content/jeevannn.jpg"
reference_image_path = "/content/images_van.jpeg"

# Build the pipeline on the GPU from the Stable Diffusion v1-4 checkpoint
pipeline = InSTPipeline(model_path="CompVis/stable-diffusion-v1-4", device="cuda")

# Run style transfer
output_image = pipeline.generate(
    content_image_path=content_image_path,
    reference_image_path=reference_image_path,
)

# Write the result to disk and confirm
output_image.save("output_stylized_ju_image.png")
print("Stylized image saved successfully!")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Generating stylized image...


  0%|          | 0/50 [00:00<?, ?it/s]

Stylized image saved successfully!


## Fine-Tuned Textual Inversion

In [None]:
from diffusers import StableDiffusionPipeline, PNDMScheduler
import torch
from torch import nn
from torchvision import transforms
from PIL import Image
import clip

# CLIP Image Encoder
class CLIPImageEncoder(nn.Module):
    """Image-to-feature encoder backed by CLIP ``ViT-B/32``."""

    def __init__(self, device="cuda"):
        super().__init__()
        self.device = device
        # clip.load hands back the model together with its preprocessing transform.
        self.clip_model, self.preprocess = clip.load("ViT-B/32", device=device)

    def _load_as_batch(self, image_path):
        """Read ``image_path`` as RGB and return a preprocessed 1-image batch on ``self.device``."""
        picture = Image.open(image_path).convert("RGB")
        return self.preprocess(picture).unsqueeze(0).to(self.device)

    def forward(self, image_path):
        """Encode the image at ``image_path`` with CLIP; gradients are disabled."""
        pixels = self._load_as_batch(image_path)
        with torch.no_grad():
            encoded = self.clip_model.encode_image(pixels)
        return encoded

# Attention-Based Inversion
class AttentionModule(nn.Module):
    """Self-attention block with learned query/key/value projections.

    Each projection is an ``nn.Linear(embed_dim, embed_dim)``. Attention scores
    are scaled by ``embed_dim ** -0.5`` and normalised with a softmax over the
    last axis.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # query -> key -> value creation order is preserved (affects seeded init).
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.scale = embed_dim ** -0.5

    def forward(self, v):
        """Return the attention-weighted values computed from ``v`` against itself."""
        q = self.query(v)
        k_t = self.key(v).transpose(-2, -1)
        val = self.value(v)
        attn = torch.softmax(q.matmul(k_t) * self.scale, dim=-1)
        return attn.matmul(val)

# Textual Inversion Module
class TextualInversion(nn.Module):
    """Serve a pre-trained (fine-tuned) textual-inversion embedding as a prompt embedding.

    Unlike an image-driven inversion module, this class does not look at the
    reference image: it loads a saved embedding once and returns it on every call.

    Args:
        embed_dim (int): Stored on the instance; not used by this class.
        target_dim (int): Stored on the instance; not used by this class.
        learned_embed_path (str): File written by ``torch.save`` holding either a
            tensor or a dict mapping placeholder tokens to tensors. Required.
        device (str): Device the embedding is loaded onto and returned on.
        placeholder_token (str): Dict key to read when the file holds a dict.

    Raises:
        ValueError: If ``learned_embed_path`` is empty or ``None``.
        KeyError: If the file holds a dict without ``placeholder_token``.
        TypeError: If the selected object is not a tensor.
    """

    def __init__(self, embed_dim=512, target_dim=768, learned_embed_path=None, device="cuda",
                 placeholder_token="<vangogh-style>"):
        super().__init__()
        self.device = device
        self.learned_embed_path = learned_embed_path
        self.embed_dim = embed_dim
        self.target_dim = target_dim

        if not self.learned_embed_path:
            raise ValueError("learned_embed_path must be specified to load fine-tuned embeddings.")

        # Load the fine-tuned textual inversion embeddings.
        # weights_only=True restricts unpickling to tensors and plain containers,
        # which avoids executing arbitrary pickled code and silences the FutureWarning.
        loaded_data = torch.load(self.learned_embed_path, map_location=device, weights_only=True)
        if isinstance(loaded_data, dict):
            if placeholder_token not in loaded_data:
                raise KeyError(
                    f"{placeholder_token!r} not found in {self.learned_embed_path}; "
                    f"available keys: {list(loaded_data.keys())}"
                )
            loaded_data = loaded_data[placeholder_token]

        if not torch.is_tensor(loaded_data):
            raise TypeError(
                f"Expected a tensor embedding, got {type(loaded_data).__name__}."
            )
        self.learned_embeddings = loaded_data

    def forward(self, image_path=None):
        """Return the stored embedding shaped ``(1, num_vectors, dim)``.

        ``image_path`` is accepted for call-compatibility and ignored.
        A 1-D embedding ``(dim,)`` becomes ``(1, 1, dim)``, a 2-D embedding
        ``(num_vectors, dim)`` becomes ``(1, num_vectors, dim)``, and an
        embedding that already has three dimensions is returned as is.
        """
        embeds = self.learned_embeddings.to(self.device)
        if embeds.dim() == 1:
            embeds = embeds.unsqueeze(0)  # single vector -> (1, dim)
        if embeds.dim() == 2:
            embeds = embeds.unsqueeze(0)  # add the batch dimension
        return embeds


# Stochastic Inversion Module
class StochasticInversion:
    """Utility pairing a U-Net with a scheduler for noising and denoising latents.

    ``unet_model`` is called as ``unet(latent, timestep, encoder_hidden_states=...)``
    and must return an object with ``.sample``; ``scheduler.step(...)`` must return
    an object with ``.prev_sample``.
    """

    def __init__(self, unet_model, scheduler):
        self.unet = unet_model
        self.scheduler = scheduler

    def add_noise(self, latent, noise_level):
        """Perturb ``latent`` with Gaussian noise multiplied by ``noise_level``."""
        gaussian = torch.randn_like(latent)
        scaled = noise_level * gaussian
        return latent + scaled

    def denoise(self, latent_noisy, time_steps, conditioning):
        """Apply one predict-and-step update per entry of ``time_steps``, taken in reverse order."""
        target_device = latent_noisy.device
        for t in reversed(time_steps):
            t_batch = torch.tensor([t], dtype=torch.long, device=target_device)
            unet_out = self.unet(latent_noisy, t_batch, encoder_hidden_states=conditioning)
            step_out = self.scheduler.step(unet_out.sample, t, latent_noisy)
            latent_noisy = step_out.prev_sample
            # Re-read the device in case the scheduler moved the latent.
            target_device = latent_noisy.device
        return latent_noisy

# Final InST Pipeline
class InSTPipeline:
    """Style-transfer wrapper that conditions Stable Diffusion on a fine-tuned embedding.

    The prompt embedding comes from a ``TextualInversion`` module that loads a
    saved textual-inversion embedding from disk. The Stable Diffusion pipeline
    is started from the noised VAE latents of the content image.

    Attributes:
        device: ``torch.device`` the models are moved to.
        sd_pipeline: The loaded ``StableDiffusionPipeline``.
        scheduler: ``PNDMScheduler`` installed on ``sd_pipeline``.
        unet: The pipeline's U-Net (also handed to ``stochastic_inversion``).
        textual_inversion: Module returning the stored style embedding.
        stochastic_inversion: Helper used to add noise to the content latents.
    """

    # Location used when no ``learned_embed_path`` is given (the path the notebook was written against).
    DEFAULT_LEARNED_EMBED_PATH = (
        "/content/drive/MyDrive/Pipeline_2/textual_inversion_vangogh/learned_embeds.bin"
    )

    def __init__(self, model_path="CompVis/stable-diffusion-v1-4", device="cuda",
                 learned_embed_path=None):
        """Load the Stable Diffusion checkpoint and the fine-tuned embedding.

        Args:
            model_path (str): Hub id or local directory of the Stable Diffusion checkpoint.
            device (str): Device string the models are placed on.
            learned_embed_path (str | None): File with the learned embedding;
                ``None`` selects ``DEFAULT_LEARNED_EMBED_PATH``.
        """
        self.device = torch.device(device)

        # Load Stable Diffusion pipeline and U-Net model
        self.sd_pipeline = StableDiffusionPipeline.from_pretrained(model_path)

        # Build the PNDM scheduler from the checkpoint's own scheduler config.
        # A bare ``PNDMScheduler(steps_offset=1)`` would fall back to the library's
        # default beta schedule, which need not match the one the checkpoint ships with.
        self.scheduler = PNDMScheduler.from_config(
            self.sd_pipeline.scheduler.config, steps_offset=1
        )
        self.sd_pipeline.scheduler = self.scheduler
        self.sd_pipeline = self.sd_pipeline.to(self.device)
        self.unet = self.sd_pipeline.unet

        # Initialize modules
        if learned_embed_path is None:
            learned_embed_path = self.DEFAULT_LEARNED_EMBED_PATH
        self.textual_inversion = TextualInversion(
            learned_embed_path=learned_embed_path,
            device=device
        )

        self.stochastic_inversion = StochasticInversion(self.unet, self.scheduler)

    def _encode_content(self, content_image_path):
        """Load the content image and return its scaled VAE latents."""
        preprocess = transforms.Compose([
            transforms.Resize((512, 512)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])
        # Close the file handle deterministically; ``convert`` returns an independent image.
        with Image.open(content_image_path) as raw_image:
            content_image = raw_image.convert("RGB")

        vae = self.sd_pipeline.vae
        content_tensor = preprocess(content_image).unsqueeze(0).to(self.device, dtype=vae.dtype)

        # Use the scaling factor stored with the VAE; 0.18215 (the value the original
        # code hard-coded) is kept as the fallback.
        scaling_factor = getattr(vae.config, "scaling_factor", 0.18215)
        with torch.no_grad():
            content_latent = vae.encode(content_tensor).latent_dist.sample()
        return content_latent * scaling_factor

    def generate(self, content_image_path, reference_image_path, guidance_scale=7.5):
        """
        Generate a stylized image using content and reference images.
        Args:
            content_image_path (str): Path to the content image.
            reference_image_path (str): Path to the reference image. It is forwarded
                to the textual-inversion module, which in this fine-tuned variant
                returns its stored embedding without reading the image.
            guidance_scale (float): Controls classifier-free guidance.

        Returns:
            PIL.Image: Generated stylized image.
        """
        # Fetch the style embedding (inference only, so autograd is disabled)
        with torch.no_grad():
            style_embedding = self.textual_inversion(reference_image_path)

        # Generate negative_prompt_embeds as zero tensors matching the style embedding
        negative_prompt_embeds = torch.zeros_like(style_embedding)

        # Encode the content image into latent space
        content_latent = self._encode_content(content_image_path)

        # Add noise to the latent representation of the content
        noisy_latents = self.stochastic_inversion.add_noise(content_latent, noise_level=0.1)

        # NOTE(review): the pipeline's ``latents`` argument is documented as pre-generated
        # Gaussian noise to start denoising from. Passing lightly-noised content latents is
        # kept from the original; confirm whether an img2img-style start was intended.
        print("Generating stylized image...")
        with torch.no_grad():
            output = self.sd_pipeline(
                prompt_embeds=style_embedding,                # Positive embeddings
                negative_prompt_embeds=negative_prompt_embeds,  # Negative embeddings
                guidance_scale=guidance_scale,
                latents=noisy_latents
            ).images[0]

        return output




In [None]:
# Inputs: the photo to stylize and the style reference image
content_image_path = "/content/natures.jpeg"
reference_image_path = "/content/starrynight.jpg"

# Build the fine-tuned pipeline on the GPU from the Stable Diffusion v1-4 checkpoint
pipeline = InSTPipeline(model_path="CompVis/stable-diffusion-v1-4", device="cuda")

# Run style transfer
output_image = pipeline.generate(
    content_image_path=content_image_path,
    reference_image_path=reference_image_path,
)

# Write the result to disk and confirm
output_image.save("output_stylized_nat_image.png")
print("Stylized image saved successfully!")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Generating stylized image...


  loaded_data = torch.load(self.learned_embed_path, map_location=device)


  0%|          | 0/50 [00:00<?, ?it/s]

Stylized image saved successfully!
