In [1]:
import torch
import torch.nn as nn
from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
from transformers import CLIPTextModel, CLIPTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import ChineseCLIPProcessor, ChineseCLIPModel, ChineseCLIPConfig
from models import MLP
import os
from dataset.utils import pre_caption
import ast

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
torch.cuda.empty_cache()

CUDA_VISIBLE_DEVICES=4
CUDA_LAUNCH_BLOCKING=1
TORCH_USE_CUDA_DSA=1

device = "cuda"
weight_dtype = torch.float32
torch.set_default_dtype(weight_dtype)
g = torch.Generator(device=device)
seed = g.seed()

print(device, seed)

cuda 3765926384200155


In [3]:
prompt_coarse_ZH = "绵长蜿蜒的海滩，热带风情，明亮，简单"
prompt_coarse_original = " A long and winding beach, tropical, bright, simple"
prompt_fine = "A long and winding beach, tropical, bright, simple, by Studio Ghibli and Greg Rutkowski, artstation",

prompt = pre_caption(prompt_coarse_ZH, 256)
print(prompt)
prompt_coarse_original = pre_caption(prompt_coarse_original, 256)
print(prompt_coarse_original)


绵长蜿蜒的海滩，热带风情，明亮，简单
a long and winding beach, tropical, bright, simple


In [4]:
def get_pipeline_embeds(tokenizer, text_encoder,linear, prompt, device, negative_prompt=""):
    """Get pipeline embeds for prompts bigger than the maxlength of the pipe
    :param pipeline:
    :param prompt:
    :param negative_prompt:
    :param device:
    :return:
    """
    max_length = 77

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    input_ids = input_ids.to(device)
    attention_mask = (input_ids != tokenizer.pad_token_id).long()

    negative_ids = tokenizer(negative_prompt, truncation=False, padding="max_length", max_length=input_ids.shape[-1], return_tensors="pt").input_ids
    negative_ids = negative_ids.to(device)
    attention_mask_negative = (negative_ids != tokenizer.pad_token_id).long()

    example_embeds = text_encoder(input_ids, attention_mask=attention_mask)[0]
    print("Example Embeddings:", example_embeds.size(-1), example_embeds.dtype)
    
    weight_dtype = example_embeds.dtype
    
 
    ## Linear Projection from 768 to 1024
    # linear = nn.Linear(example_embeds.size()[-1], 1024).to(device).to(weight_dtype)
    
    ## MLP
    # linear = MLP(input_dim=example_embeds.size()[-1], output_dim=1024, dtype=weight_dtype).to(device)
    
    linear = linear
    
    concat_embeds = []
    neg_embeds = []
    for i in range(0, input_ids.shape[-1], max_length):
        concat_embeds.append(
            linear(text_encoder(input_ids[:, i : i + max_length], attention_mask=attention_mask)[0])
        )
        neg_embeds.append(
            linear(text_encoder(negative_ids[:, i : i + max_length], attention_mask=attention_mask_negative)[0])
        )

    # ## Adding zero padding to embeds
    # # padding = torch.zeros((1, example_embeds.size(1), 1024 - example_embeds.size(2)), dtype=weight_dtype, device=device)

    # # or Adding Gaussian Noise to embeds
    # padding = torch.randn((1, example_embeds.size(1), 1024 - example_embeds.size(2)), dtype=weight_dtype, device=device)
    # # uncond_input = tokenizer(
    # #     [""] * example_embeds.size(1), padding="max_length", max_length=1024 - example_embeds.size(2), return_tensors="pt"
    # # )
    # # padding = text_encoder(uncond_input.input_ids.to(device))[0]
    
    # print("Padding: ", padding.size(), padding.dtype)
        
    # concat_embeds = []
    # neg_embeds = []
    # for i in range(0, input_ids.shape[-1], max_length):
    #     concat_embeds.append(
    #         torch.cat([text_encoder(input_ids[:, i : i + max_length], attention_mask=attention_mask)[0], padding], dim=2))
    #     neg_embeds.append(
    #         torch.cat([text_encoder(negative_ids[:, i : i + max_length], attention_mask=attention_mask_negative)[0], padding], dim=2))

    
    prompt_embeds = torch.cat(concat_embeds, dim=1)
    negative_prompt_embeds = torch.cat(neg_embeds, dim=1)
    
    print(prompt_embeds.size(), negative_prompt_embeds.size())

    # prompt_embeds = linear(prompt_embeds)
    # negative_prompt_embeds = linear(negative_prompt_embeds)
    
    return prompt_embeds, negative_prompt_embeds


In [5]:
def pipe_rum(id, prompt, pipe, tokenizer, text_encoder, g, device, negative_prompt, height=512, width=512, step=50, cfg=7.5):
    dic = {}
    dic["id"] = id

    # for idx in range(0, 1):
    # prompt = ast.literal_eval(raw_data[f"ans_{idx}"])["prompt"].decode("utf-8")
    # print(prompt)
    # dic[f"prompt_{idx}"] = prompt
    seed = g.seed()
    prompt_embeds, negative_prompt_embeds = get_pipeline_embeds(tokenizer, text_encoder, prompt, device, negative_prompt)
    print(prompt_embeds.size(), negative_prompt_embeds.size())
    image = pipe(
        # prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        height=height,
        width=width,
        generator=g,
        num_inference_steps=step,
        guidance_scale=cfg,
        seed=seed,
    ).images[0]
    img_name = str(id).zfill(6)

    # output_path = f"/data/mty/UF-FGTG_code/data_analysis/output/sample_{id}_OURS_{img_name}.png"
    # image.save(output_path)
    
    # print(seed)
    return image


In [6]:
base_diffuser_model = "/data/mty/UF-FGTG_code/train/cache/models/models--stabilityai--stable-diffusion-2-1-base/snapshots/5ede9e4bf3e3fd1cb0ef2f7a3fff13ee514fdf06"
chinese_clip_model = "/data/mty/UF-FGTG_code/train/cache/models/models--OFA-Sys--chinese-clip-vit-base-patch16/snapshots/36e679e65c2a2fead755ae21162091293ad37834"

pipe = StableDiffusionPipeline.from_pretrained(base_diffuser_model, torch_dtype=torch.float16)
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)
# print(pipe.tokenizer)

model = ChineseCLIPModel.from_pretrained(chinese_clip_model)
# print("ChineseCLIP: ", model)
text_encoder = model.text_model
text_encoder = text_encoder.to(device)
# print("ChineseCLIP: ", text_encoder)
processor = ChineseCLIPProcessor.from_pretrained(chinese_clip_model)
# print(processor)
tokenizer = processor.tokenizer

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ChineseCLIPModel.from_pretrained("/data/mty/UF-FGTG_code/train/work_dir/ours_diffuser_gpt_test_ZH_entire_1e-4/pipeline-final/text_encoder")
text_encoder = model.text_model
text_encoder = text_encoder.to(device)

processor = ChineseCLIPProcessor.from_pretrained("/data/mty/UF-FGTG_code/train/cache/models/models--OFA-Sys--chinese-clip-vit-base-patch16/snapshots/36e679e65c2a2fead755ae21162091293ad37834"
)
# print(processor)
tokenizer = processor.tokenizer

prompt_ZH = "绵长蜿蜒的海滩，热带风情，明亮，简单"
prompt = pre_caption(prompt_ZH, 256)

embedding = text_encoder(tokenizer(prompt, return_tensors="pt").input_ids.to(device))[0]
print(embedding.size())

linear = MLP(input_dim=768, output_dim=1024).to(device)
state_dict = torch.load("/data/mty/UF-FGTG_code/train/work_dir/ours_diffuser_gpt_test_ZH_entire_1e-4/pipeline-final/linear/linear.bin")
new_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} # 删除状态字典中键的 module. 前缀
linear.load_state_dict(new_state_dict)
# print(linear)

embedding = linear(embedding)
print(embedding.size())


You are using a model of type chinese_clip_text_model to instantiate a model of type chinese_clip. This is not supported for all configurations of models and can yield errors.


Some weights of ChineseCLIPModel were not initialized from the model checkpoint at /data/mty/UF-FGTG_code/train/work_dir/ours_diffuser_gpt_test_ZH_entire_1e-4/pipeline-final/text_encoder and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.token_type_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_mode

torch.Size([1, 20, 768])
torch.Size([1, 20, 1024])


In [9]:
print(prompt)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

image_ours = pipe_rum(2, tokenizer=pipe.tokenizer, prompt=prompt, pipe=pipe, text_encoder=pipe.text_encoder, g=g, device=device, negative_prompt="", height=512, width=512, step=50, cfg=7.5)
image_ours.show()


image_SD = pipe(prompt, generator = g).images[0]
image_SD = pipe(prompt, height=512,width=512,generator = g).images[0]
# image.save(f"/data/mty/UF-FGTG_code/data_analysis/output/sample_2_SD_fine.png")
image_SD.show()

绵长蜿蜒的海滩，热带风情，明亮，简单


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
print(prompt)
image_ours = pipe_rum(2, tokenizer=tokenizer, prompt=prompt, pipe=pipe, text_encoder=text_encoder, g=g, device=device, negative_prompt="", height=512, width=512, step=50, cfg=7.5)
image_ours.show()

绵长蜿蜒的海滩，热带风情，明亮，简单


RuntimeError: Device string must not be empty

In [None]:
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(device)
attention_mask = (input_ids != tokenizer.pad_token_id).long()

max_length=77
weight_dtype = torch.float32

concat_embeds = []
for i in range(0, input_ids.shape[-1], max_length):
    concat_embeds.append(text_encoder(input_ids[:, i : i + max_length], attention_mask=attention_mask)[0])
    # neg_embeds.append(
    #     torch.cat([text_encoder(negative_ids[:, i : i + max_length], attention_mask=attention_mask_negative)[0], padding], dim=2))
prompt_embeds = torch.cat(concat_embeds, dim=1)

linear = MLP(input_dim=prompt_embeds.size()[-1], output_dim=1024, dtype=weight_dtype).to(device)
prompt_embeds  = linear(prompt_embeds).to(weight_dtype)
print("Prompt Embeddings:", prompt_embeds.size(), prompt_embeds.dtype)

Prompt Embeddings: torch.Size([1, 20, 1024]) torch.float32


In [None]:
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(device)
print(input_ids.size())
attention_mask = (input_ids != tokenizer.pad_token_id).long()
max_length = 77

concat_embeds = []
# neg_embeds = []
for i in range(0, input_ids.shape[-1], max_length):
    concat_embeds.append(text_encoder(input_ids[:, i : i + max_length], attention_mask=attention_mask)[0])
    # neg_embeds.append(
    #     torch.cat([text_encoder(negative_ids[:, i : i + max_length], attention_mask=attention_mask_negative)[0], padding], dim=2))
prompt_embeds = torch.cat(concat_embeds, dim=1)

print("Concat Embeddings: ", prompt_embeds.size())

    
# padding_input_ids = tokenizer([""] * 20, padding="max_length", max_length=1024 - 768, return_tensors="pt").input_ids
# # padding_input_ids = tokenizer([""] *input_ids.shape[-1], return_tensors="pt").input_ids
# padding_input_ids = padding_input_ids.to(device)
# print(padding_input_ids.size())

# attention_mask_padding = (padding_input_ids != tokenizer.pad_token_id).long().to(device)
# # print(attention_mask_padding, attention_mask_padding.size())
# # print("Padding Embeddings: ", padding_input_ids)
# # for i in range(0, padding_input_ids.shape[-1],max_length):
# #     concat_embeds_padding = text_encoder(padding_input_ids[:, i : i + max_length], attention_mask=attention_mask_padding)[0]
# # padding_embeds = torch.cat(concat_embeds_padding, dim=1)
# padding_embeds = text_encoder(padding_input_ids.to(device), attention_mask=attention_mask_padding)['last_hidden_state']
# print("Padding Embeddings: ", padding_embeds.size())



torch.Size([1, 20])
Concat Embeddings:  torch.Size([1, 20, 768])
torch.Size([20, 256])
Padding Embeddings:  torch.Size([20, 256, 768])


## Flan-T5-large

## stablediffusionapi/anything-midjourney

In [None]:
from diffusers import DiffusionPipeline

model_name = "/data/mty/UF-FGTG_code/train/cache/models--stablediffusionapi--anything-midjourney/snapshots/77716cc9746e86a5aff527860d228e1a7085c77c"

pipe = DiffusionPipeline.from_pretrained("model_name")


In [None]:

image_SD = pipe(prompt, generator = g).images[0]
image_SD = pipe(prompt, height=512,width=512,generator = g).images[0]
# image.save(f"/data/mty/UF-FGTG_code/data_analysis/output/sample_2_SD_fine.png")
image_SD.show()

# Modify the text encoder

In [3]:
import torch.nn as nn

class Adaptive_Text_Encoder(nn.Module):
    def __init__(self, text_encoder, linear):
        super(Adaptive_Text_Encoder, self).__init__()
        self.text_encoder = text_encoder
        # self.linear = MLP(input_dim=input_dim, output_dim=output_dim)  # 实例化linear
        self.linear = linear

    def forward(self, input_ids, attention_mask):
        return self.linear(self.text_encoder(input_ids, attention_mask=attention_mask)[0])

In [5]:
import torch
import torch
import torch.nn as nn
from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler
from transformers import ChineseCLIPProcessor, ChineseCLIPModel, ChineseCLIPConfig
from models import MLP
from dataset.utils import pre_caption

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ChineseCLIPModel.from_pretrained("/data/mty/UF-FGTG_code/train/work_dir/ours_diffuser_gpt_test_ZH_entire_1e-4/pipeline-final/text_encoder")
text_encoder = model.text_model
text_encoder = text_encoder.to(device)

processor = ChineseCLIPProcessor.from_pretrained("/data/mty/UF-FGTG_code/train/cache/models/models--OFA-Sys--chinese-clip-vit-base-patch16/snapshots/36e679e65c2a2fead755ae21162091293ad37834"
)
# print(processor)
tokenizer = processor.tokenizer

prompt_ZH = "绵长蜿蜒的海滩，热带风情，明亮，简单"
prompt = pre_caption(prompt_ZH, 256)

embedding = text_encoder(tokenizer(prompt, return_tensors="pt").input_ids.to(device))[0]
print(embedding.size())

linear = MLP(input_dim=768, output_dim=1024).to(device)
state_dict = torch.load("/data/mty/UF-FGTG_code/train/work_dir/ours_diffuser_gpt_test_ZH_entire_1e-4/pipeline-final/linear/linear.bin")
new_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} # 删除状态字典中键的 module. 前缀
linear.load_state_dict(new_state_dict)
# print(linear)

embedding = linear(embedding)
print(embedding.size())

new_text_encoder = Adaptive_Text_Encoder(text_encoder, linear).to(device)
print(new_text_encoder)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
You are using a model of type chinese_clip_text_model to instantiate a model of type chinese_clip. This is not supported for all configurations of models and can yield errors.
Some weights of ChineseCLIPModel were not initialized from the model checkpoint at /data/mty/UF-FGTG_code/train/work_dir/ours_diffuser_gpt_test_ZH_entire_1e-4/pipeline-final/text_encoder and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.token_type_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.laye

torch.Size([1, 20, 768])
torch.Size([1, 20, 1024])
Adaptive_Text_Encoder(
  (text_encoder): ChineseCLIPTextModel(
    (embeddings): ChineseCLIPTextEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ChineseCLIPTextEncoder(
      (layer): ModuleList(
        (0-11): 12 x ChineseCLIPTextLayer(
          (attention): ChineseCLIPTextAttention(
            (self): ChineseCLIPTextSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ChineseCLIPTextSelfOutput(
   

In [6]:
another_embedding = new_text_encoder(tokenizer(prompt, return_tensors="pt").input_ids.to(device), attention_mask=tokenizer(prompt, return_tensors="pt").attention_mask.to(device))
print(another_embedding.size())

torch.Size([1, 20, 1024])
