# The notebook for training the text to image model

## Package Preparation

### Import packages

In [2]:
# !pip install -q datasets
# !pip install -q transformers
# !pip install -q accelerate
# !pip install -q git+https://github.com/huggingface/diffusers

In [3]:
import logging
import math
import os
import random
import glob
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset, Dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
from torchvision import transforms
from tqdm.notebook import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
from diffusers.loaders import AttnProcsLayers
from diffusers.models.attention_processor import LoRAAttnProcessor
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available

### Check diffuser version & Save model card

In [4]:
check_min_version("0.16.0.dev0")

logger = get_logger(__name__, log_level="INFO")

In [5]:
def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
    img_str = ""
    for i, image in enumerate(images):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"![img_{i}](./image_{i}.png)\n"

    yaml = f"""
---
license: creativeml-openrail-m
base_model: {base_model}
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- lora
inference: true
---
    """
    model_card = f"""
# LoRA text2image fine-tuning - {repo_id}
These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
{img_str}
"""
    with open(os.path.join(repo_folder, "README.md"), "w") as f:
        f.write(yaml + model_card)

## Set Basic Arguments

### Saving Directory

In [6]:
#@markdown If model weights should be saved directly in google drive (takes around 4-5 GB).
save_to_gdrive = False #@param {type:"boolean"}
if save_to_gdrive:
    from google.colab import drive
    drive.mount('/content/drive')

#@markdown Name/Path of the initial model.
pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5" #@param {type:"string"}

#@markdown Enter the directory name to save model at.

# output_dir = "ml_stable_diffusion_weights/lora" #@param {type:"string"}
# if save_to_gdrive:
#     output_dir = "/content/drive/MyDrive/" + output_dir
# else:
#     output_dir = "/content/" + output_dir

# print(f"[*] Weights will be saved at {output_dir}")

# !mkdir -p $output_dir
output_dir = "lora_output"

### Configure Accelerator

In [7]:
logging_dir = os.path.join(output_dir, "logs")
accelerator_project_config = ProjectConfiguration(total_limit=None)

accelerator = Accelerator(
        gradient_accumulation_steps=1,
        mixed_precision="fp16",
        log_with="tensorboard",
        logging_dir=logging_dir,
        project_config=accelerator_project_config,
    )



### Handle Repository Creation

In [8]:
if accelerator.is_main_process:
        if output_dir is not None:
            os.makedirs(output_dir, exist_ok=True)

### Load scheduler, tokenizer, models

In [9]:
noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path, subfolder="tokenizer", revision=None
)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="text_encoder", revision=None
)
vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", revision=None)
unet = UNet2DConditionModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="unet", revision=None
)
# freeze parameters of models to save more memory
unet.requires_grad_(False)
vae.requires_grad_(False)

text_encoder.requires_grad_(False)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [10]:
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
    weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
    weight_dtype = torch.bfloat16

### Move unet, vae, text_encoder to device

In [11]:
print(accelerator.device)

cuda


In [12]:
unet.to(accelerator.device, dtype=weight_dtype)
vae.to(accelerator.device, dtype=weight_dtype)
text_encoder.to(accelerator.device, dtype=weight_dtype)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

## Start adding LoRA weights to attention layers

    # It's important to realize here how many attention weights will be added and of which sizes
    # The sizes of the attention layers consist only of two different variables:
    # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
    # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.

    # Let's first see how many attention processors we will have to set.
    # For Stable Diffusion, it should be equal to:
    # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
    # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
    # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
    # => 32 layers

### Set correct lora layers

In [13]:
lora_attn_procs = {}
for name in unet.attn_processors.keys():
  cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
  # print(name)
  if name.startswith("mid_block"):
    # print(unet.config.block_out_channels)
    hidden_size = unet.config.block_out_channels[-1]
  elif name.startswith("up_blocks"):
    block_id = int(name[len("up_blocks.")])
    hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    # print(hidden_size)
  elif name.startswith("down_blocks"):
    block_id = int(name[len("down_blocks.")])
    hidden_size = unet.config.block_out_channels[block_id]
    # print(hidden_size)

  lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)

unet.set_attn_processor(lora_attn_procs)
lora_layers = AttnProcsLayers(unet.attn_processors)


### Initalize optimizers

In [14]:
#@markdown Parameters for adamW

optimizer_cls = torch.optim.AdamW
learning_rate = 1e-4 #@param {type:"number"}
adam_beta1 = 0.9 #@param {type:"number"}
adam_beta2 = 0.999 #@param {type:"number"}
adam_weight_decay = 1e-2 #@param {type:"number"}
adam_epsilon = 1e-08 #@param {type:"number"}

In [15]:
optimizer = optimizer_cls(
    lora_layers.parameters(),
    lr=learning_rate,
    betas=(adam_beta1,adam_beta2),
    weight_decay=adam_weight_decay,
    eps=adam_epsilon,
)

### Load Quickdraw Dataset

#### Read the class name

In [16]:
# !wget 'https://raw.githubusercontent.com/zaidalyafeai/zaidalyafeai.github.io/master/sketcher/mini_classes.txt'

In [17]:
f = open("mini_classes.txt","r")
# And for reading use
classes = f.readlines()
f.close()

In [18]:
classes = [c.replace('\n','').replace(' ','_') for c in classes]
print(len(classes))

100


#### Download Data

In [19]:
# !mkdir data

In [20]:
import urllib.request
from tqdm.auto import tqdm
def download():
    base = 'https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/'
    for c in tqdm(classes):        
        cls_url = c.replace('_', '%20')
        path = base+cls_url+'.npy'
        # print(path)
        urllib.request.urlretrieve(path, 'data/'+c+'.npy')

In [21]:
# download()

#### load the data

In [22]:
def load_data_for_diffusion(root, max_items_per_class= 4000 ):
    all_files = glob.glob(os.path.join(root, '*.npy'))

    #initialize variables
    imgs = np.empty([0, 784])
    labels = []

    for idx, file in enumerate(all_files):
      data = np.load(file)
      data = data[0: max_items_per_class, :]

      class_name, ext = os.path.splitext(os.path.basename(file))
      labels.extend(["a scribble of " + class_name for i in range(data.shape[0])])

      imgs = np.concatenate((imgs, data), axis=0)


    return imgs, labels
    

In [23]:
imgs, labels = load_data_for_diffusion('data')

In [24]:
from PIL import Image
def gen_for_hf(imgs, labels):
    for idx in range(len(imgs)):
        img = Image.fromarray(imgs[idx].reshape(28,28))
        label = labels[idx]
        yield {"image": img, "text": label}

In [25]:
scribble_dataset = Dataset.from_generator(gen_for_hf, gen_kwargs={"imgs": imgs, "labels": labels})

Found cached dataset generator (/home/yg2709/.cache/huggingface/datasets/generator/default-5726a1d654ce8249/0.0.0)


#### Tokenize labels

In [26]:
def tokenize_captions(examples):
    captions = []
    for caption in examples["text"]:
        if isinstance(caption, str):
            captions.append(caption)
        elif isinstance(caption, (list, np.ndarray)):
            # take a random caption if there are multiple
            captions.append(random.choice(caption) if is_train else caption[0])
            
    inputs = tokenizer(
            captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
    return inputs.input_ids

#### Preprocess Images

In [27]:
train_transforms = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

In [28]:
def preprocess_train(examples):
    images = [image.convert("RGB") for image in examples["image"]]
    examples["pixel_values"] = [train_transforms(image) for image in images]
    examples["input_ids"] = tokenize_captions(examples)
    return examples

In [29]:
train_dataset = scribble_dataset.with_transform(preprocess_train)

#### DataLoaders Creation

In [30]:
def collate_fn(examples):
    pixel_values = torch.stack([example['pixel_values'] for example in examples])
    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
    input_ids = torch.stack([example['input_ids'] for example in examples])
    return {"pixel_values": pixel_values, "input_ids": input_ids}
    

In [31]:
train_batch_size=16

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=collate_fn,
    batch_size=train_batch_size,
    num_workers=0,
)

### Training Preparations

#### Scheduler and math around the number of training steps

In [32]:
gradient_accumulation_steps = 1
num_train_epochs = 100
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
lr_warmup_steps = 500

In [33]:
lr_scheduler = "constant"
lr_scheduler = get_scheduler(
    lr_scheduler,
    optimizer=optimizer,
    num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
    num_training_steps=max_train_steps * gradient_accumulation_steps,
)

#### prepare everything with accelerator

In [34]:
lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        lora_layers, optimizer, train_dataloader, lr_scheduler
    )

#### Recalculate our total training steps as the size of the training dataloader may have changed

In [35]:
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch

### Train

In [36]:
total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps

logger.info("***** Running training *****")
logger.info(f"  Num examples = {len(labels)}")
logger.info(f"  Num Epochs = {num_train_epochs}")
logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
logger.info(f"  Total optimization steps = {max_train_steps}")

global_step = 0
first_epoch = 0

seed = 1337

In [None]:
progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)
progress_bar.set_description("Steps")

checkpointing_steps = 500
validation_epochs = 100
num_validation_images = 1
max_grad_norm = 1.0

for epoch in range(first_epoch, num_train_epochs):
    unet.train()
    train_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(unet):
            # convert images to latent space
            latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
            latents = latents * vae.config.scaling_factor
            
            # sample noise that we'll add to the latents
            noise = torch.randn_like(latents)
            bsz = latents.shape[0]
            
            # Sample a random timestep for each image
            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
            timesteps = timesteps.long()
            
            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
            
            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(batch["input_ids"])[0]
            
            # Get the target for loss depending on the prediction type
            if noise_scheduler.config.prediction_type == "epsilon":
                target = noise
            elif noise_scheduler.config.prediction_type == "v_prediction":
                target = noise_scheduler.get_velocity(latents, noise, timesteps)
            else:
                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
                
            
            # Predict the noise residual and compute loss
            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
            loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
            
            # Gather the losses across all processes
            avg_loss = accelerator.gather(loss.repeat(train_batch_size)).mean()
            train_loss += avg_loss.item() / gradient_accumulation_steps
            
            # Backpropagate
            accelerator.backward(loss)
            if accelerator.sync_gradients:
                params_to_clip = lora_layers.parameters()
                accelerator.clip_grad_norm_(params_to_clip, max_grad_norm)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            
            # update procress bar
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
#                 accelerator.log({"train_loss":  train_loss}, step=global_step)
                train_loss = 0.0
                
                if global_step % checkpointing_steps == 0:
                    if accelerator.is_main_process:
                        save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
                        accelerator.save_state(save_path)
                        logger.info(f"Saved state to {save_path}")
            
            print(f"global_step: {global_step}, step_loss: {loss.detach().item()}, lr: {lr_scheduler.get_last_lr()[0]}")
            logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            
            if global_step >= max_train_steps:
                break
        
        if accelerator.is_main_process:
            validation_prompt = "a scribble of horse"
            if epoch % validation_epochs == 0:
                logger.info(
                    f"Running validation... \n Generating {num_validation_images} images with prompt:"
                    f" {validation_prompt}."
                )
                
                # create pipeline
                pipeline = DiffusionPipeline.from_pretrained(
                    pretrained_model_name_or_path,
                    unet=accelerator.unwrap_model(unet),
                    revision=None,
                    torch_dtype=weight_dtype,
                )
                
                pipeline = pipeline.to(accelerator.device)
                pipeline.set_progress_bar_config(disable=True)
                
                # run inference
                generator = torch.Generator(device=accelerator.device).manual_seed(seed)
                images = []
                for _ in range(num_validation_images):
                    images.append(
                        pipeline(validation_prompt, num_inference_steps=30, generator=generator).images[0]
                    )
                    
                del pipeline
                torch.cuda.empty_cache()

# save lora layers
accelerator.wait_for_everyone()
if accelerator.is_main_process:
    unet = unet.to(torch.float32)
    unet.save_attn_procs(output_dir)

# final inference
pipeline = DiffusionPipeline.from_pretrained(
    pretrained_model_name_or_path, revision=None, torch_dtype=weight_dtype
)

pipeline = pipeline.to(accelerator.device)

# load attention processors
pipeline.unet.load_attn_procs(output_dir)

# run inference
generator = torch.Generator(device=accelerator.device).manual_seed(seed)
images = []

for _ in range(num_validation_images):
    images.append(pipeline(validation_prompt, num_inference_steps=30, generator=generator).images[0])
    
accelerator.end_training()


  0%|          | 0/2500000 [00:00<?, ?it/s]

global_step: 1, step_loss: 0.52974933385849, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 2, step_loss: 0.3814193308353424, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 3, step_loss: 0.3743318021297455, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 4, step_loss: 0.3684229254722595, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 5, step_loss: 0.4837562143802643, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 6, step_loss: 0.4790908992290497, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 7, step_loss: 0.3986286222934723, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 8, step_loss: 0.3352173864841461, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 9, step_loss: 0.35469546914100647, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 10, step_loss: 0.4134896993637085, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 11, step_loss: 0.4479493200778961, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 12, step_loss: 0.3156116306781769, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 13, step_loss: 0.30999135971069336, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 14, step_loss: 0.3117743134498596, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 15, step_loss: 0.2867673337459564, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 16, step_loss: 0.25860270857810974, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 17, step_loss: 0.2974352240562439, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 18, step_loss: 0.5138920545578003, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 19, step_loss: 0.3241399824619293, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 20, step_loss: 0.2528870403766632, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 21, step_loss: 0.32042041420936584, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 22, step_loss: 0.5460814237594604, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 23, step_loss: 0.43121328949928284, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 24, step_loss: 0.25599047541618347, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 25, step_loss: 0.36573103070259094, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 26, step_loss: 0.47078052163124084, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 27, step_loss: 0.34885352849960327, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 28, step_loss: 0.3196282982826233, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 29, step_loss: 0.2990761995315552, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 30, step_loss: 0.3655814230442047, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 31, step_loss: 0.35368189215660095, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 32, step_loss: 0.2551049292087555, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 33, step_loss: 0.39433425664901733, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 34, step_loss: 0.2983705699443817, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 35, step_loss: 0.25776925683021545, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 36, step_loss: 0.30165913701057434, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 37, step_loss: 0.2652347683906555, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 38, step_loss: 0.25184816122055054, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 39, step_loss: 0.14942391216754913, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 40, step_loss: 0.3801753520965576, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 41, step_loss: 0.25541749596595764, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 42, step_loss: 0.49452176690101624, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 43, step_loss: 0.3629019260406494, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 44, step_loss: 0.34574922919273376, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 45, step_loss: 0.23280484974384308, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 46, step_loss: 0.29404884576797485, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 47, step_loss: 0.16487427055835724, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 48, step_loss: 0.4021977186203003, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 49, step_loss: 0.3791680932044983, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 50, step_loss: 0.20474191009998322, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 51, step_loss: 0.2497936338186264, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 52, step_loss: 0.18877777457237244, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 53, step_loss: 0.275510311126709, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 54, step_loss: 0.33144664764404297, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 55, step_loss: 0.24595727026462555, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 56, step_loss: 0.41139304637908936, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 57, step_loss: 0.1931467056274414, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 58, step_loss: 0.31094449758529663, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 59, step_loss: 0.2899482548236847, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 60, step_loss: 0.30175167322158813, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 61, step_loss: 0.1966894268989563, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 62, step_loss: 0.27722740173339844, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 63, step_loss: 0.2507131099700928, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 64, step_loss: 0.24604040384292603, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 65, step_loss: 0.2435561567544937, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 66, step_loss: 0.36081463098526, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 67, step_loss: 0.3413030207157135, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 68, step_loss: 0.1917014718055725, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 69, step_loss: 0.4362647831439972, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 70, step_loss: 0.17234942317008972, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 71, step_loss: 0.27897289395332336, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 72, step_loss: 0.266463965177536, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 73, step_loss: 0.4411681592464447, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 74, step_loss: 0.25414493680000305, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 75, step_loss: 0.28698641061782837, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 76, step_loss: 0.1803690642118454, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 77, step_loss: 0.2938162684440613, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 78, step_loss: 0.19896768033504486, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 79, step_loss: 0.22977808117866516, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 80, step_loss: 0.16777080297470093, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 81, step_loss: 0.2155599147081375, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 82, step_loss: 0.209578275680542, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 83, step_loss: 0.24694758653640747, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 84, step_loss: 0.2821265757083893, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 85, step_loss: 0.3504321873188019, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 86, step_loss: 0.18950971961021423, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 87, step_loss: 0.35168540477752686, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 88, step_loss: 0.2671871483325958, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 89, step_loss: 0.432254433631897, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 90, step_loss: 0.18840989470481873, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 91, step_loss: 0.24823035299777985, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 92, step_loss: 0.3056960105895996, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 93, step_loss: 0.2667634189128876, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 94, step_loss: 0.11982332170009613, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 95, step_loss: 0.260465532541275, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 96, step_loss: 0.2780289947986603, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 97, step_loss: 0.2637275457382202, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 98, step_loss: 0.15437987446784973, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 99, step_loss: 0.2243853211402893, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 100, step_loss: 0.1648409515619278, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 101, step_loss: 0.3855959177017212, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 102, step_loss: 0.30214715003967285, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 103, step_loss: 0.20578886568546295, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 104, step_loss: 0.2786537706851959, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 105, step_loss: 0.1727059930562973, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 106, step_loss: 0.2615479826927185, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 107, step_loss: 0.29357874393463135, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 108, step_loss: 0.14179812371730804, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 109, step_loss: 0.2606171667575836, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 110, step_loss: 0.18360498547554016, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 111, step_loss: 0.30401188135147095, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 112, step_loss: 0.2514961361885071, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 113, step_loss: 0.2063683718442917, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 114, step_loss: 0.19438958168029785, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 115, step_loss: 0.17791710793972015, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 116, step_loss: 0.25345098972320557, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 117, step_loss: 0.3120306432247162, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 118, step_loss: 0.17740391194820404, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 119, step_loss: 0.22307640314102173, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 120, step_loss: 0.1944839507341385, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 121, step_loss: 0.245819091796875, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 122, step_loss: 0.2673543095588684, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 123, step_loss: 0.3601320683956146, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 124, step_loss: 0.23165297508239746, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 125, step_loss: 0.18146541714668274, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 126, step_loss: 0.2616788446903229, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 127, step_loss: 0.31650710105895996, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 128, step_loss: 0.29935142397880554, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 129, step_loss: 0.30722421407699585, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 141, step_loss: 0.21247579157352448, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 142, step_loss: 0.16184134781360626, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 143, step_loss: 0.2259223759174347, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 144, step_loss: 0.34734025597572327, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 145, step_loss: 0.23367561399936676, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 146, step_loss: 0.30495119094848633, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 147, step_loss: 0.375996470451355, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 148, step_loss: 0.3053724765777588, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 149, step_loss: 0.30946484208106995, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 150, step_loss: 0.18543440103530884, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 151, step_loss: 0.32246294617652893, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 152, step_loss: 0.2802824378013611, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 153, step_loss: 0.19783946871757507, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 154, step_loss: 0.16010163724422455, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 155, step_loss: 0.11871157586574554, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 156, step_loss: 0.2180098444223404, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 157, step_loss: 0.32783618569374084, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 158, step_loss: 0.16241993010044098, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 159, step_loss: 0.3370039761066437, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 160, step_loss: 0.27323293685913086, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 161, step_loss: 0.31474730372428894, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 162, step_loss: 0.30353844165802, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 163, step_loss: 0.2624572217464447, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 164, step_loss: 0.2889457941055298, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 165, step_loss: 0.2233557254076004, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 166, step_loss: 0.3111274838447571, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 167, step_loss: 0.24343343079090118, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 168, step_loss: 0.15057767927646637, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 169, step_loss: 0.26032692193984985, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 170, step_loss: 0.21952731907367706, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 171, step_loss: 0.2393750697374344, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 172, step_loss: 0.2602710425853729, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 173, step_loss: 0.2306673526763916, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 174, step_loss: 0.16212324798107147, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 175, step_loss: 0.16055995225906372, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 176, step_loss: 0.3451288640499115, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 177, step_loss: 0.16703253984451294, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 178, step_loss: 0.2200564593076706, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 179, step_loss: 0.14943332970142365, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 180, step_loss: 0.3396618664264679, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 181, step_loss: 0.13699443638324738, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 182, step_loss: 0.16717299818992615, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 183, step_loss: 0.31362438201904297, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 184, step_loss: 0.22855530679225922, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 185, step_loss: 0.16615575551986694, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 186, step_loss: 0.18606765568256378, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 187, step_loss: 0.29501745104789734, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 188, step_loss: 0.227791890501976, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 189, step_loss: 0.19661781191825867, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 190, step_loss: 0.2564595639705658, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 191, step_loss: 0.1943182647228241, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 192, step_loss: 0.26097679138183594, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 193, step_loss: 0.21109521389007568, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 194, step_loss: 0.2673303186893463, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 195, step_loss: 0.3968947231769562, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 196, step_loss: 0.2343352884054184, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 197, step_loss: 0.19622445106506348, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 198, step_loss: 0.23034636676311493, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 199, step_loss: 0.18811586499214172, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


global_step: 200, step_loss: 0.2567027509212494, lr: 0.0001


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
