# BLIP-2
This notebook uses BLIP-2 for advanced captioning. It requires Google Drive access. It is a huge improvement over BLIP for photoreal images. It has not been tested on anime.

Setting the torch download directory currently does nothing; the models appear to be downloaded to local storage instead.

If run on Google Colab, this requires a 'premium GPU' instance type. It uses 22GB of RAM at peak consumption, 38GB of VRAM, and about 40GB of hard disk space. BLIP-2 is a heavy model. You need to be using a 'premium' GPU Colab instance.

It requires about 0.5s/image to annotate and costs about \$1.30/hr to run, plus about 5 minutes to load the models. There are 3300 seconds in 55 minutes; you can annotate roughly 1000 images for \$1.30. 

It overwrites any existing .caption files found. 
It outputs captions to the same dir that the images exist in. 


In [None]:
# @markdown Install Dependencies
# Packages used by the cells below: pillow provides `PIL.Image` for reading the
# images, and transformers provides the BLIP-2 processor and model classes.
# NOTE(review): accelerate is not imported directly anywhere in the visible
# cells — presumably transformers needs it for model loading; confirm.
!pip install pillow
!pip install transformers[Torch]
!pip install accelerate

In [None]:
# @markdown Load Processor
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
# Run on the GPU whenever torch can see a CUDA device; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Preprocessing/decoding front end for the BLIP-2 OPT-2.7b checkpoint.
# NOTE(review): offload_state_dict looks like a model-loading option and
# presumably has no effect on a processor — confirm.
processor = Blip2Processor.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    offload_state_dict=True,
)


In [None]:
# @markdown Load Model
# Load the BLIP-2 OPT-2.7b conditional-generation model, requesting float16 weights.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, offload_state_dict=True
)
# Move the model to `device` (set in the "Load Processor" cell) as float16.
# NOTE(review): when `device` is "cpu" the model is still float16 — confirm that
# generation is supported in that configuration.
# Being the last expression of the cell, this call's return value is also displayed.
model.to(device, torch.float16)



In [None]:
# @title Set Working Directory
# @markdown Models take a LONG time to download, so download them to google drive
import os
import os
import zipfile
import shutil
import time
from subprocess import getoutput
from IPython.utils import capture
from google.colab import drive

# Name of the dataset folder under MyDrive/Loras; images are read from its
# `dataset` sub-directory and captions are written next to them.
dataset_name = '' #@param {type:'string'}
train_data_dir = f'/content/drive/MyDrive/Loras/{dataset_name}/dataset'

# Mount Google Drive only when it is not mounted yet.
if not os.path.exists("/content/drive"):
    drive.mount("/content/drive")

# Fail early with an actionable message instead of a bare assertion. isdir()
# also rejects a regular file at that path, which os.listdir() could not read.
if not os.path.isdir(train_data_dir):
    raise FileNotFoundError(
        f"Training data directory not found: {train_data_dir!r}. "
        "Check `dataset_name` and that the folder exists on Google Drive."
    )

blip2_model_download_dir = '/content/drive/MyDrive/torch' #@param {type:'string'}
input_images_dir = '/content/drive/MyDrive/'
output_captions_dir = '/content/drive/MyDrive/'
# exist_ok=True makes this idempotent and avoids a check-then-create race.
os.makedirs(blip2_model_download_dir, exist_ok=True)

# Left disabled: the notebook introduction notes that redirecting the torch
# download directory currently has no effect.
# os.environ['TORCH_HOME'] = blip2_model_download_dir

In [None]:
# @markdown generate Annotations Vectors
# Caption-generation settings, exposed as editable Colab form fields.
min_length = 20 # @param {'type':'integer'}
max_length = 40 # @param {'type':'integer'}
top_p = 0.9 # @param {'type':'number'}

def is_image(x):
  """Return True when the file name `x` ends in a supported image extension.

  Supported extensions are bmp, png, jpg and jpeg, matched case-insensitively.
  The extension is taken with os.path.splitext, so a name that has no
  extension at all (for example a file literally called "png") is not
  mistaken for an image.
  """
  ext = os.path.splitext(x)[1]
  return ext.lower() in ('.bmp', '.png', '.jpg', '.jpeg')

from PIL import Image

# Every image file directly inside the training directory gets one caption.
image_names = [x for x in os.listdir(train_data_dir) if is_image(x)]
all_inputs = []
all_text = []
print(image_names)

for i, name in enumerate(image_names):
  fullname = os.path.join(train_data_dir, name)
  # Open inside a context manager so the file handle is released right away.
  # convert('RGB') reads the pixel data before the file is closed and returns
  # an independent image that stays usable afterwards.
  with Image.open(fullname) as raw_image:
    image = raw_image.convert('RGB')

  inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
  # Use the values from the form fields above rather than hard-coded numbers,
  # so that editing the form actually changes the generated captions.
  # NOTE(review): top_p is a sampling parameter and do_sample is not set here,
  # so it presumably has no effect on the output — confirm.
  generated_ids = model.generate(
      **inputs,
      min_length=min_length,
      max_length=max_length,
      top_p=top_p,
      repetition_penalty=2.0,
      num_beams=1,
  )
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
  # Strip only the final extension: "photo.v2.jpg" -> "photo.v2". Splitting on
  # the first dot would map "a.1.jpg" and "a.2.jpg" to the same caption file.
  base_filename = os.path.splitext(name)[0]
  # Overwrites any existing .caption file for this image.
  with open(os.path.join(train_data_dir, f'{base_filename}.caption'), 'w', encoding='utf-8') as fp:
    fp.write(generated_text)
  print(f'{i}:{name}:{generated_text}')

In [None]:
# Guard: refuse to run unless a global named `step1_installed_flag` exists.
# NOTE(review): nothing in the visible cells assigns `step1_installed_flag`, so
# this cell would always raise here — confirm where "step 1" is defined.
if "step1_installed_flag" not in globals():
  raise Exception("Please run step 1 first!")
  
#@markdown ### 5️⃣ Curate your tags
#@markdown Modify your dataset's tags. You can run this cell multiple times with different parameters. <p>

#@markdown Put an activation tag at the start of every text file. This is useful to make learning better and activate your Lora easier. Set `keep_tokens` to 1 when training.<p>
#@markdown Common tags that are removed such as hair color, etc. will be "absorbed" by your activation tag.
# Both fields are comma-separated tag lists.
global_activation_tag = "" #@param {type:"string"}
remove_tags = "" #@param {type:"string"}
#@markdown &nbsp;

#@markdown In this advanced section, you can search text files containing matching tags, and replace them with less/more/different tags. If you select the checkbox below, any extra tags will be put at the start of the file, letting you assign different activation tags to different parts of your dataset. Still, you may want a more advanced tool for this.
# Comma-separated tag lists; `search_mode` selects whether any ("OR") or all
# ("AND") of the searched tags must be present for a file to match.
search_tags = "" #@param {type:"string"}
replace_with = "" #@param {type:"string"}
search_mode = "OR" #@param ["OR", "AND"]
new_becomes_activation_tag = False #@param {type:"boolean"}
#@markdown These may be useful sometimes. Will remove existing activation tags, be careful.
sort_alphabetically = False #@param {type:"boolean"}
remove_duplicates = False #@param {type:"boolean"}

def split_tags(tagstr):
  """Split a comma-separated string into a list of trimmed, non-empty tags."""
  trimmed = (piece.strip() for piece in tagstr.split(","))
  return [piece for piece in trimmed if piece]

# Parse the comma-separated form fields into tag lists.
activation_tag_list = split_tags(global_activation_tag)
remove_tags_list = split_tags(remove_tags)
search_tags_list = split_tags(search_tags)
replace_with_list = split_tags(replace_with)
# Replacement tags that are not among the searched tags count as "new" tags;
# the remainder of replace_with_list are searched tags that should be kept.
replace_new_list = [t for t in replace_with_list if t not in search_tags_list]

replace_with_list = [t for t in replace_with_list if t not in replace_new_list]
# Reversed because tags are inserted one by one at index 0 below, which leaves
# them at the front of the file in the order they were typed.
replace_new_list.reverse()
activation_tag_list.reverse()

remove_count = 0
replace_count = 0

# NOTE(review): `images_folder` is not assigned in the visible cells — confirm
# which directory this is expected to point at.
for txt in [f for f in os.listdir(images_folder) if f.lower().endswith(".txt")]:

  with open(os.path.join(images_folder, txt), 'r') as f:
    # Reuse split_tags so empty entries (empty file, trailing comma) are
    # dropped instead of being written back as dangling ", " separators.
    tags = split_tags(f.read())

  if remove_duplicates:
    # dict.fromkeys keeps the first occurrence of each tag in its original
    # position; a set would shuffle the tags arbitrarily.
    tags = list(dict.fromkeys(tags))
  if sort_alphabetically:
    tags.sort()

  for rem in remove_tags_list:
    if rem in tags:
      remove_count += 1
      tags.remove(rem)

  # A file matches when all ("AND") or any ("OR") of the searched tags are present.
  if "AND" in search_mode and all(r in tags for r in search_tags_list) \
      or "OR" in search_mode and any(r in tags for r in search_tags_list):
    replace_count += 1
    for rem in search_tags_list:
      if rem in tags:
        tags.remove(rem)
    for add in replace_with_list:
      if add not in tags:
        tags.append(add)
    for new in replace_new_list:
      if new_becomes_activation_tag:
        # Move (or add) the new tag to the very front of the file.
        if new in tags:
          tags.remove(new)
        tags.insert(0, new)
      else:
        if new not in tags:
          tags.append(new)

  # Global activation tags always end up first, ahead of everything else.
  for act in activation_tag_list:
    if act in tags:
      tags.remove(act)
    tags.insert(0, act)

  with open(os.path.join(images_folder, txt), 'w') as f:
    f.write(", ".join(tags))

if remove_tags:
  print(f"\n🚮 Removed {remove_count} tags.")
if search_tags:
  print(f"\n💫 Replaced in {replace_count} files.")
print("\n✅ Done!")


In [None]:
#@title ## 4.3. Merge Annotation Into JSON 
import os
%store -r

os.chdir(finetune_dir)

#@markdown Cleaning tags and captions, then merges them into a single JSON file, which will be used as the input for the bucketing section.
meta_clean = f"/content/drive/MyDrive/Loras/{dataset_name}/meta_clean.json"
parent_folder = os.path.dirname(meta_clean)
meta_cap_dd = f"{parent_folder}/meta_cap_dd.json"
meta_cap = f"{parent_folder}/meta_cap.json"

os.makedirs(parent_folder, exist_ok=True)

if os.path.isdir(train_data_dir):
  if any(file.endswith('.caption') for file in os.listdir(train_data_dir)):
    !python merge_captions_to_metadata.py \
      {train_data_dir} \
      {meta_cap}

  if any(file.endswith('.txt') for file in os.listdir(train_data_dir)):
    !python merge_dd_tags_to_metadata.py \
      {train_data_dir} \
      {meta_cap_dd}
else:
  print("train_data_dir does not exist or is not a directory.")

if os.path.exists(meta_cap):
  !python merge_dd_tags_to_metadata.py \
    {train_data_dir} \
    --in_json {meta_cap} \
    {meta_cap_dd}

if os.path.exists(meta_cap_dd):
  !python clean_captions_and_tags.py \
    {meta_cap_dd} \
    {meta_clean}
elif os.path.exists(meta_cap):
  !python clean_captions_and_tags.py \
    {meta_cap} \
    {meta_clean}