# Preparing environment

In [None]:
# Install the project's dependencies. `%pip` (rather than `! pip`) installs into the
# environment of the running kernel, which is not necessarily the `pip` found first on PATH.
%pip install -r requirements.txt

# Scraping metadata

First, we need to scrape the metadata of the images. [Civitai](https://civitai.com/) serves this metadata in paginated form through the [models](https://github.com/civitai/civitai/wiki/REST-API-Reference#models) endpoint of its REST API.


In [None]:
# Run the metadata spider script as a shell command (`!` hands the line to the system shell).
# NOTE(review): presumably this writes the `data/meta-*.json` page files that the merging
# cell reads — confirm in SUR_meta_spider.py.
! python SUR_meta_spider.py

# Merging data

The scraped metadata is stored as one JSON file per page (`data/meta-*.json`), so we need to clean and merge it. The processed data is written to `data/data.json`.

In [None]:
import json
import glob

# Merge every scraped page file into a single in-memory structure.
# The paths are sorted so the merge order is reproducible: glob returns matches in
# arbitrary order, and the deduplication step further down keeps the first item it
# sees for each image name.
file_list = sorted(glob.glob("data/meta-*.json"))
data = {"metadata": {"totalItems": 0, "totalPages": 0}, "items": []}

for page_path in file_list:
    # Read the JSON as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(page_path, "r", encoding="utf-8") as f:
        data["items"].extend(json.load(f)["items"])

# One file per scraped page, so the number of files is the number of pages.
data["metadata"]["totalPages"] = len(file_list)
data["metadata"]["totalItems"] = len(data["items"])

print('totalPages', data["metadata"]["totalPages"])
print('totalItems', data["metadata"]["totalItems"])


# Flatten models -> model versions -> images into one list of prompt records.
# Each kept record has the image "url", its "prompt" and its "negativePrompt".
image_data = {"metadata": {"totalItems": 0, "imageItems": 0, "repetitionItems": 0, "ReDupItems": 0}, "items": []}
count = 0  # every image seen, whether or not it carries a usable prompt
for model in data["items"]:
    for version in model["modelVersions"]:
        for image in version["images"]:
            count += 1
            # Skip images that lack a url or a prompt with explicit checks rather than a
            # blanket try/except, so genuine errors (and Ctrl-C) are no longer swallowed.
            meta = image.get("meta") if isinstance(image, dict) else None
            if not isinstance(meta, dict) or "url" not in image or "prompt" not in meta:
                continue
            image_data["items"].append({
                "url": image["url"],
                "prompt": meta["prompt"],
                # The negative prompt is optional; fall back to an empty string.
                "negativePrompt": meta.get("negativePrompt", ""),
            })
image_data["metadata"]["totalItems"] = count
image_data["metadata"]["imageItems"] = len(image_data["items"])
print('totalImages', image_data["metadata"]["totalItems"])
print('totalImages with prompt', image_data["metadata"]["imageItems"])


# Deduplicate the prompt records by image name, keeping the first record seen for
# each name. The image name is the third-from-last "/"-separated part of the url.
origin_image_item = image_data["items"]
image_data["items"] = []
url_dict = {}   # image name -> 1 once the name has been seen
url_total = {}  # image name -> every record that shares this name

for record in origin_image_item:
    image_name = record["url"].split("/")[-3]
    if image_name in url_dict:
        # Already kept a record for this name; only remember the duplicate.
        url_total[image_name].append(record)
        continue
    record["image_name"] = image_name
    image_data["items"].append(record)
    url_dict[image_name] = 1
    url_total[image_name] = [record]

# Number of image names that occurred more than once.
count = sum(1 for records in url_total.values() if len(records) != 1)

image_data["metadata"]["repetitionItems"] = count
image_data["metadata"]["ReDupItems"] = len(image_data["items"])
print("repetition count", count)
print("totalImages after deduplication", image_data["metadata"]["ReDupItems"])

# Persist the deduplicated records together with their summary counters.
serialized = json.dumps(image_data)
with open("data/data.json", "w") as out_file:
    out_file.write(serialized)

# Display the first record as a quick sanity check of the stored schema.
image_data["items"][0]

# Scraping images

Because the total number of images is very large, we highly recommend that users who intend to scrape all of them modify the code to use multiprocessing for the scraping process.

In [None]:
# Run the image spider script as a shell command (`!` hands the line to the system shell).
# NOTE(review): presumably this downloads each image to `image/<image_name>.png`, the path
# the captioning cell opens — confirm in SUR_image_spider.py.
! python SUR_image_spider.py

# Generating captions for images

If users are generating captions for a large number of images, we highly recommend that they modify the code to support batch processing and multi-GPU parallel mode.

In [None]:
from lavis.models import load_model_and_preprocess
from PIL import Image
import torch
import json
import warnings
warnings.filterwarnings("ignore")

# Use GPU number `cuda` when CUDA is available, otherwise fall back to the CPU.
cuda = 0
device = torch.device(f"cuda:{cuda}" if torch.cuda.is_available() else "cpu")
blip_model, vis_processors, _ = load_model_and_preprocess(name="blip_caption", model_type="base_coco", is_eval=True, device=device)


# Load the deduplicated records written by the merging step.
with open("data/data.json", "r") as f:
    data = json.load(f)["items"]
print(len(data))

# Write one JSON line per captioned image: file name, generated caption, original prompt.
skipped = 0
with open("data/metadata.jsonl", "w") as f:
    for item in data:
        image_path = f"image/{item['image_name']}.png"
        try:
            # The context manager closes the file handle; convert() loads the pixel data
            # and returns a separate image, so it stays usable after the file is closed.
            with Image.open(image_path) as img:
                raw_image = img.convert("RGB")
        except OSError as exc:
            # A missing or unreadable image should not abort the whole run after the
            # output file has already been truncated; report it and move on.
            skipped += 1
            print(f"skipping {image_path}: {exc}")
            continue
        image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
        # Inference only: disable autograd so no gradient bookkeeping is kept.
        with torch.no_grad():
            caption = blip_model.generate({"image": image})[0]
        json.dump({"file_name": item['image_name'], "text": caption, "prompt": item['prompt']}, f)
        f.write('\n')

if skipped:
    print(f"skipped {skipped} of {len(data)} images")



# Generating the vector of `text`

The final step of data processing is to extract a vector for each `text` (image caption) with [LLaMA](https://github.com/facebookresearch/llama). For the processing workflow, please refer to the [Prompt2vec](https://github.com/Qrange-group/SUR-adapter#-prompt2vec) section of the SUR-adapter README.