# 1. Metadata

In [5]:
import pandas as pd
from pathlib import Path
import pickle

from collections import Counter
from tqdm.notebook import tqdm
import unicodedata

In [6]:
train_data_dir = Path("../input/")

metadata_2m = pd.read_parquet(train_data_dir / "DiffusionDB_2M/metadata.parquet")
metadata_2m["image_name"] = "DiffusionDB_2M/" + metadata_2m["image_name"]
metadata_2m["prompt"] = metadata_2m["prompt"].str.lower().str.strip()
metadata_2m = metadata_2m[["image_name", "prompt", "height", "width"]]

metadata_sd2_v2 = pd.read_parquet(train_data_dir / "gustavosta-sd2-v2/metadata.parquet")
metadata_sd2_v2["image_name"] = "gustavosta-sd2-v2/" + metadata_sd2_v2["image_name"]
metadata_sd2_v2["height"] = 512
metadata_sd2_v2["width"] = 512
metadata_sd2_v2["prompt"] = metadata_sd2_v2["prompt"].str.lower().str.strip()

metadata_sd3 = pd.read_csv(train_data_dir / "sd3/metadata.csv")
metadata_sd3["image_name"] = "sd3/" + metadata_sd3["image_path"]
del metadata_sd3["image_path"]
metadata_sd3["height"] = 512
metadata_sd3["width"] = 512
metadata_sd3["prompt"] = metadata_sd3["prompt"].str.lower().str.strip()

metadata = pd.concat([metadata_2m, metadata_sd2_v2, metadata_sd3], ignore_index=True)
metadata

Unnamed: 0,image_name,prompt,height,width
0,DiffusionDB_2M/2217ccbd-a1c6-47ac-9a2d-7964972...,"a portrait of a female robot made from code, v...",512,512
1,DiffusionDB_2M/c78807b7-d55a-4a2d-a6b6-9192b18...,a portrait of a female robot made from a cloud...,512,512
2,DiffusionDB_2M/dc71658a-5e4b-4dca-861a-e153551...,"only memories remain, trending on artstation",512,512
3,DiffusionDB_2M/48eb7e17-a3cf-4eb8-96a9-d8e3e23...,dream swimming pool with nobody,512,512
4,DiffusionDB_2M/601d9792-eccd-4850-97a7-edbe91d...,a dog doing weights. epic oil painting.,768,512
...,...,...,...,...
2111934,sd3/artifacts/sd-img-to-prompts:v29/00996.png,a greenhouse with deep green and purple glowin...,512,512
2111935,sd3/artifacts/sd-img-to-prompts:v29/00997.png,japanese female idol,512,512
2111936,sd3/artifacts/sd-img-to-prompts:v29/00998.png,1 9 2 0 s color spirit photography 0 9 1 1 2 1...,512,512
2111937,sd3/artifacts/sd-img-to-prompts:v29/00999.png,gary busey doing a sweet skateboard trick off ...,512,512


### Prompts interseption 

In [7]:
prompts_2m = set(metadata_2m["prompt"].tolist())
prompts_sd2 = set(metadata_sd2_v2["prompt"].tolist())
prompts_sd3 = set(metadata_sd3["prompt"].tolist())

In [8]:
print(len(prompts_2m.intersection(prompts_sd2)) / len(prompts_sd2))
print(len(prompts_2m.intersection(prompts_sd3)) / len(prompts_sd3))
print(len(prompts_sd2.intersection(prompts_sd3)) / min(len(prompts_sd3), len(prompts_sd2)))

0.8003071738514531
0.8108973243798059
0.0663164120942208


## Create new prompts 

### First sent

In [9]:
metadata["prompt_first"] = metadata["prompt"].str.split(",").str[0]

### Max len sent 

In [10]:
metadata["prompt_max"] = (
    metadata["prompt"].apply(lambda a: sorted([(len(x), x) for x in str(a).split(",")])[-1][1])
)

### Clear key words

In [11]:
metadata["prompt_clear"] = metadata["prompt"].str.replace(".", ",").str.strip(" ,.") + ","
prompts = metadata["prompt_clear"].astype(str).tolist()

general_prompt = ", ".join(prompts)

counter = Counter()
words = general_prompt.split(", ")
for word in words:
    counter[word] += 1
counter.most_common()[:10]

  metadata["prompt_clear"] = metadata["prompt"].str.replace(".", ",").str.strip(" ,.") + ","


[('highly detailed', 240743),
 ('artstation', 201017),
 ('sharp focus', 196500),
 ('concept art', 194256),
 ('digital painting', 149945),
 ('intricate', 148980),
 ('illustration', 136630),
 ('trending on artstation', 127137),
 ('octane render', 120061),
 ('smooth', 113081)]

In [12]:
key_words = []
for word, cnt in counter.most_common():
    if cnt / len(prompts) > 0.001:
        key_words.append(word)
len(key_words)

576

In [15]:
for word in tqdm(key_words):
    metadata["prompt_clear"] = (
        metadata["prompt_clear"].str.replace(f", {word},", ",")
    )
metadata["prompt_clear"] = metadata["prompt_clear"].str.strip(", ")

  0%|          | 0/576 [00:00<?, ?it/s]

### Process 

In [16]:
metadata["prompt_first"] = metadata["prompt_first"].str.strip(",. ")
metadata["prompt_max"] = metadata["prompt_max"].str.strip(",. ")
metadata["prompt_clear"] = metadata["prompt_clear"].str.strip(",. ")

## Save original

In [226]:
metadata.to_parquet("../input/metadata/metadata.parquet", index=False)

## Save filtred 

In [17]:
metadata_filt = metadata#.sample(frac=1, random_state=42)
metadata_filt = metadata.drop_duplicates("prompt_first", keep="last")
metadata_filt = metadata_filt.drop_duplicates("prompt_max", keep="last")
metadata_filt = metadata_filt.drop_duplicates("prompt_clear", keep="last")
print(metadata.shape, metadata_filt.shape)

(2111939, 7) (1040330, 7)


In [18]:
def filter_metadata(df, 
                    img_size_min, img_size_max, 
                    img_max_ratio_diff, 
                    prompt_words_min, prompt_words_max, 
                    prompt_is_english,
                    drop_duplicates_by_head, 
                    drop_duplicates_by_tail, 
                    drop_duplicates_word):
    def is_english_only(string):
        for s in string:
            cat = unicodedata.category(s)         
            if (cat not in ['Ll', 'Lu', 'Nd', 'Po', 'Pd', 'Zs']) or (not cat.isascii()):
                return False
        
        return True
    
    df = df.copy()

    img_hw_cond = (
        df["width"].between(img_size_min, img_size_max) & 
        df["height"].between(img_size_min, img_size_max)
    )
    df["size_ratio"] = df["height"] / df["width"]
    img_ratio_cond = df["size_ratio"].between(1/img_max_ratio_diff, img_max_ratio_diff)

    df["prompt_clear"] = df["prompt_clear"].astype(str).str.strip()
    
    df["num_words"] = df['prompt_clear'].str.split(" ").apply(len)
    prompt_num_words_cond = df["num_words"].between(prompt_words_min, prompt_words_max)
    
    df["is_english"] = df["prompt_clear"].apply(is_english_only)
    if prompt_is_english:
        df = df[df["is_english"]]
    
    prompt_empty_cond = (df["prompt_clear"] != "")
    if drop_duplicates_by_head:
        df['head'] = df['prompt_clear'].str.split(" ").str[:drop_duplicates_word].str.join(" ")
        df.drop_duplicates(subset='head', inplace=True, keep="last")
    
    if drop_duplicates_by_tail:
        df['tail'] = df['prompt_clear'].str.split(" ").str[-drop_duplicates_word:].str.join(" ")
        df.drop_duplicates(subset='tail', inplace=True, keep="last")
    
    
    df = df[
        img_hw_cond &
        img_ratio_cond &
        prompt_empty_cond &
        prompt_num_words_cond
    ]#[["image_name", "prompt"]]
    
    df = df.reset_index(drop=True)
    return df

In [19]:
duplwords = 1

metadata_filt_dupl = filter_metadata(
    metadata_filt, 
    img_size_min=128, 
    img_size_max=1280, 
    img_max_ratio_diff=2, 
    prompt_words_min=3, 
    prompt_words_max=100,
    prompt_is_english=True,
    drop_duplicates_by_head=True,
    drop_duplicates_by_tail=True,
    drop_duplicates_word=duplwords
)
metadata_filt_dupl.shape

  df = df[


(9695, 12)

In [20]:
metadata_filt_dupl["image_name"].str.split("/").str[0].value_counts()

DiffusionDB_2M       6993
sd3                  2282
gustavosta-sd2-v2     420
Name: image_name, dtype: int64

In [21]:
(
    metadata_filt_dupl[["image_name", "prompt"]]
        .to_parquet(f"../input/metadata/metadata_duplwords_{duplwords}.parquet", index=False)
)

In [4]:
for i in range(2, 8, 1):
    print(i, pd.read_parquet(f"../input/metadata/metadata_duplwords_{i}.parquet").shape)

2 (89890, 2)
3 (241116, 2)
4 (393862, 2)
5 (503293, 2)
6 (572328, 2)
7 (625528, 2)
