# Criteria

## 1. Selected pairs of images with similar scores. 

    This gives us more fine-grained ranking information. The scores are assigned by our scoring model, which is likely accurate when predicting the selection between images with a large score delta, but performs badly for images with a small score delta.
## 2. Selected pairs within similar images.

    Currently, the annotator’s selection is strongly affected by the image topic or style. E.g., the annotator may always choose Nintendo Mario style images, and therefore the scoring models will assign high scores to them.
    This policy will force the annotator to focus on image quality and may help us improve generation.

# Implementation

## 1. Function: get_candidate_pairs_within_category

I will first provide a general function to get candidate pairs within a category.

Input:
- categories: np.ndarray[int], shape is (N,)
- max_pairs: int, maximum number of pairs to select.
- max_pairs should satisfy 0 < max_pairs < (N / n_categories) ** 2.
    We will attempt to select (max_pairs / n_categories) pairs within each category.
    
Output:

pairs: list[(index, index)], selected pairs, as indices into the input categories.


## 2. Function: get_candidate_pairs_by_score

I use 2 ways to bin scores into categories:
- By fixed-step bins
- By quantiles
	
	I will provide a function to get candidate pairs with similar scores

Input:
- scores: np.ndarray[float], shape is (N,)
- max_pairs: int, maximum number of pairs to select.
- n_bins: int, number of bins (categories) to divide the scores into.
- use_quantiles: bool, whether to bin by quantiles instead of fixed-step bins.

Output:

pairs: list[(index, index)], selected pairs, as indices into the input scores.
	

## 3. Function: get_candidate_pairs_by_embedding
	
I use kmeans to divide images into categories of clusters.

Input:
- embeddings: np.ndarray, shape is (N, 768)
- max_pairs: int, maximum number of pairs to select.
- n_clusters: int, number of clusters (categories) to divide the images into.

Output:

pairs: list[(index, index)], selected pairs, as indices into the input embeddings.

These 2 criteria can be used together with the existing filters: we can filter images by score / variance / date, pass the uuids and the corresponding scores or embeddings to the functions, and get candidate pairs.


In [8]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import pandas as pd
import numpy as np

import os
import sys
import json

import msgpack

from tqdm.auto import tqdm

import torch

In [9]:
sys.path.append(os.path.abspath('../kcg-ml-image-pipeline/'))

from utility.active_learning.pairs import get_candidate_pairs_by_score, get_candidate_pairs_by_embedding, embedding_to_category, get_candidate_pairs_within_category
from utility.active_learning.samples import get_min_distance_to_representative_samples

In [10]:
# Root directory of the dataset files (clip features, rankings) read below.
ROOT = '../dataset/'

# Names of the datasets to process.
DATASETs = [
    'environmental', 'character', 'icons',
    'mech', 'waifu', 'propaganda-poster',
]

# Directory where the generated CSV files are written.
SAVE_DIR = './result/active_learning/1218/'

# save image info

In [11]:
# Pick the last entry of DATASETs as the current dataset name.
dataset_name = DATASETs[-1]

In [None]:
def get_score_from_embs(embs, model, batch_size, preprocess=None):
    """Score embeddings with ``model`` in mini-batches and return one array.

    Args:
        embs: indexable collection of embeddings; sliced along its first axis.
        model: callable applied to each half-precision CUDA batch; the score
            is read from index 0 of the last axis of its output.
        batch_size: number of embeddings per forward pass.
        preprocess: optional callable applied to each batch before ``model``.

    Returns:
        NumPy array with the scores of all batches concatenated along axis 0.
    """
    batch_scores = []

    # Inference only: gradients disabled, CUDA autocast enabled.
    with torch.no_grad(), torch.cuda.amp.autocast(True):
        for start in tqdm(range(0, len(embs), batch_size), leave=False):
            batch = torch.tensor(embs[start:start + batch_size]).cuda().half()

            if preprocess is not None:
                batch = preprocess(batch)

            output = model(batch)
            batch_scores.append(output[..., 0].detach().cpu().numpy())

    return np.concatenate(batch_scores, axis=0)

In [3]:
def save_image_info(dataset_name):
    """Compute per-image features for one dataset and save them as a CSV.

    Reads ``data/{dataset_name}/data.json``, loads the clip feature vector of
    every listed image from ``ROOT/clip``, and writes
    ``SAVE_DIR/{dataset_name}/image_info.csv`` with these columns:

    - ``job_uuid``
    - ``sigma_score``: linear-model score normalised with the stored
      ``mean`` / ``std``
    - ``min_distance_to_representative_samples``: cosine distance computed by
      ``get_min_distance_to_representative_samples``
    - ``category_{n_clusters}``: one column per cluster count in ``[10, 100]``,
      added only while ``n_clusters <= n_samples / 100``

    Args:
        dataset_name: name of the dataset sub-directory to process.

    Raises:
        KeyError: if a representative name has no entry in ``path_to_index``.
    """

    # load job uuids and clip embeddings

    with open(f'data/{dataset_name}/data.json') as f:
        js = json.load(f)

    job_uuids = list()
    samples = list()

    for info in tqdm(js.values(), total=len(js), leave=False):

        # keep only the part of the file path before the first '_' and drop
        # the extension; the clip features live in '<that>_clip.msgpack'
        file_path = os.path.splitext(info['file_path'].split('_')[0])[0]

        path = os.path.join(ROOT, 'clip', f'{file_path}_clip.msgpack')

        with open(path, 'rb') as f:
            mp = msgpack.load(f)

        job_uuids.append(info['job_uuid'])
        samples.append(np.array(mp['clip-feature-vector']))

    job_uuids = np.array(job_uuids)
    # assumes every feature vector is stored with a leading batch axis,
    # i.e. shape (1, dim) — TODO confirm
    samples = np.concatenate(samples, axis=0)

    #

    df = pd.DataFrame(
        job_uuids.reshape(-1, 1),
        columns=['job_uuid']
    )

    # score

    vision_model = torch.nn.Linear(samples.shape[-1], 1, bias=True).cuda().eval()
    vision_model.load_state_dict(torch.load(os.path.join('./weight/004', dataset_name, 'clip_vision.pt')))

    npz = np.load(os.path.join('./weight/004', dataset_name, 'clip_vision.npz'))
    mean, std = npz['mean'], npz['std']

    score = get_score_from_embs(samples, vision_model, batch_size=1024)

    df['sigma_score'] = (score - mean) / std

    # distance

    with open(os.path.join('./data', dataset_name, 'representative.json')) as f:
        representative_names = json.load(f)['representative']

    # NOTE(review): ``path_to_index`` is not defined in this function; it is
    # expected to exist as a module-level mapping — confirm where it is built.
    representative_indices = [path_to_index.get(name) for name in representative_names]

    # a missing name would otherwise put ``None`` into the index list and
    # corrupt (or crash) the fancy indexing below
    missing = [
        name
        for name, index in zip(representative_names, representative_indices)
        if index is None
    ]
    if missing:
        raise KeyError(f'representative samples not found in path_to_index: {missing}')

    representative_samples = samples[representative_indices]

    df['min_distance_to_representative_samples'] = get_min_distance_to_representative_samples(samples, representative_samples, distance_type='cosine')

    # clustering

    for n_clusters in [10, 100]:
        # stop once there would be fewer than 100 samples per cluster on average
        if n_clusters > samples.shape[0] / 100:
            break
        df[f'category_{n_clusters}'] = embedding_to_category(embeddings=samples, n_clusters=n_clusters)

    # save

    os.makedirs(os.path.join(SAVE_DIR, dataset_name), exist_ok=True)

    df.to_csv(os.path.join(SAVE_DIR, dataset_name, 'image_info.csv'), index=False)


In [4]:
# Generate and save image_info.csv for every dataset in DATASETs.
for dataset_name in DATASETs:
    save_image_info(dataset_name)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

# save_rank_queue

In [5]:
def get_job_uuid_pairs(df, pairs):
    """Translate positional index pairs into ordered ``job_uuid`` pairs.

    Args:
        df: DataFrame with a ``job_uuid`` column; ``pairs`` index into it by
            position.
        pairs: iterable of ``(index_1, index_2)`` positional index pairs.

    Returns:
        A list of ``(job_uuid_a, job_uuid_b)`` tuples, one per input pair,
        with the smaller uuid first so that the same pair always has the
        same representation. Empty when ``pairs`` is empty.
    """
    pairs = list(pairs)

    # ``zip(*pairs)`` cannot be unpacked into two names when there are no pairs
    if not pairs:
        return []

    indices_1, indices_2 = zip(*pairs)

    job_uuid_1s = df['job_uuid'].iloc[list(indices_1)]
    job_uuid_2s = df['job_uuid'].iloc[list(indices_2)]

    # pair the first uuids with the *second* uuids (not with themselves)
    return [
        (job_uuid_1, job_uuid_2) if job_uuid_1 < job_uuid_2 else (job_uuid_2, job_uuid_1)
        for job_uuid_1, job_uuid_2 in zip(job_uuid_1s, job_uuid_2s)
    ]

In [6]:
def save_rank_queue(dataset_name):
    """Build the image and pair queues for one dataset and save them as CSV.

    Reads ``SAVE_DIR/{dataset_name}/image_info.csv``, keeps rows with
    ``sigma_score > .75``, and writes two files into the same directory:

    - ``images.csv``: job uuids whose
      ``min_distance_to_representative_samples`` exceeds 0.25, tagged with
      the policy ``far_distance_to_ranked_images``.
    - ``pairs.csv``: candidate pairs from the same score bin
      (``same_sigma_score_bin_{n_bins}``) and from the same embedding cluster
      (``same_embedding_cluster_{n_clusters}``), excluding pairs already
      present in ``ROOT/ranking/{dataset_name}`` and duplicate pairs.

    Args:
        dataset_name: name of the dataset sub-directory to process.
    """

    pmt_path = os.path.join('./data', dataset_name, 'prompt.json')

    with open(pmt_path) as f:
        prompts = json.load(f)

    # collect already-ranked pairs in both orders, so membership tests do not
    # depend on the order of the two uuids

    ranked_pairs = set()
    ranking_dir = os.path.join(ROOT, 'ranking', dataset_name)

    for fname in tqdm(os.listdir(ranking_dir), leave=False):
        with open(os.path.join(ranking_dir, fname)) as f:
            js = json.load(f)

        file_hash_1 = js['image_1_metadata']['file_hash']
        file_hash_2 = js['image_2_metadata']['file_hash']

        try:
            job_uuids_1 = prompts[file_hash_1]['job_uuid']
            job_uuids_2 = prompts[file_hash_2]['job_uuid']
        except (KeyError, TypeError):
            # skip rankings whose images have no usable prompt entry
            continue

        ranked_pairs.add((job_uuids_1, job_uuids_2))
        ranked_pairs.add((job_uuids_2, job_uuids_1))

    # load image info, keep only high-score images

    df = pd.read_csv(os.path.join(SAVE_DIR, dataset_name, 'image_info.csv')).dropna()
    df.query('sigma_score > .75', inplace=True)
    # reset so positional pair indices line up with the filtered rows
    df.reset_index(drop=True, inplace=True)

    # single-image queue

    result = df.query('min_distance_to_representative_samples > 0.25')[['job_uuid']].copy()
    result['policy'] = 'far_distance_to_ranked_images'
    result.to_csv(os.path.join(SAVE_DIR, dataset_name, 'images.csv'), index=False)

    # pair queue

    results = list()

    for n_bins in [10, 100]:

        pairs = get_candidate_pairs_by_score(df['sigma_score'].values, max_pairs=1000, n_bins=n_bins, use_quantiles=True)

        pairs = get_job_uuid_pairs(df, pairs)
        results.extend([pair + (f'same_sigma_score_bin_{n_bins}',) for pair in pairs if pair not in ranked_pairs])

    for n_clusters in [10, 100]:

        # category columns only exist for cluster counts that were computed
        if f'category_{n_clusters}' not in df.columns:
            break

        pairs = get_candidate_pairs_within_category(df[f'category_{n_clusters}'].values, max_pairs=1000)

        pairs = get_job_uuid_pairs(df, pairs)
        # label with the cluster count of this loop, not the stale ``n_bins``
        results.extend([pair + (f'same_embedding_cluster_{n_clusters}',) for pair in pairs if pair not in ranked_pairs])

    results = pd.DataFrame(results, columns=['job_uuid_1', 'job_uuid_2', 'policy'])
    # a pair found by several policies keeps the first policy that produced it
    results.drop_duplicates(['job_uuid_1', 'job_uuid_2'], keep='first', inplace=True)
    results.to_csv(os.path.join(SAVE_DIR, dataset_name, 'pairs.csv'), index=False)

In [7]:
# Generate and save the images.csv / pairs.csv queues for every dataset in DATASETs.
for dataset_name in DATASETs:
    save_rank_queue(dataset_name)

  0%|          | 0/529 [00:00<?, ?it/s]

  0%|          | 0/798 [00:00<?, ?it/s]

# select images

In [100]:
import pandas as pd
import os

In [107]:
# One row per image: its file path paired with its label.
df = pd.DataFrame(
    data=zip(npz['file_paths'], labels),
    columns=['file_path', 'label'],
)

In [108]:
import shutil

# Copy 5 randomly chosen images of every label into '<target_dir>/<label>/'.
target_dir = './image_clustering_clip_vision_kmeans'

for c, g in df.groupby('label'):

    label_dir = os.path.join(target_dir, f'{c}')
    os.makedirs(label_dir, exist_ok=True)

    # labels with fewer than 5 images keep an empty directory
    if g.shape[0] < 5:
        continue

    selected = np.random.choice(g['file_path'], 5, False)

    for file_path in selected:
        # the image file is named after the part of the path before the first '_'
        src = os.path.join('../kcg-ml-image-pipeline/output/dataset/image/', file_path.split('_')[0] + '.jpg')

        # copy without going through a shell, so paths containing spaces or
        # shell metacharacters are handled safely; stay best-effort like the
        # original ``cp`` call and keep going when one file cannot be copied
        try:
            shutil.copy(src, label_dir)
        except OSError as err:
            print(f'failed to copy {src}: {err}')