In [3]:

import argparse

import time

from copy import deepcopy

from PIL import Image
import numpy as np
from tqdm import tqdm

import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import pandas as pd
import os

try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC
import torchvision.models as models

import clip
from clip.custom_clip import get_coop
from clip.cocoop import get_cocoop
from data.imagnet_prompts import imagenet_classes
from data.datautils import AugMixAugmenter, build_dataset
from utils.tools import Summary, AverageMeter, ProgressMeter, load_model_weight, set_random_seed, create_logger
from data.cls_to_names import *
from data.fewshot_datasets import fewshot_datasets
from data.imagenet_variants import thousand_k_to_200, imagenet_a_mask, imagenet_r_mask, imagenet_v_mask
from clip_retrieval.clip_client import ClipClient, Modality

from collections import defaultdict
%load_ext autoreload
%autoreload 2

client = ClipClient(
    url="http://127.0.0.1:1234/knn-service",
    indice_name='laion_400m',
    modality=Modality.IMAGE,
    num_images=10,
    deduplicate=False,
)
client_backup = ClipClient(
    url="http://127.0.0.1:1234/knn-service",
    indice_name='laion_400m',
    modality=Modality.IMAGE,
    num_images=200,
    deduplicate=False,
)

client_backup2 = ClipClient(
    url="http://127.0.0.1:1234/knn-service",
    indice_name='laion_400m',
    modality=Modality.IMAGE,
    num_images=1000,
    deduplicate=False,
)

#Parameters
tta_steps = 1
which_loss = "cosine"
gpu = 7
print_freq = 1000

## Class to names mapping
fewshot_datasets = ['DTD', 'Flower102', 'Food101', 'Cars', 'SUN397', 
                    'Aircraft', 'Pets', 'Caltech101', 'UCF101', 'eurosat']
# test_sets = 'Caltech101/DTD/Food101'
# test_sets= 'Cars/SUN397/Aircraft/'
test_sets = 'Pets/UCF101/eurosat'

##parameters
arch='ViT-B/16'
n_ctx=4
ctx_init="a_photo_of_a"
lr = 5e-3

  from .autonotebook import tqdm as notebook_tqdm


In [58]:

def select_confident_samples(logits, top):
    batch_entropy = -(logits.softmax(1) * logits.log_softmax(1)).sum(1)
    idx = torch.argsort(batch_entropy, descending=False)[:int(batch_entropy.size()[0] * top)]
    return logits[idx], idx

def avg_entropy(outputs):
    logits = outputs - outputs.logsumexp(dim=-1, keepdim=True) # logits = outputs.log_softmax(dim=1) [N, 1000]
    avg_logits = logits.logsumexp(dim=0) - np.log(logits.shape[0]) # avg_logits = logits.mean(0) [1, 1000]
    min_real = torch.finfo(avg_logits.dtype).min
    avg_logits = torch.clamp(avg_logits, min=min_real)
    return -(avg_logits * torch.exp(avg_logits)).sum(dim=-1)

In [59]:

def accuracy(output, target, topk=(1,), caption=None, logger=None):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        if output.shape[0] == 1:#only image prediction
            logit_k, pred = output.topk(maxk, 1, True, True)
            pred = pred.t()
            correct = pred.eq(target.view(1, -1).expand_as(pred))
        else: # evaluate captions
            bag = []
            # # length = max(5, output.shape[0]-1)
            # cap_pred = output[1:]
            # # cap_pred = torch.mean(cap_pred, 0,  keepdim=True)
            # _, pred = cap_pred.topk(maxk, 1, True, True) #5, 1 #candidate labels
            # pred = pred.reshape(maxk, 1)
            pred = torch.mean(output, 0, keepdim=True)
            _, pred = pred.topk(maxk, 1, True, True)
            pred = pred.reshape(maxk, 1)
            correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            # if k == 1 and correct_k.item() == 0:

                # pred = pred.squeeze().tolist()
                # pred = [cls2names[lb] for lb in pred]

                # if logger: logger.info("wrong prediction, target {} & predicted value {}".format(target, pred))
            # elif k==1 and correct_k.item() == 1:
                # logger.info("wrong prediction , logit: ", output)
                # pred = pred.squeeze().tolist()
                # pred = [cls2names[lb] for lb in pred]

            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [61]:

def return_caption(img_path, retrieve_K=1):
    try:
        query_res = client.query(image=img_path)
        assert len(query_res) >= retrieve_K
        query_res = query_res[:retrieve_K]
        retrieved_txt= [D['caption'] for D in query_res]
        retrieved_url = [D['url'] for D in query_res]
        retrieved_score = [D['similarity'] for D in query_res]
        return retrieved_txt, retrieved_score
    except:
        query_res = client_backup2.query(image=img_path)
        if isinstance(query_res, list) and len(query_res) >= retrieve_K:
            query_res = query_res[:retrieve_K]
            retrieved_txt= [D['caption'] for D in query_res]
            retrieved_url = [D['url'] for D in query_res]
            retrieved_score = [D['similarity'] for D in query_res]
            return retrieved_txt, retrieved_score
        else:
            return None, None


                    

In [62]:
def test_time_adapt_eval_image(val_loader, model, model_state, optimizer, optim_state, scaler, save_result=None):
    batch_time = AverageMeter('Time', ':6.3f', Summary.NONE)
    top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
    top2 = AverageMeter('Acc@2', ':6.2f', Summary.AVERAGE)
    top3 = AverageMeter('Acc@3', ':6.2f', Summary.AVERAGE)
    top4 = AverageMeter('Acc@4', ':6.2f', Summary.AVERAGE)
    top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
    
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, top1, top2, top3, top4, top5],
        prefix='Test: ',
        logger = None)

    # reset model and switch to evaluate mode
    model.eval()
    with torch.no_grad():
        model.reset()
    end = time.time()
    cnt_empty = 0
    if save_result == None: save_result = defaultdict(list)
    
    for i, (image, target, imagepath) in tqdm(enumerate(val_loader)): 
        assert gpu is not None
        # print("Image Path ", imagepath[0])
        save_result['image_path'].append(imagepath[0])
        
        target = target.cuda(gpu, non_blocking=True)
        
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                #image
                output = model.inference(image.cuda(gpu, non_blocking=True))

        # output = output / output.norm(dim=-1, keepdim=True)
        save_result['image_entropy'].append('{:.4f}'.format(avg_entropy(output)))
        # output = torch.nn.functional.softmax(output, dim=-1)
        logit_k, pred = output.topk(2, 1, True, True) #1,1
        
        pred = pred[:,0].t()
        logit_k= logit_k.squeeze()
        correct = pred.eq(target)
        # print(correct)
        correct = correct.reshape(-1).float().sum(0, keepdim=True).item() #1 or 0
        save_result['image_correct'].append(int(correct))
        save_result['image_logit'].append('{:.4f}'.format(logit_k[0].item()))
        if correct == 1:
            # correct label - top2
            # print(logit_k)
            save_result['image_gap'].append('{:.4f}'.format(logit_k[0].item() - logit_k[1].item()))
        else:
            # incorrect top1 - real label
            # print(target, output.squeeze()[target], logit_k )
            save_result['image_gap'].append('{:.4f}'.format(logit_k[0].item() - output.squeeze()[target].item()))
        
                
        acc1, acc2, acc3, acc4, acc5 = accuracy(output, target, topk=(1, 2, 3, 4, 5), caption=None, logger=None)
        # print(acc1, acc2, acc3, acc4, acc5)
        top1.update(acc1[0], image.size(0))
        top2.update(acc2[0], image.size(0))
        top3.update(acc3[0], image.size(0))
        top4.update(acc4[0], image.size(0))
        top5.update(acc5[0], image.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if (i+1) % print_freq == 0:
            progress.display(i)
        

    progress.display_summary()
    return [top1.avg, top2.avg, top3.avg, top4.avg, top5.avg], save_result

def test_time_adapt_eval_caption(val_loader, model, model_state, optimizer, optim_state, scaler, save_result=False):
    batch_time = AverageMeter('Time', ':6.3f', Summary.NONE)
    top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
    top2 = AverageMeter('Acc@2', ':6.2f', Summary.AVERAGE)
    top3 = AverageMeter('Acc@3', ':6.2f', Summary.AVERAGE)
    top4 = AverageMeter('Acc@4', ':6.2f', Summary.AVERAGE)
    top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
    
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, top1, top2, top3, top4, top5],
        prefix='Test: ',
        logger = None)

    # reset model and switch to evaluate mode
    model.eval()
    with torch.no_grad():
        model.reset()
    end = time.time()
    cnt_empty = 0
    assert save_result != None
    
    for i, (image, target, imagepath) in tqdm(enumerate(val_loader)): 
        assert gpu is not None
        # print("Image Path ", imagepath[0])
        # save_result['image_path'] = imagepath[0]
        
        target = target.cuda(gpu, non_blocking=True)
        
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                #caption
                retrieved_Caption, _ = return_caption(imagepath[0])
                if retrieved_Caption==None:
                    cnt_empty +=1
                    save_result['caption'].append("None")
                    save_result['caption_entropy'].append(np.nan)
                    save_result['caption_correct'].append(np.nan)
                    save_result['caption_logit'].append(np.nan)
                    save_result['caption_gap'].append(np.nan)
                    continue
                save_result['caption'].append(retrieved_Caption[0])
                output_caption = model.caption_ensemble(retrieved_Caption)

        # output_caption = output_caption / output_caption.norm(dim=-1, keepdim=True)
        save_result['caption_entropy'].append('{:.4f}'.format(avg_entropy(output_caption)))
        # print(output_caption.shape)
        # output_caption = torch.nn.functional.softmax(output_caption, dim=-1)
        # print(output_caption)
        logit_k, pred = output_caption.topk(2, 1, True, True) #1,2
        # print(logit_k, pred)

        logit_k = logit_k.squeeze()
        if logit_k[0] == 1:
            print(output_caption)
        
        pred = pred[:,0].t()
        correct = pred.eq(target)
        correct = correct.reshape(-1).float().sum(0, keepdim=True).item() #1 or 0
        save_result['caption_correct'].append(int(correct))
        save_result['caption_logit'].append('{:.4f}'.format(logit_k[0].item()))
        if correct == 1:
            # correct label - top2
            # print(logit_k)
            save_result['caption_gap'].append('{:.4f}'.format(logit_k[0].item() - logit_k[1].item()))
        else:
            # incorrect top1 - real label
            # print(target, output.squeeze()[target], logit_k )
            save_result['caption_gap'].append('{:.4f}'.format(logit_k[0].item() - output_caption.squeeze()[target].item()))
        
         
        acc1, acc2, acc3, acc4, acc5 = accuracy(output_caption, target, topk=(1, 2, 3, 4, 5), caption=None, logger=None)
        # print(acc1, acc2, acc3, acc4, acc5)
        top1.update(acc1[0], image.size(0))
        top2.update(acc2[0], image.size(0))
        top3.update(acc3[0], image.size(0))
        top4.update(acc4[0], image.size(0))
        top5.update(acc5[0], image.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if (i+1) % print_freq == 0:
            progress.display(i)
    
    print("empty caption count = {}".format(cnt_empty))
    progress.display_summary()
    return [top1.avg, top2.avg, top3.avg, top4.avg, top5.avg], save_result

In [64]:
# load model
if test_sets in fewshot_datasets:
    classnames = eval("{}_classes".format(test_sets.lower()))
model = get_coop(arch, test_sets, gpu, n_ctx, ctx_init)
model_state = None

cross_check = set()
for name, param in model.named_parameters():

    if "prompt_learner" not in name:
        param.requires_grad_(False)
    if param.requires_grad : cross_check.add(name)
print("tuing parameters ", cross_check)

print("=> Model created: visual backbone {}".format(arch))

assert gpu is not None
torch.cuda.set_device(gpu)
model = model.cuda(gpu)

trainable_param = model.prompt_learner.parameters()
optimizer = torch.optim.AdamW(trainable_param, lr)
optim_state = deepcopy(optimizer.state_dict())

# setup automatic mixed-precision (Amp) loss scaling
scaler = torch.cuda.amp.GradScaler(init_scale=1000)

print('=> Using native Torch AMP. Training in mixed precision.')

cudnn.benchmark = True

dtype  torch.float32
Initializing the contect with given words: [a_photo_of_a]
Initial context: "a photo of a"
Number of context words (tokens): 4
tuing parameters  {'prompt_learner.ctx'}
=> Model created: visual backbone ViT-B/16
=> Using native Torch AMP. Training in mixed precision.


In [15]:

def get_max_accuracy(df, total_length):
    img_ent_m = df['image_entropy'].astype('float').mean()
    cap_ent_m = df['caption_entropy'].astype('float').mean()
    print("image ent mean  {} caption ent mean {}".format(img_ent_m,cap_ent_m ))
    df['image_entropy'] = df['image_entropy'].astype('float')
    df['caption_entropy'] = df['caption_entropy'].astype('float')


    table = dict()
    for c_tau in range(0, 1):
        for i_tau in range(0, 1):
            c_tau *= 0.01
            i_tau *= 0.01
            # case1
            pred_cap = df.loc[(df.image_entropy > img_ent_m * (1+i_tau)) & (df.caption_entropy < cap_ent_m * (1-c_tau))]
            pred_cap_n = pred_cap.caption_correct.sum()
            
            
            # case2
            pred_img = df.loc[(df.image_entropy <= img_ent_m * (1+i_tau)) | (df.caption_entropy >= cap_ent_m * (1-c_tau))]
            # assert pred_cap.shape[0]+ pred_img.shape[0] == total_length, ( pred_cap.shape[0]+ pred_img.shape[0])
            pred_img_n = pred_img.image_correct.sum()
            max_acc = (pred_cap_n+pred_img_n)/total_length
            table[(c_tau, i_tau, pred_cap_n)] = max_acc
            
            if i_tau == 0 and c_tau == 0:
                print("pred cap shape {} out of {}".format(pred_cap_n, pred_cap.shape))
                print("default {:2f}".format(max_acc * 100))
    max_key = max(table, key=table.get)
    # print(max_key, table[max_key])
    return (max_key, table[max_key])

In [65]:
resolution = 224
workers = 4
dataset_mode = 'test'
data = '/data/seongha'
import sys
from collections import defaultdict
    # norm stats from clip.load()
normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                     std=[0.26862954, 0.26130258, 0.27577711])
    
    # iterating through eval datasets
datasets = test_sets.split("/")
results = {}

# with open('inferece_image_caption{}.txt'.format(test_sets), 'w') as f:
    # sys.stdout = f


for set_id in datasets:
    Dict = defaultdict(list)
    data_transform = transforms.Compose([
        transforms.Resize(resolution, interpolation=BICUBIC),
        transforms.CenterCrop(resolution),
        transforms.ToTensor(),
        normalize,
    ])
    batchsize = 1
    print("evaluating: {}".format(set_id))
    classnames = eval("{}_classes".format(set_id.lower()))
    model.reset_classnames(classnames, arch)

    val_dataset = build_dataset(set_id, data_transform, data, mode=dataset_mode)
    total_length = len(val_dataset)
    print("number of test samples: {}".format(len(val_dataset)))

    val_loader = torch.utils.data.DataLoader(
                val_dataset,
                batch_size=batchsize, shuffle=True,
                num_workers=workers, pin_memory=True)
        
    results['image'], tmp = test_time_adapt_eval_image(val_loader, model, model_state, optimizer, optim_state, scaler, Dict)
    Dict = tmp
    results['caption'], tmp = test_time_adapt_eval_caption(val_loader, model, model_state, optimizer, optim_state, scaler, Dict)
    Dict = tmp
    # assert len(Dict['image_path']) == len(Dict['caption']) and len(Dict['image_correct']) == len(Dict['caption_correct']), [len(v) for k, v in Dict.items()]
    del val_dataset, val_loader

    # try:
    #     print("=> Acc. on testset [{}]: @1 {}/ @2 {}/ @3 {}/ @4 {}/ @5 {}".format(set_id, results[set_id][0], results[set_id][1], results['image'][2], results['image'][3], results['image'[4], results['image'][5]]))
    # except:
    #     print("=> Acc. on testset [{}]: {}".format(set_id, results[set_id]))
# sys.stdout = sys.__stdout__
    tmp = {k: v for k, v in Dict.items() if "image" in k}
    df_img = pd.DataFrame(tmp)
    df_img = df_img.reset_index()

    tmp = {k: v for k, v in Dict.items() if "caption" in k}
    df_cap = pd.DataFrame(tmp)
    df_cap = df_cap.reset_index()

    path = './notebook/inference_image_caption'
    os.makedirs(path, exist_ok=True)
    df = pd.DataFrame(Dict)
    df.to_csv(os.path.join(path, 'inference_image_caption_{}.csv'.format(set_id)))

    with open(os.path.join(path,'inference_image_caption_{}.txt'.format(set_id)), 'w') as f:
        
        img_corr_ind = df_img.loc[df_img['image_correct'] == 1, 'index'].to_list()
        cap_corr_ind = df_cap.loc[df_cap['caption_correct'] == 1, 'index'].to_list()
        neither = set(df_img.loc[df_img['image_correct'] == 0, 'index'].to_list()) & set(df_cap.loc[df_cap['caption_correct'] == 0, 'index'].to_list())
        #image accuracy, caption ensemble accuracy
        f.write("1. Image accuracy {:.4f}, Caption Accuracy {:.4f}\n".format( len(img_corr_ind)/total_length, len(cap_corr_ind)/total_length))
        # 4 cases
        union = set(img_corr_ind) | set(cap_corr_ind)
        f.write("2: 4 cases\n")
        f.write("Union :{}\n".format( len(union)))
        intersection = set(img_corr_ind) & set(cap_corr_ind)
        f.write("intersection {}\n".format( len(intersection)))
        img_diff = set(img_corr_ind) - set(cap_corr_ind)
        f.write("image only {}\n".format(len(img_diff)))
        cap_diff = set(cap_corr_ind) - set(img_corr_ind)
        f.write("cap only {}\n".format( len(cap_diff)))
        f.write("neither {}\n".format( len(neither)))
        # max accuracy
        f.write("3. Max accuracy\n")
        f.write("Max accuracy: {:.4f}\n".format(len(union)/total_length*100)) 
        #entropy, logit gap
        f.write("4. Entropy & Logit Gap\n")
        f.write("Image\n")
        img_correct = df_img.loc[df_img['image_correct'] == 1]
        f.write("correct\n")
        f.write(" {}\n".format(str(img_correct.shape)))
        f.write("top1 - top2 mean, std\n")
        f.write("{} {}\n".format(img_correct['image_gap'].astype(float).mean(), img_correct['image_gap'].astype(float).std() ))
        f.write("Entropy mean {}\n".format(str(img_correct['image_entropy'].astype(float).mean())))
        f.write("")
        img_wrong = df_img.loc[df_img['image_correct'] == 0]
        f.write("wrong\n")
        f.write(" {}\n".format(str(img_wrong.shape)))
        f.write("pred(top1) - target mean, std\n")
        f.write("{} {}\n".format(img_wrong['image_gap'].astype(float).mean(), img_wrong['image_gap'].astype(float).std() ))
        f.write("Entropy mean {}\n".format(str(img_wrong['image_entropy'].astype(float).mean())))
        f.write("-"*10 + "\n")
        
        f.write("Caption\n")
        cap_correct = df_cap.loc[df_cap['caption_correct'] == 1]
        f.write("correct\n")
        f.write(" {}\n".format(str(cap_correct.shape)))
        f.write("top1 - top2 mean, std\n")
        f.write("{} {}\n".format(cap_correct['caption_gap'].astype(float).mean(), cap_correct['caption_gap'].astype(float).std() ))
        f.write("Entropy mean {}\n".format(str(cap_correct['caption_entropy'].astype(float).mean())))
        f.write("")
        cap_wrong = df_cap.loc[df_cap['caption_correct'] == 0]
        f.write("wrong\n")
        f.write(" {}\n".format(str(cap_wrong.shape)))
        f.write("pred(top1) - target mean, std\n")
        f.write("{} {}\n".format(cap_wrong['caption_gap'].astype(float).mean(), cap_wrong['caption_gap'].astype(float).std() ))
        f.write("Entropy mean {}\n".format(str(cap_wrong['caption_entropy'].astype(float).mean())))
            

        for i_set, i_name in zip([img_diff, cap_diff, intersection, neither, union], ['img_diff', 'cap_diff', 'intersection', 'neither', 'union']):
            df_ = df.iloc[list(i_set)]
            f.write("{} {}\n".format(i_name, df_.shape))
            f.write("Image Entropy mean {}\n".format(str(df_['image_entropy'].astype(float).mean())))
            f.write("Caption Entropy mean {}\n".format(str(df_['caption_entropy'].astype(float).mean())))
        print("how many nan values ? {}\n".format(df.loc[cap_diff].isna().sum()))
        f.write("how many nan values ? {}\n".format(df.loc[cap_diff].isna().sum()))
        (c_tau, i_tau, _), max_acc = get_max_accuracy( df, total_length)
        print("Caption tau {}, image tau {}, max acc {:.4f}\n".format(c_tau, i_tau, max_acc))
        f.write("Caption tau {}, image tau {}, max acc {:.4f}\n".format(c_tau, i_tau, max_acc))
        
            

evaluating: Flower102
number of test samples: 2463


1005it [00:34, 29.28it/s]

Test: [ 999/2463]	Time  0.038 ( 0.034)	Acc@1 100.00 ( 66.80)	Acc@2 100.00 ( 79.80)	Acc@3 100.00 ( 82.70)	Acc@4 100.00 ( 84.30)	Acc@5 100.00 ( 85.80)


2005it [01:08, 29.68it/s]

Test: [1999/2463]	Time  0.033 ( 0.034)	Acc@1 100.00 ( 66.70)	Acc@2 100.00 ( 79.05)	Acc@3 100.00 ( 82.20)	Acc@4 100.00 ( 83.80)	Acc@5 100.00 ( 84.95)


2463it [01:24, 29.16it/s]

 *  Acc@1 67.032 Acc@2 79.375 Acc@3 82.217 Acc@4 83.719 Acc@5 84.896



1000it [09:18,  2.34it/s]

Test: [ 999/2463]	Time  0.255 ( 0.559)	Acc@1 100.00 ( 42.40)	Acc@2 100.00 ( 50.60)	Acc@3 100.00 ( 54.00)	Acc@4 100.00 ( 56.70)	Acc@5 100.00 ( 58.40)


2000it [18:30,  2.89it/s]

Test: [1999/2463]	Time  0.217 ( 0.555)	Acc@1 100.00 ( 43.35)	Acc@2 100.00 ( 50.65)	Acc@3 100.00 ( 54.45)	Acc@4 100.00 ( 57.25)	Acc@5 100.00 ( 58.75)


2463it [22:35,  1.82it/s]

empty caption count = 0
 *  Acc@1 43.159 Acc@2 50.792 Acc@3 54.365 Acc@4 57.328 Acc@5 58.831
img_diff (943, 10)
Image Entropy mean 0.7874897136797455
Caption Entropy mean 0.5247625662778367

cap_diff (355, 10)
Image Entropy mean 1.75411661971831
Caption Entropy mean 0.09797267605633803

intersection (709, 10)
Image Entropy mean 0.783781946403385
Caption Entropy mean 0.11586713681241184

neither (456, 10)
Image Entropy mean 1.7897495614035088
Caption Entropy mean 0.5187515350877193

union (2007, 10)
Image Entropy mean 0.9571577478824115
Caption Entropy mean 0.3048237169905331






evluate

In [16]:
import pandas as pd
di = dict()
for each in fewshot_datasets:
    print(each)
    path = '/home/seongha/TPT/notebook/inference_image_caption/{}/inference_image_caption_{}.csv'.format(arch.replace('/', ''), each)
    df = pd.read_csv(path)
    total_length = df.shape[0]
    print(total_length)
    print("caption entropy shape {}".format(df['caption_entropy'].shape))
    (c_tau, i_tau, pred_cap_n), max_acc = get_max_accuracy(df, total_length)
    print("Caption tau {}, image tau {}, max acc {:.4f}".format(c_tau, i_tau, max_acc * 100))
    print("pred cap n {}\n".format(pred_cap_n))
    di[each] = {'i_tau': i_tau, 'c_tau': c_tau}
    

DTD
1692
caption entropy shape (1692,)
image ent mean  1.7448628250591016 caption ent mean 1.1990325650118203
pred cap shape 201 out of (460, 11)
default 50.118203
Caption tau 0.0, image tau 0.0, max acc 50.1182
pred cap n 201

Flower102
2463
caption entropy shape (2463,)
image ent mean  1.1113038570848561 caption ent mean 0.34443032886723507
pred cap shape 424 out of (778, 11)
default 70.889160
Caption tau 0.0, image tau 0.0, max acc 70.8892
pred cap n 424

Food101
30300
caption entropy shape (30300,)
image ent mean  0.5977752244224422 caption ent mean 0.44765492260470646
pred cap shape 4683.0 out of (7591, 11)
default 83.326733
Caption tau 0.0, image tau 0.0, max acc 83.3267
pred cap n 4683.0

Cars
8041
caption entropy shape (8041,)
image ent mean  1.1570252083074244 caption ent mean 0.5936343365253077
pred cap shape 1041 out of (2262, 11)
default 65.563985
Caption tau 0.0, image tau 0.0, max acc 65.5640
pred cap n 1041

SUN397
19850
caption entropy shape (19850,)
image ent mean  1.4

In [16]:
Path = "./notebook/inference_image_caption/ViT-B16/inference_image_caption_eurosat.csv"
import pandas as pd
df = pd.read_csv(Path)

for a, b in zip([1,1,0,0], [0,1,1,0]):
    print(df.loc[(df.image_correct == a) & (df.caption_correct == b)].shape)

(2829, 11)
(528, 11)
(738, 11)
(4005, 11)
