# Evaluation of Cross-modal Query Suggestion methods on CroQS benchmark

In [1]:
# Device string passed to the DeCap query-expansion loader and to the
# query-suggestion method factory inside load_qe_dict.
device_str = "cuda:0"
# Device string passed as `device_map` to the GroupCap LLM inside load_qe_dict.
# The trailing comment keeps the single-device alternative at hand.
device_str_llm = "cuda:1"#device_str

import os
from dotenv import load_dotenv

# Load variables from a .env file into the environment; later cells read
# HDF5_INDEX_FILE_PATH, DATA_DIRECTORY_ROOT and REPO_DIRECTORY_ROOT via os.getenv.
load_dotenv()

True

## Indexing of the COCO dataset

In [2]:
from lib.retrievalSystem import RetrievalSystem
import os

# Path of the HDF5 embeddings index; os.getenv returns None when the variable is unset.
hdf5_file_path = os.getenv("HDF5_INDEX_FILE_PATH")

print(f"[+] Going to build the index file in '{hdf5_file_path}' for the coco-dataset collection [+]")
# NOTE(review): os.path.join raises TypeError if DATA_DIRECTORY_ROOT is unset
# (os.getenv returns None) - confirm the .env file defines it.
coco_folder = os.path.join( os.getenv("DATA_DIRECTORY_ROOT"), "coco-dataset")
# Index the train2017 split. The captured log shows the call stopping early
# when the split is already recorded in the embeddings dataset.
RetrievalSystem.index(coco_folder, hdf5_file_path, "train2017")

[+] Going to build the index file in '/home/giacomo.pacini/thesis-data/hdf5-indexes/coco_train_val_2017_image_embeddings.h5' for the coco-dataset collection [+]
loading annotations into memory...
Done (t=13.11s)
creating index...
index created!
Starting indexing process...
Total images to index: 118287
Stopping because train2017 is in the already processed splits set of the embeddings dataset
embeddings dataset processed splits:  _train2017


In [3]:
# Index the val2017 split into the same HDF5 file.
# Relies on coco_folder and hdf5_file_path defined in the previous cell.
RetrievalSystem.index(coco_folder, hdf5_file_path, "val2017")

loading annotations into memory...
Done (t=0.61s)
creating index...
index created!
Starting indexing process...
Total images to index: 5000
Stopping because val2017 is in the already processed splits set of the embeddings dataset
embeddings dataset processed splits:  _val2017


## CroQS evaluation setup

In [4]:
from lib.CroQS import CroQS

# Benchmark definition file, resolved relative to the current working directory.
croQS_json_path = "./CroQS_Benchmark_v1.0.0.json"
# NOTE(review): os.path.join raises TypeError if REPO_DIRECTORY_ROOT is unset
# (os.getenv returns None) - confirm the .env file defines it.
evaluation_dump_path = os.path.join(os.getenv("REPO_DIRECTORY_ROOT"), "evaluation-dump.json" )
croQS = CroQS(croQS_json_path)
# NOTE(review): same literal as device_str in the first cell, but set independently;
# changing one does not change the other.
device = "cuda:0"
# Re-read from the environment, so this cell does not need the variable set by the indexing cell.
hdf5_file_path = os.getenv("HDF5_INDEX_FILE_PATH")
# Prepare the benchmark object for evaluation against the HDF5 index.
# presumably evaluation_dump_path is where computed suggestions/scores are cached - TODO confirm.
croQS.evaluation_init(hdf5_file_path, device, evaluation_dump_path=evaluation_dump_path)

In [5]:
# Number of queries returned by the benchmark object (sanity check on the loaded JSON).
len(croQS.list_queries())

50

In [6]:
from lib.methodsLoader import get_qe_methods_dict, add_method_to_dict

# Keys of the query-suggestion methods to evaluate; load_qe_dict passes this list
# as `methods_keys` to get_qe_methods_dict.
qe_methods_names = [
    'initial-query',
    'clipcap-on-centroids', 
    'decap-on-centroids', 
    # Disabled method, kept for reference:
    ##'clipcap-query_set-of-embeddings', 
    'decap-on-representatives', 
    'clipcap-on-representatives',
    'clipcap-prompting-on-centroids',
    'clipcap-prompting-on-representatives',
    ]

In [7]:
def load_qe_dict(ir_system, load_group_cap : bool = False, gc_type : str = "mistral7b"):
    """Build the dictionary of query-suggestion methods to evaluate.

    Reads the notebook-level globals ``device_str``, ``device_str_llm`` and
    ``qe_methods_names``.

    Args:
        ir_system: retrieval system object assigned to ``decap_qe.im2txt.ir_system``.
        load_group_cap: when True, also instantiate the GroupCap LLM and register
            its few-shot query-expansion method in the returned dictionary.
        gc_type: key selecting the GroupCap LLM. One of ``"gemma2b"``,
            ``"gemma2-2b"``, ``"mistral7b"``, ``"llama3-8b"``. Any other value
            prints a notice and falls back to ``"llama3-8b"``. Only used when
            ``load_group_cap`` is True.

    Returns:
        The methods dictionary produced by ``get_qe_methods_dict``, extended with
        a ``groupcap-<gc_type>-qe-few-shot-4`` entry when ``load_group_cap`` is True.
    """
    # NOTE(review): some of these names are not referenced below; they are kept
    # in case loading the stored DeCap object relies on the modules being imported.
    from lib.methods.decap.Im2TxtProjection import Im2TxtProjector, ProjectionType
    from lib.methods.decap.DecapQueryExpansion import DeCapQueryExpansion
    from lib.methods.decap.decap import get_decap_model
    from lib.methods.groupcap.model import GroupCapLLM, LLMBasePrompts, ImageCaptioningMethods, LLMTypes

    decap_qe = DeCapQueryExpansion.load_object(device_str, 'coco')
    assert isinstance(decap_qe, DeCapQueryExpansion)
    decap_qe.im2txt.ir_system = ir_system
    qe_dict = get_qe_methods_dict(methods_keys=qe_methods_names, decap_qe=decap_qe, device=device_str)

    if not load_group_cap:
        return qe_dict

    # Map the user-facing key to the LLMTypes member name. The member is resolved
    # lazily with getattr so that only the selected one has to exist.
    llm_member_by_key = {
        "gemma2b": "Gemma2b_IT",
        "gemma2-2b": "Gemma2_2b",
        "mistral7b": "Mistral7b_IT",
        "llama3-8b": "LLama3_8b_IT",
    }
    if gc_type not in llm_member_by_key:
        print("Using default LLM - LLama3-8b")
        gc_type = "llama3-8b"
    llm_type = getattr(LLMTypes, llm_member_by_key[gc_type])

    # The LLM goes on device_str_llm; the DeCap captioner stays on device_str.
    group_cap_obj = GroupCapLLM(ImageCaptioningMethods.decap, decap_qe.get_generated_captions, llm_type, device_map=device_str_llm, image_captioning_method_device=device_str)

    prompts_dict = GroupCapLLM.get_prompts_dict()

    group_cap_obj.qe_prompt = prompts_dict['query-expansion']['qe-few-shot-4']
    qe_dict = add_method_to_dict(qe_dict, f'groupcap-{gc_type}-qe-few-shot-4', group_cap_obj.generate_expanded_query, wants_query=True, method_group_name="img-embeddings-whole-cluster")

    return qe_dict

# Build the methods dictionary, including the GroupCap LLM method (second argument True).
qe_dict = load_qe_dict(croQS.ir_system, True)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
import pandas as pd

# Smoke test on a single query before running the whole benchmark.
q = "italy"
r = croQS.get_query_suggestions_dataframe_for_query(q, qe_dict)

# The result is indexable; the first element is displayed as the table below.
# presumably one element per cluster of the query - TODO confirm.
r[0]

Loading already existing qs for the method 'decap-on-representatives'esentatives'

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Loading already existing qs for the method 'decap-on-representatives'esentatives'

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Loading already existing qs for the method 'decap-on-representatives'esentatives'

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Loading already existing qs for the method 'decap-on-representatives'esentatives'

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Unnamed: 0,query,cluster,method,suggested-query,jaccard-q,clip-similarity-cluster,clip-similarity-q,NDCG@10,MAP,Recall-Open-Set-@100,Recall-Closed-Set
0,italy,0,initial-query,italy,1.0,0.264,1.0,0.129875,0.21552,0.527778,0.222222
1,italy,0,decap-on-centroids,a street is shown with a person on it .,0.0,0.27,0.701,0.0,0.018519,0.027778,0.666667
2,italy,0,human,italy streets,0.5,0.286,0.904,0.870125,0.724558,0.861111,0.833333
3,italy,0,clipcap-on-centroids,A street with a building and a parking meter.,0.0,0.247,0.654,0.0,0.016129,0.027778,0.694444
4,italy,0,clipcap-prompting-on-centroids,A street with a building and a motorcycle park...,0.0,0.238,0.558,0.0,0.017857,0.027778,0.666667
5,italy,0,clipcap-on-representatives,A cobblestone street with a bicycle parked on ...,0.0,0.235,0.493,0.112845,0.11875,0.111111,0.722222
6,italy,0,clipcap-prompting-on-representatives,A cobblestone street with a bicycle parked on ...,0.0,0.235,0.493,0.112845,0.11875,0.111111,0.722222
7,italy,0,decap-on-representatives,a street is parked on a small bike and a row .,0.0,0.235,0.635,0.0,0.0,0.0,0.694444
8,italy,0,groupcap-llama3-8b-qe-few-shot-4,italy,1.0,0.264,1.0,0.129875,0.21552,0.527778,0.222222
9,italy,0,groupcap-mistral7b-qe-few-shot-4,"Italy: scenes of streets with people, bicycle...",0.1,0.291,0.586,0.586795,0.42475,0.416667,0.861111


In [None]:
# Run every method in qe_dict over all benchmark queries.
# ret_df is the input of all the aggregation cells below.
ret_df = croQS.get_query_suggestions_dataframe_all_queries(qe_dict)

ret_df

In [10]:
# Aggregation used as the error measure in the second-level groupby ('std' here).
error_type = 'std'

# Step 1: average the scores of each (query, method) pair.
# numeric_only=True is required because ret_df also holds text columns
# ('cluster', 'suggested-query'); without it pandas warns on 1.x and raises on >= 2.0.
aggregated_df_2 = ret_df.groupby(['query', 'method']).mean(numeric_only=True).reset_index()
title_info = "Macro Averaged "

# Every column except the two grouping keys is a score column.
score_cols = [col for col in aggregated_df_2.columns if col not in ('query', 'method')]

# Step 2: group by method and compute mean and error for each score column
agg_dict = {col: ['mean', error_type] for col in score_cols}
grouped_2 = aggregated_df_2.groupby('method').agg(agg_dict).reset_index()
grouped_2

  aggregated_df_2 = ret_df.groupby(['query', 'method']).mean().reset_index()


Unnamed: 0_level_0,method,jaccard-q,jaccard-q,clip-similarity-cluster,clip-similarity-cluster,clip-similarity-q,clip-similarity-q,NDCG@10,NDCG@10,MAP,MAP,Recall-Open-Set-@100,Recall-Open-Set-@100,Recall-Closed-Set,Recall-Closed-Set
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
0,clipcap-on-centroids,0.137081,0.103053,0.291701,0.02187,0.722223,0.059104,0.296909,0.147855,0.267135,0.113899,0.397572,0.151151,0.545929,0.170823
1,clipcap-on-representatives,0.124887,0.108797,0.284917,0.02316,0.697049,0.063013,0.306293,0.144765,0.269908,0.112817,0.352138,0.144239,0.528525,0.153014
2,clipcap-prompting-on-centroids,0.166701,0.110075,0.292,0.021619,0.738401,0.061934,0.317051,0.15268,0.284914,0.120592,0.401025,0.154215,0.537269,0.159881
3,clipcap-prompting-on-representatives,0.146038,0.104653,0.285785,0.02371,0.717524,0.066748,0.321381,0.145194,0.281967,0.117821,0.365924,0.157768,0.524735,0.15733
4,decap-on-centroids,0.15786,0.117905,0.302181,0.020141,0.766875,0.061146,0.313076,0.150487,0.282401,0.115949,0.401899,0.155253,0.525882,0.175083
5,decap-on-representatives,0.12925,0.100213,0.294368,0.024613,0.726988,0.064764,0.303921,0.156587,0.26631,0.116174,0.3646,0.146145,0.516019,0.162761
6,groupcap-llama3-8b-qe-few-shot-4,0.557587,0.122979,0.287962,0.018466,0.898617,0.038812,0.352235,0.175422,0.320286,0.142281,0.470177,0.124122,0.394429,0.174507
7,groupcap-mistral7b-qe-few-shot-4,0.242145,0.087782,0.297468,0.017613,0.81286,0.050286,0.346741,0.159789,0.305593,0.11742,0.427078,0.126483,0.461925,0.19056
8,human,0.437972,0.112474,0.298296,0.015991,0.870624,0.032796,0.518537,0.139276,0.445153,0.108554,0.620265,0.121844,0.591542,0.126442
9,initial-query,1.0,0.0,0.279894,0.018076,1.0,0.0,0.177377,0.053571,0.210269,0.053614,0.518257,0.032359,0.190265,0.07174


### Micro-averaged results

In [11]:
# Micro average: mean of each numeric score over all rows of a method, in one pass.
# presumably ret_df has one row per (query, cluster, method), so queries with
# more clusters contribute more rows - TODO confirm.
grouped_df = ret_df.groupby(by='method').mean(numeric_only=True).reset_index()#.sort_values(by=sort_by_column, ascending=False)

grouped_df

Unnamed: 0,method,jaccard-q,clip-similarity-cluster,clip-similarity-q,NDCG@10,MAP,Recall-Open-Set-@100,Recall-Closed-Set
0,clipcap-on-centroids,0.134471,0.292264,0.722292,0.284567,0.257788,0.401399,0.515541
1,clipcap-on-representatives,0.120061,0.286017,0.701397,0.295257,0.260805,0.356807,0.501401
2,clipcap-prompting-on-centroids,0.165905,0.292881,0.741037,0.303769,0.274598,0.406179,0.509666
3,clipcap-prompting-on-representatives,0.14262,0.286841,0.719437,0.304425,0.268266,0.367279,0.498354
4,decap-on-centroids,0.157698,0.303336,0.767037,0.294332,0.269306,0.40481,0.495804
5,decap-on-representatives,0.128546,0.295814,0.728153,0.291639,0.258239,0.369056,0.485694
6,groupcap-llama3-8b-qe-few-shot-4,0.557363,0.288641,0.901281,0.320664,0.295183,0.46575,0.367526
7,groupcap-mistral7b-qe-few-shot-4,0.246336,0.297495,0.813095,0.321879,0.285799,0.418528,0.430926
8,human,0.437193,0.299312,0.870451,0.497983,0.430226,0.626201,0.571772
9,initial-query,1.0,0.280369,1.0,0.162076,0.194837,0.515198,0.170162


### Macro-averaged results

In [12]:
import pandas as pd

# Macro average, step 1: mean score of every method within each query.
# A single two-key groupby replaces the per-query loop that grew a DataFrame
# with pd.concat and borrowed its column list from `grouped_df`.
# The 'query' key is dropped so tmp_df keeps only 'method' plus the numeric scores.
tmp_df = (
    ret_df
    .groupby(['query', 'method'])
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns='query')
)

# Step 2: average the per-query means, so every query weighs the same.
macro_averaged_df = tmp_df.groupby(by='method').mean(numeric_only=True).reset_index()
macro_averaged_df

Unnamed: 0,method,jaccard-q,clip-similarity-cluster,clip-similarity-q,NDCG@10,MAP,Recall-Open-Set-@100,Recall-Closed-Set
0,clipcap-on-centroids,0.137081,0.291701,0.722223,0.296909,0.267135,0.397572,0.545929
1,clipcap-on-representatives,0.124887,0.284917,0.697049,0.306293,0.269908,0.352138,0.528525
2,clipcap-prompting-on-centroids,0.166701,0.292,0.738401,0.317051,0.284914,0.401025,0.537269
3,clipcap-prompting-on-representatives,0.146038,0.285785,0.717524,0.321381,0.281967,0.365924,0.524735
4,decap-on-centroids,0.15786,0.302181,0.766875,0.313076,0.282401,0.401899,0.525882
5,decap-on-representatives,0.12925,0.294368,0.726988,0.303921,0.26631,0.3646,0.516019
6,groupcap-llama3-8b-qe-few-shot-4,0.557587,0.287962,0.898617,0.352235,0.320286,0.470177,0.394429
7,groupcap-mistral7b-qe-few-shot-4,0.242145,0.297468,0.81286,0.346741,0.305593,0.427078,0.461925
8,human,0.437972,0.298296,0.870624,0.518537,0.445153,0.620265,0.591542
9,initial-query,1.0,0.279894,1.0,0.177377,0.210269,0.518257,0.190265


In [12]:
# Inspect the column names of the evaluation frame (keys and score columns).
ret_df.columns

Index(['query', 'cluster', 'method', 'suggested-query', 'jaccard-q',
       'clip-similarity-cluster', 'clip-similarity-q', 'NDCG@10', 'MAP',
       'Recall-Open-Set-@100', 'Recall-Closed-Set'],
      dtype='object')

In [16]:
# Aggregation used as the error measure when scores are summarised per method.
error_type = 'std'

# Average the scores of each (query, method) pair.
# numeric_only=True is required because ret_df also holds text columns
# ('cluster', 'suggested-query'); without it pandas warns on 1.x and raises on >= 2.0.
aggregated_df = ret_df.groupby(['query', 'method']).mean(numeric_only=True).reset_index()
title_info = "Macro Averaged "

# Every column except the two grouping keys is a score column.
score_cols = [col for col in aggregated_df.columns if col not in ('query', 'method')]
score_cols

  aggregated_df = ret_df.groupby(['query', 'method']).mean().reset_index()


['jaccard-q',
 'clip-similarity-cluster',
 'clip-similarity-q',
 'NDCG@10',
 'MAP',
 'Recall-Open-Set-@100',
 'Recall-Closed-Set']

In [17]:
# Aggregation used as the error measure in the second-level groupby ('std' here).
error_type = 'std'

# Step 1: average the scores of each (query, method) pair.
# numeric_only=True is required because ret_df also holds text columns
# ('cluster', 'suggested-query'); without it pandas warns on 1.x and raises on >= 2.0.
aggregated_df = ret_df.groupby(['query', 'method']).mean(numeric_only=True).reset_index()
title_info = "Macro Averaged "

# Every column except the two grouping keys is a score column.
score_cols = [col for col in aggregated_df.columns if col not in ('query', 'method')]

# Step 2: group by method and compute mean and error for each score column
agg_dict = {col: ['mean', error_type] for col in score_cols}
grouped = aggregated_df.groupby('method').agg(agg_dict).reset_index()
grouped

Unnamed: 0_level_0,method,jaccard-q,jaccard-q,clip-similarity-cluster,clip-similarity-cluster,clip-similarity-q,clip-similarity-q,NDCG@10,NDCG@10,MAP,MAP,Recall-Open-Set-@100,Recall-Open-Set-@100,Recall-Closed-Set,Recall-Closed-Set
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
0,groupcap-llama3-8b-qe-few-shot-2,0.659922,0.157349,0.283306,0.018148,0.93984,0.029049,0.277522,0.144467,0.275342,0.112867,0.481073,0.113584,0.314199,0.136502
1,human,0.437972,0.112474,0.298296,0.015991,0.870624,0.032796,0.518537,0.139276,0.445149,0.10855,0.620265,0.121844,0.591542,0.126442
2,initial-query,1.0,0.0,0.279894,0.018076,1.0,0.0,0.177377,0.053571,0.210269,0.053614,0.518257,0.032359,0.190265,0.07174


In [9]:
# Query and cluster label used for the manual inspection in the next cell.
qq = 'person at the phone'
# Cluster labels are strings, as the list displayed below shows.
cc = '3'

# List the cluster labels available for the query (uses a private helper of CroQS).
croQS._list_clusters_labels(qq)

['0', '1', '2', '3']

In [10]:
# Stored suggestions and scores for the query/cluster pair chosen in the previous cell
# (requires qq and cc to be defined).
croQS.suggested_queries[qq][cc]

{'suggested-queries': {'initial-query': {'query': 'person at the phone',
   'scores': {'jaccard-q': 1.0,
    'clip-similarity-cluster': 0.294,
    'clip-similarity-q': 1.0,
    'NDCG@10': 0.23365080824310536,
    'MAP': 0.2868582289545055,
    'Recall-Open-Set-@100': 0.5128205128205128,
    'Recall-Closed-Set': 0.28205128205128205}},
  'groupcap-llama3-8b-qe-few-shot-2': {'query': 'person talking on a cell phone',
   'scores': {'jaccard-q': 0.5,
    'clip-similarity-cluster': 0.291,
    'clip-similarity-q': 0.934,
    'NDCG@10': 0.1100458831490401,
    'MAP': 0.13931386869090717,
    'Recall-Open-Set-@100': 0.358974358974359,
    'Recall-Closed-Set': 0.3076923076923077}},
  'human': {'query': 'woman talking on mobile phone',
   'scores': {'jaccard-q': 0.2,
    'clip-similarity-cluster': 0.317,
    'clip-similarity-q': 0.877,
    'NDCG@10': 0.39639187290150935,
    'MAP': 0.3438731991303363,
    'Recall-Open-Set-@100': 0.5128205128205128,
    'Recall-Closed-Set': 0.5897435897435898}}}}