In [7]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'
import time
import argparse
from tqdm import tqdm

import pandas as pd
import pickle
import jsonlines
import json

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import numpy as np
import sys
from pathlib import Path
from FiDT5 import FiDT5

import random
from beir_eval import run_direct_rerank_eval
from beir_length_mapping import BEIR_LENGTH_MAPPING

In [8]:
def read_jsonl(path):
    data = []
    with jsonlines.open(path, 'r') as reader:
        for instance in reader:
            data.append(instance)
    return data

class ListT5Evaluator():
    def __init__(self, args):
        self.idx = 0
        self.imsi = []
        self.args = args
        self.tok = T5Tokenizer.from_pretrained(self.args.model_path)

        # For Evaluate all datasets (Using Folder as input_path)
        if not os.path.isdir(self.args.input_path):
            self.test_file = read_jsonl(self.args.input_path)
            print(f"Input path: {self.args.input_path}")
        self.idx2tokid = self.tok.encode(' '.join([str(x) for x in range(1, self.args.listwise_k+1)]))[:-1]
        self.model = self.load_model()
        self.num_forward = 0

    def write_json_file(self, path, data):
        with open(path, 'w') as f:
            json.dump(data, f, indent=4)
        print(f"Writing to {path} done!")

    def write_jsonl_file(self, path, data):
        if self.args.measure_flops:
            self.prof.stop_profile()
            self.flops = self.prof.get_total_flops()
        else:
            self.flops = 0
        # print(f"Flops: {self.flops}!")
        with jsonlines.open(path, 'w') as writer:
            writer.write_all(data)
        # print(f"Writing to {path} done!")

    def load_model(self):
        start = time.time()
        print("Loading model..")
        print(f"Loading fid model from {self.args.model_path}")
        print(f"Pooling type: {self.args.pooling_type}")
        model = FiDT5.from_pretrained(self.args.model_path,n_passages = self.args.topk, pooling_type=self.args.pooling_type,
                                       n_special_tokens=self.args.n_special_tokens, tokenizer=self.tok).to('cuda')

        end = time.time()
        print(f"Done! took {end-start} second")
        model.eval()
        if self.args.measure_flops:
            self.prof = FlopsProfiler(model)
            self.prof.start_profile()
        return model

    def make_input_tensors(self, texts):
        raw = self.tok(texts, return_tensors='pt',
                padding=self.args.padding, max_length=self.args.max_input_length,
                truncation=True).to('cuda')
        input_tensors = {'input_ids': raw['input_ids'].unsqueeze(0),
                'attention_mask': raw['attention_mask'].unsqueeze(0)}
        return input_tensors
    
    def make_listwise_text(self, question, ctxs, sep='|'):
        out = []
        if self.args.pooling_type == 'rv':
            for i in range(len(ctxs)):
                if self.args.n_special_tokens > 1:
                    special_str = "".join([f"<Relevance_{x}>" for x in range(1, 1+self.args.n_special_tokens)])
                    text = f"{special_str} | Query: {question} | Context: {ctxs[i]}"                    
                else:
                    text = f"<Relevance> | Query: {question} | Context: {ctxs[i]}"
                out.append(text)
        else:
            for i in range(len(ctxs)):
                if self.args.n_special_tokens > 1:
                    special_str = "".join([f"<extra_id_{x}>" for x in range(0, self.args.n_special_tokens)])
                    text = f"{special_str} | Query: {question} | Context: {ctxs[i]}"
                # text = f"<extra_id_17>, Query: {question}, Context: {ctxs[i]}"
                
                out.append(text)
        return out


    def run_inference(self, input_tensors):
        output = self.model.generate_by_single_logit(**input_tensors,
                                                     max_length = self.args.max_gen_length,
                                                     return_dict=False),
        self.num_forward += 1
        
        return output[0]
 
    def get_rel_index(self, output, mode='default', k=-1):
        if k == -1:
            k = self.args.out_k
        
        gen_out = None
        topk_possible = [str(x) for x in range(1, k+1)]
        
        if mode=='default':
            gen_out = self.tok.batch_decode(output.sequences, skip_special_tokens=True)
            gen_out = gen_out[0].split(' ')
        elif mode=='logit':
            topk_logits = output.scores[0].topk(k + 10).indices
            gen_out = [x.split() for x in self.tok.batch_decode(topk_logits, skip_special_tokens=True)][0]
        
        print("Model output: ", gen_out)        
        out_rel_indexes = []
        for i, x in enumerate(gen_out):
            if x in topk_possible:
                out_rel_indexes.append(x)
                topk_possible.remove(x)
        
        if len(out_rel_indexes) < k:
            if 'rev' in self.args.model_path:
                out_rel_indexes = out_rel_indexes + topk_possible
            else:    
                out_rel_indexes = topk_possible[::-1] + out_rel_indexes

        return out_rel_indexes

    def direct_rerank(self, question, ctxs, k=-1):
        full_input_texts = self.make_listwise_text(question, ctxs)
        try:
            input_tensors = self.make_input_tensors(full_input_texts)
        except:
            import IPython;
            IPython.embed()
            exit()
        output = self.run_inference(input_tensors)

        out_k_rel_index = [str(x+1) for x in output[0]]

        return out_k_rel_index
    
    def run_direct_rerank(self):
        reranked_instances = []
        len_question = []
        for instance in tqdm(self.test_file):

            question = instance[self.args.question_text_key]
            items = instance[self.args.firststage_result_key][:self.args.topk]

            if self.args.initial == 'origin':
                pass
            elif self.args.initial == 'reverse':
                items = items[::-1]
            elif self.args.initial == 'random':
                random.shuffle(items)
            topk_ctxs = [x[self.args.text_key] for x in items]
            self.model.n_passages = len(topk_ctxs)
            # self.model.encoder.encoder_batch_size = self.args.encoder_batch_size
            len_question.append(len(question))

            if len(topk_ctxs) > 0:
                index = self.direct_rerank(question, topk_ctxs, k=self.args.topk)
            else:
                # If no candidate passages are available, skip the instance
                index = []
            reranked_items = []

            for i, pid in enumerate(index):
                pid = int(pid) - 1
                template  = items[pid]
                template['orig_'+self.args.score_key] = template[self.args.score_key]
                template[self.args.score_key] = 100000 - i                

                reranked_items.append(template)
            instance[self.args.firststage_result_key] = reranked_items

            reranked_instances.append(instance)

        self.write_jsonl_file(self.args.output_path, reranked_instances)
        ndcg_k, scores = run_direct_rerank_eval(self.args.output_path, k=self.args.topk)

        if self.args.store_result:
            data_name = self.args.input_path.split('/')[-1].split('.')[0]
            result_file = f"./result_{data_name}.txt"
            log_str = ""
            log_str += f"MODEL : {self.args.model_path}\n"
            log_str += f"n_special_tokens : {self.args.n_special_tokens}\n"
            log_str += f"Token loc : {self.args.special_loc}\n"
            log_str += f"ndcg@10 : {ndcg_k}\n"
            log_str += "==================================================\n"
            with open(result_file, "a", encoding="utf-8") as f:
                f.write(log_str)
        return ndcg_k, scores

In [9]:
def run_reranker(args):
    module = ListT5Evaluator(args)

    he = time.time()
    ndcg_10, scores = module.run_direct_rerank()
    hehe = time.time()
    print(f"Total elapsed time: {hehe-he}")    
    print("Elasped time per query: ", (hehe-he)/len(module.test_file))
    if args.measure_flops:
        flops = module.flops
        num_forward = module.num_forward
    else:
        flops = 0
        num_forward = 0

    return ndcg_10, scores, flops, num_forward

In [10]:
def parse_arg(args=None):
    parser = argparse.ArgumentParser()
    # Dataset key setup
    parser.add_argument('--firststage_result_key', default='bm25_results', type=str)
    parser.add_argument('--docid_key', default='docid', type=str)
    parser.add_argument('--pid_key', default='pid', type=str)
    parser.add_argument('--qrels_key', default='qrels', type=str)
    parser.add_argument('--score_key', default='bm25_score', type=str)
    parser.add_argument('--question_text_key', default='q_text', type=str)
    parser.add_argument('--text_key', default='text', type=str)
    parser.add_argument('--title_key', default='title', type=str)
    parser.add_argument('--pooling_type', default=None, type=str)
    parser.add_argument('--n_special_tokens', default=1, type=int)
    parser.add_argument('--store_result', default=False, type=bool)
    parser.add_argument('--softmax_temp', default=1.0, type=float)
    parser.add_argument('--device', default='cuda:4', type=str) # cuda0, cuda1, cpu
    parser.add_argument('--model_path', default='Soyoung97/ListT5-base', type=str)
    parser.add_argument('--topk', default=100, type=int, help='number of initial candidate passages to consider') 
    parser.add_argument('--score_mode', default='default', type=str, help='default or logit')
    
    parser.add_argument('--max_input_length', type=int, default=-1) # depends on each individual data setup
    parser.add_argument('--padding', default='max_length', type=str)
    parser.add_argument('--listwise_k', default=5, type=int)
    parser.add_argument('--rerank_topk', default=10, type=int)
    parser.add_argument('--decoding_strategy', default='single', type=str)
    parser.add_argument('--target_seq', default='token', type=str)

    parser.add_argument('--encoder-batch-size', default=100, type=int) # Because of the memory issue, we need to Devide the input into small batch size. (max_input_length -> encoder-batch-size. 256 -> 100, 512 -> 50. 1024 -> 25 in 24GB gpu)

    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--input_path', type=str, default='./trec-covid.jsonl')
    parser.add_argument('--output_path', type=str, default='./outputs/trec-covid.jsonl')
    parser.add_argument('--special_loc', default=0, type=int)
    # Testing positional Bias
    parser.add_argument('--initial', default='origin', type=str)

    # profiling setup
    parser.add_argument('--measure_flops', action='store_true')
    parser.add_argument('--skip_no_candidate', action='store_true', help='skip instances with no gold qrels included at first-stage retrieval for faster inference, only works when gold qrels are available')
    parser.add_argument('--skip_issubset', action='store_true', help='skip the rest of reranking when the gold qrels is a subset of reranked output for faster inference, only works when gold qrels are available')
    
    return parser.parse_args(args)

In [11]:
arguments = ['--input_path', './eval_data/baseline/dl19.jsonl',
             '--output_path', './outputs/listt5-dl19_default.jsonl',
             '--topk', '100',
             '--pooling_type', 'extra',
             '--n_special_tokens', '4',
             '--model_path', '/data/kjun/checkpoints/MVT5_v2_first/MVT5_s_11_special_4_seed_0_first_0_base_extra/tfmr_7_step2496']
args = parse_arg(arguments)
args.max_gen_length = args.topk + 1
print(args)

Namespace(firststage_result_key='bm25_results', docid_key='docid', pid_key='pid', qrels_key='qrels', score_key='bm25_score', question_text_key='q_text', text_key='text', title_key='title', pooling_type='extra', n_special_tokens=4, store_result=False, softmax_temp=1.0, device='cuda:4', model_path='/data/kjun/checkpoints/MVT5_v2_first/MVT5_s_11_special_4_seed_0_first_0_base_extra/tfmr_7_step2496', topk=100, score_mode='default', max_input_length=-1, padding='max_length', listwise_k=5, rerank_topk=10, decoding_strategy='single', target_seq='token', encoder_batch_size=100, seed=0, input_path='./eval_data/baseline/dl19.jsonl', output_path='./outputs/listt5-dl19_default.jsonl', special_loc=0, initial='origin', measure_flops=False, skip_no_candidate=False, skip_issubset=False, max_gen_length=101)


In [14]:
ndcgs_per_view_all_dl19 = []
reranked_instances_per_view_all_dl19 = []

ndcgs_per_view_all_dl20 = []
reranked_instances_per_view_all_dl20 = []

data_path = ['./eval_data/baseline/dl19.jsonl',
             './eval_data/baseline/dl20.jsonl']

# model_path = ['/home/tako/kjun/checkpoints/listt5/test/ortho_weight/10',
#               '/home/tako/kjun/checkpoints/listt5/test/ortho_weight/100',
#               '/home/tako/kjun/checkpoints/listt5/test/ortho_weight/1000',
#               '/home/tako/kjun/checkpoints/listt5/test/ortho_weight/10000']
model_path = ['/data/kjun/checkpoints/MVT5_v2_first/MVT5_s_11_special_4_seed_0_first_0_base_extra/tfmr_7_step2496']

for path in data_path:
    for model_p in model_path:
        args.input_path = path
        args.model_path = model_p
        ndcgs_per_view = []
        reranked_instances_per_view = []

        module = ListT5Evaluator(args)
        for name in BEIR_LENGTH_MAPPING:
            if name in module.args.input_path:
                module.args.max_input_length = BEIR_LENGTH_MAPPING[name]

        torch.cuda.empty_cache()    
        
        reranked_instances = []
        len_question = []
        ndcg_query = []
        for instance in tqdm(module.test_file):
            question = instance[module.args.question_text_key]
            items = instance[module.args.firststage_result_key][:module.args.topk]
            topk_ctxs = [x[module.args.text_key] for x in items]
            qrels = instance[module.args.qrels_key]
            
            module.model.n_passages = len(topk_ctxs)
            module.model.encoder.encoder_batch_size = module.args.encoder_batch_size
            len_question.append(len(question))
            
            full_input_texts = module.make_listwise_text(question, topk_ctxs)
            input_tensors = module.make_input_tensors(full_input_texts)
            
            output = module.run_inference(input_tensors)
            out_k_rel_index = [str(x+1) for x in output[0]]
            break
            reranked_items = []
            
            for i, pid in enumerate(out_k_rel_index):
                pid = int(pid) - 1
                template  = items[pid]
                template['orig_'+module.args.score_key] = template[module.args.score_key]
                template[module.args.score_key] = 100000 - i                

                reranked_items.append(template)
            instance[module.args.firststage_result_key] = reranked_items

            reranked_instances.append(instance)
            
            postfix = f"query{i+1}"
            output_path = module.args.output_path.replace('.jsonl', f'_{postfix}.jsonl')
            module.write_jsonl_file(output_path, [instance])
            ndcg_k, scores = run_direct_rerank_eval(
                # module.args.model_path,
                # module.args.input_path,
                data_path=output_path,
                k=module.args.topk
            )
            ndcg_query.append(ndcg_k)
        ndcgs_per_view.append(ndcg_query)
        reranked_instances_per_view.append(reranked_instances)
        
    if 'dl19' in path:
        ndcgs_per_view_all_dl19.append(ndcgs_per_view)
        reranked_instances_per_view_all_dl19.append(reranked_instances_per_view)
    elif 'dl20' in path:
        ndcgs_per_view_all_dl20.append(ndcgs_per_view)
        reranked_instances_per_view_all_dl20.append(reranked_instances_per_view)

Input path: ./eval_data/baseline/dl19.jsonl
Loading model..
Loading fid model from /data/kjun/checkpoints/MVT5_v2_first/MVT5_s_11_special_4_seed_0_first_0_base_extra/tfmr_7_step2496
Pooling type: extra
Done! took 2.3137171268463135 second


  0%|          | 0/43 [00:00<?, ?it/s]


Input path: ./eval_data/baseline/dl20.jsonl
Loading model..
Loading fid model from /data/kjun/checkpoints/MVT5_v2_first/MVT5_s_11_special_4_seed_0_first_0_base_extra/tfmr_7_step2496
Pooling type: extra
Done! took 2.2600021362304688 second


  0%|          | 0/54 [00:00<?, ?it/s]


In [13]:
len(ndcgs_per_view_all_dl19)

1

In [26]:
backup_reranked_instances_per_view_all_dl19 = reranked_instances_per_view_all_dl19
backup_reranked_instances_per_view_all_dl20 = reranked_instances_per_view_all_dl20
backup_ndcgs_per_view_all_dl19 = ndcgs_per_view_all_dl19
backup_ndcgs_per_view_all_dl20 = ndcgs_per_view_all_dl20

In [28]:
#ndcgs_per_view to df
# list to ndarray
ndcgs_df_dl19 = []
for i in range(len(ndcgs_per_view_all_dl19)):
    ndcgs_per_view = np.array(ndcgs_per_view_all_dl19[i])
    ndcgs_df = pd.DataFrame(ndcgs_per_view.transpose(), columns=['view0', 'view1', 'view2', 'view3'])
    ndcgs_df
    # row 별 평균과 표준편차 계산
    ndcgs_df['mean'] = ndcgs_df.mean(axis=1)
    ndcgs_df['std'] = ndcgs_df.std(axis=1)
    ndcgs_df_dl19.append(ndcgs_df)

ndcgs_df_dl20 = []

for i in range(len(ndcgs_per_view_all_dl20)):
    ndcgs_per_view = np.array(ndcgs_per_view_all_dl20[i])
    ndcgs_df = pd.DataFrame(ndcgs_per_view.transpose(), columns=['view0', 'view1', 'view2', 'view3'])
    ndcgs_df
    # row 별 평균과 표준편차 계산
    ndcgs_df['mean'] = ndcgs_df.mean(axis=1)
    ndcgs_df['std'] = ndcgs_df.std(axis=1)
    ndcgs_df_dl20.append(ndcgs_df)


In [32]:
start = 10
for i in range(len(ndcgs_df_dl19)):
    ndcgs_df_19_max = ndcgs_df_dl19[i].assign(max=ndcgs_df_dl19[i].iloc[:,:4].max(axis=1))
    ndcgs_df_20_max = ndcgs_df_dl20[i].assign(max=ndcgs_df_dl20[i].iloc[:,:4].max(axis=1))

    ndcgs_df_19_max_mean = ndcgs_df_19_max['max'].mean().round(4)
    ndcgs_df_20_max_mean = ndcgs_df_20_max['max'].mean().round(4)

    print(f'ORACLE Weight {start} NDCG@10: {ndcgs_df_19_max_mean}, {ndcgs_df_20_max_mean}')

    start = start * 10
    # ndcgs_df.to_csv(f'./outputs/ndcgs_dl19_{i}.csv', index=False)

ORACLE Weight 10 NDCG@10: 0.7536, 0.72
ORACLE Weight 100 NDCG@10: 0.7528, 0.7286
ORACLE Weight 1000 NDCG@10: 0.7539, 0.7298
ORACLE Weight 10000 NDCG@10: 0.7552, 0.7303


In [33]:
# 각 row에서 max인 cell 의 value 추출
ndcgs_df_max = ndcgs_df_dl20[3].assign(max=ndcgs_df.iloc[:, :4].max(axis=1))
ndcgs_df_max['max'].mean()

0.7302842592592592

In [34]:
# soft by std
ndcgs_df = ndcgs_df_dl20[3].sort_values('std', ascending=False)
ndcgs_df

Unnamed: 0,view0,view1,view2,view3,mean,std
51,0.75451,0.86365,0.68322,0.74486,0.76156,0.06497714
40,0.67139,0.70392,0.81265,0.80704,0.74875,0.06219974
14,0.83667,0.73327,0.9005,0.79548,0.81648,0.06089315
3,0.69983,0.58966,0.59403,0.65549,0.634752,0.04570771
11,0.77724,0.69963,0.74705,0.67989,0.725952,0.03837362
52,0.22709,0.19697,0.24752,0.29634,0.24198,0.03617082
1,0.75429,0.77023,0.84117,0.77199,0.78442,0.03348224
15,0.83466,0.74789,0.79026,0.78343,0.78906,0.03085233
31,0.77437,0.82925,0.84676,0.84745,0.824458,0.02982357
37,0.83622,0.82289,0.76148,0.82535,0.811485,0.02930287


In [106]:
ndcgs_df = ndcgs_df.sort_index()
ndcgs_df
# Store the result

ndcgs_df.to_csv(f'./outputs/ndcgs_dl20.csv', index=False)

In [35]:
import pandas as pd

# 예시: ndcgs_df라는 이름의 DataFrame이 있고, 그 안에
# 'view0', 'view1', 'view2', 'view3' 열이 존재한다고 가정합니다.

# 각 행에서 최대값을 가지는 열 이름을 찾은 뒤, 그 빈도를 집계
max_counts = ndcgs_df[['view0', 'view1', 'view2', 'view3']] \
    .idxmax(axis=1) \
    .value_counts()

print(max_counts)


view0    25
view2    11
view3    10
view1     8
Name: count, dtype: int64


In [45]:
arguments = ['--input_path', './eval_data/baseline/dl20.jsonl',
             '--output_path', './outputs/listt5-dl20_default.jsonl',
             '--topk', '100',
             '--pooling_type', 'rv',
             '--n_special_tokens', '4',
             '--model_path', '/home/tako/kjun/checkpoints/temp/0.5_0.0/tfmr_7_step2496']
args = parse_arg(arguments)
args.max_gen_length = args.topk + 1
print(args)

model_path = ['/home/tako/kjun/checkpoints/listt5/test/ortho_weight/10000']

ranking_score_all = []
lhs_all = []
psg_emb_all = []

for path in model_path:
    args.model_path = path
    module = ListT5Evaluator(args)
    for name in BEIR_LENGTH_MAPPING:
        if name in module.args.input_path:
            module.args.max_input_length = BEIR_LENGTH_MAPPING[name]

    torch.cuda.empty_cache()    

    ranking_score = []
    lhs = []
    psg_emb = []
    for instance in tqdm(module.test_file):
        question = instance[module.args.question_text_key]
        items = instance[module.args.firststage_result_key][:module.args.topk]
        topk_ctxs = [x[module.args.text_key] for x in items]
        qrels = instance[module.args.qrels_key]
        
        module.model.n_passages = len(topk_ctxs)
        module.model.encoder.encoder_batch_size = module.args.encoder_batch_size
        # len_question.append(len(question))
        
        full_input_texts = module.make_listwise_text(question, topk_ctxs)
        input_tensors = module.make_input_tensors(full_input_texts)
        outputs = module.model.forward(input_ids=input_tensors['input_ids'], attention_mask=input_tensors['attention_mask'],)
        ranking_score.append(outputs.ranking.cpu())
        lhs.append(outputs.last_hidden_state.cpu())
        psg_emb.append(outputs.passage_embed.cpu())
    
    ranking_score_all.append(torch.stack(ranking_score, dim=0))
    lhs_all.append(torch.stack(lhs, dim=0))
    psg_emb_all.append(torch.stack(psg_emb, dim=0))
    

Namespace(firststage_result_key='bm25_results', docid_key='docid', pid_key='pid', qrels_key='qrels', score_key='bm25_score', question_text_key='q_text', text_key='text', title_key='title', pooling_type='rv', n_special_tokens=4, store_result=False, softmax_temp=1.0, device='cuda:4', model_path='/home/tako/kjun/checkpoints/temp/0.5_0.0/tfmr_7_step2496', topk=100, score_mode='default', max_input_length=-1, padding='max_length', listwise_k=5, rerank_topk=10, decoding_strategy='single', target_seq='token', encoder_batch_size=100, seed=0, input_path='./eval_data/baseline/dl20.jsonl', output_path='./outputs/listt5-dl20_default.jsonl', special_loc=0, initial='origin', measure_flops=False, skip_no_candidate=False, skip_issubset=False, max_gen_length=101)
Input path: ./eval_data/baseline/dl20.jsonl
Loading model..
Loading fid model from /home/tako/kjun/checkpoints/listt5/test/ortho_weight/10000
Pooling type: rv
Done! took 3.4574882984161377 second


100%|██████████| 54/54 [00:21<00:00,  2.57it/s]


In [46]:
ranking_score = ranking_score_all[0]
ranking_score.shape

torch.Size([54, 4, 1, 100])

In [108]:
store_result = ranking_score.squeeze(2).cpu().numpy()
store_result.shape


(54, 4, 100)

In [109]:
# Save the result

with open('./outputs/ranking_score.pkl', 'wb') as f:
    pickle.dump(store_result, f)


In [79]:
top10_values, top10_indices = torch.topk(ranking_score, k=10, dim=-1)

In [86]:
ranking_score.shape

torch.Size([54, 4, 1, 100])

In [87]:
# min-max scaling
min_val = ranking_score.min(dim=-1, keepdim=True).values
max_val = ranking_score.max(dim=-1, keepdim=True).values

ranking_score_minmax = (ranking_score - min_val) / (max_val - min_val + 1e-8)


In [93]:
ranking_score_minmax[0].max(dim=-1), ranking_score_minmax[0].min(dim=-1)

(torch.return_types.max(
 values=tensor([[1.],
         [1.],
         [1.],
         [1.]]),
 indices=tensor([[44],
         [44],
         [44],
         [44]])),
 torch.return_types.min(
 values=tensor([[0.],
         [0.],
         [0.],
         [0.]]),
 indices=tensor([[98],
         [98],
         [93],
         [85]])))

In [105]:
topk_indices, topk_values = torch.topk(ranking_score_minmax, k=10, dim=-1)


import torch.nn.functional as F

query_max = []
query_min = []

for i in range(topk_values.size(0)):
    prob = F.softmax(top10_values[i], dim=-1)  # shape = [4, 10]
    entropy = -torch.sum(prob * torch.log2(prob), dim=-1)  # shape = [4]
    max_idx = torch.argmax(entropy)
    min_idx = torch.argmin(entropy)
    print(f"{i}\t{max_idx}\t{min_idx}")
    query_max.append(max_idx)
    query_min.append(min_idx)

# print("각 샘플별 Entropy:", entropy)

0	2	1
1	2	1
2	0	1
3	3	1
4	2	1
5	2	1
6	2	1
7	2	1
8	2	1
9	2	1
10	2	1
11	2	1
12	2	1
13	2	1
14	2	1
15	2	1
16	2	1
17	2	1
18	2	1
19	2	1
20	2	1
21	2	1
22	2	1
23	2	1
24	2	1
25	2	1
26	2	1
27	2	1
28	2	1
29	0	1
30	2	1
31	2	1
32	2	1
33	2	1
34	2	1
35	2	1
36	2	1
37	2	1
38	2	1
39	2	1
40	2	1
41	2	1
42	2	1
43	2	1
44	2	1
45	2	1
46	2	1
47	3	1
48	2	1
49	2	1
50	2	1
51	2	1
52	2	1
53	2	1


In [116]:
import pickle
import pandas as pd
import numpy as np

# (1) ndcgs_dl20.csv 불러오기
df_ndcg_tmp = pd.read_csv('outputs/ndcgs_dl20.csv', header=0, usecols=[0, 1, 2, 3])  
#  -> csv의 구조에 따라 header가 있는 경우 header=0으로 변경
#  -> df_ndcg.shape가 (54, 4)라 가정

# (2) ranking_score.pkl 불러오기
with open('outputs/ranking_score.pkl', 'rb') as f:
    ranking_score_tmp = pickle.load(f)  # shape = (54, 4, 100)이라 가정

# (3) ranking_score의 요약 스칼라 만들기 (예: 평균)
#     결과적으로 shape = (54, 4)가 됨
mean_scores_tmp = ranking_score_tmp.mean(axis=2)   # axis=2 => 문서 차원을 평균
# mean_scores[q, v] => q번째 Query, v번째 View의 "평균 점수"

# (4) 2차원 데이터를 1차원으로 reshape
#     ndcg_values와 mean_values 모두 (54*4,) 형태가 됨
ndcg_values_tmp = df_ndcg_tmp.values.reshape(-1)        # shape=(216,)
mean_values_tmp = mean_scores_tmp.reshape(-1)           # shape=(216,)

# (5) 상관계수 계산 (피어슨, 스피어먼)
pearson_corr = np.corrcoef(mean_values_tmp, ndcg_values_tmp)[0, 1]

# 스피어먼 상관은 scipy.stats.spearmanr 등을 사용할 수 있음
# !pip install scipy
from scipy.stats import spearmanr
spearman_corr, _ = spearmanr(mean_values_tmp, ndcg_values_tmp)

print("평균 점수 vs nDCG Pearson 상관계수:", pearson_corr)
print("평균 점수 vs nDCG Spearman 상관계수:", spearman_corr)


평균 점수 vs nDCG Pearson 상관계수: 0.3909180881856023
평균 점수 vs nDCG Spearman 상관계수: 0.4226057714323844


In [115]:
df_ndcg_tmp.shape

(55, 4)

In [119]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

# (1) ndcgs_dl20.csv 불러오기 (형태: (54, 4) 가정)
df_ndcg_tmp = pd.read_csv('outputs/ndcgs_dl20.csv', header=0, usecols=[0, 1, 2, 3])  
# 불필요한 mean/std 열이 있다면 drop or usecols 등으로 (54, 4)만 남기기
# 예: df_ndcgs = df_ndcgs.drop(columns=["mean","std"])  # 열 이름이 mean, std인 경우
ndcg_array = df_ndcg_tmp.values  # shape (54, 4)

# (2) ranking_score.pkl 불러오기 (shape: (54, 4, 100))
import pickle
with open('outputs/ranking_score.pkl', 'rb') as f:
    ranking_score_tmp = pickle.load(f)

print("ndcg_array shape:", ndcg_array.shape)  # (54, 4) 확인
print("ranking_score shape:", ranking_score_tmp.shape)  # (54, 4, 100) 확인

# (3) 상위 K개만 집중
K = 10

# 아래 로직: 문서 점수를 내림차순으로 정렬한 뒤 앞쪽 K개를 선택
# np.sort(axis=-1)는 오름차순 정렬이므로, 뒤쪽 K개를 가져오거나, -x로 정렬하는 방식을 사용
# 여기서는 오름차순 정렬 후 마지막 K개를 가져오는 예시

# axis=2(마지막 축) 기준으로 오름차순 정렬
sorted_scores = np.sort(ranking_score_tmp, axis=2)  # shape 동일 (54, 4, 100) 
# sorted_scores[q, v, :]는 해당 (q,v)의 모든 문서를 낮은 점수->높은 점수 순으로 정렬

# 마지막 K개(가장 높은 K개) 추출
topk_scores = sorted_scores[:, :, -K:]  # shape=(54,4,K)

# (4) topK 점수 요약(예: 합, 평균, 최댓값 등)
topk_sum = topk_scores.sum(axis=2)     # shape=(54,4)  (K개 합)
topk_mean = topk_scores.mean(axis=2)   # shape=(54,4)  (K개 평균)
topk_max = topk_scores.max(axis=2)     # shape=(54,4)  (K개 중 최댓값)

# 필요에 따라 한 가지만 사용할 수도 있고, 여러 개를 비교해 볼 수도 있음.
# 여기서는 topk_sum를 예시로 들어서 nDCG와 상관관계 계산

sum_values = topk_sum.reshape(-1)        # (54*4=216,)
ndcg_values = ndcg_array.reshape(-1)     # (216,)

# (5) 상관계수 계산
pearson_corr = np.corrcoef(sum_values, ndcg_values)[0, 1]
spearman_corr, _ = spearmanr(sum_values, ndcg_values)

print("[Top-K Sum] Pearson 상관계수:", pearson_corr)
print("[Top-K Sum] Spearman 상관계수:", spearman_corr)

# (6) 필요하면 mean, max도 똑같이 해볼 수 있음
mean_values = topk_mean.reshape(-1)
max_values = topk_max.reshape(-1)

pearson_corr_mean = np.corrcoef(mean_values, ndcg_values)[0, 1]
spearman_corr_mean, _ = spearmanr(mean_values, ndcg_values)

print("[Top-K Mean] Pearson 상관계수:", pearson_corr_mean)
print("[Top-K Mean] Spearman 상관계수:", spearman_corr_mean)

pearson_corr_max = np.corrcoef(max_values, ndcg_values)[0, 1]
spearman_corr_max, _ = spearmanr(max_values, ndcg_values)

print("[Top-K Max] Pearson 상관계수:", pearson_corr_max)
print("[Top-K Max] Spearman 상관계수:", spearman_corr_max)


ndcg_array shape: (54, 4)
ranking_score shape: (54, 4, 100)
[Top-K Sum] Pearson 상관계수: 0.48608485138684937
[Top-K Sum] Spearman 상관계수: 0.5076618833006824
[Top-K Mean] Pearson 상관계수: 0.48608485128822115
[Top-K Mean] Spearman 상관계수: 0.5076618833006824
[Top-K Max] Pearson 상관계수: 0.18103478577259868
[Top-K Max] Spearman 상관계수: 0.22675615726941933


In [120]:
import numpy as np
import pandas as pd
import pickle
from scipy.stats import spearmanr

# (1) nDCG 정보 불러오기 (CSV)
df_ndcg_tmp = pd.read_csv('outputs/ndcgs_dl20.csv', header=0, usecols=[0, 1, 2, 3])  
# 만약 mean/std 열이 있으면 drop 등으로 (54,4)만 남기세요.
# 예: df_ndcgs = df_ndcgs.drop(columns=['mean','std'])  # 열 이름이 mean, std 인 경우
ndcg_array = df_ndcg_tmp.values  # shape (54, 4) 가정

# (2) 문서 점수(ranking_score) 불러오기
with open('outputs/ranking_score.pkl', 'rb') as f:
    ranking_score_temp = pickle.load(f)  # shape (54, 4, 100)

print("ndcg_array shape:", ndcg_array.shape)        # (54, 4)
print("ranking_score shape:", ranking_score_temp.shape)  # (54, 4, 100)

# (3) 상위 K개 설정
K = 10

# 오름차순 정렬 후 마지막 K개가 "가장 높은 점수 K개"
sorted_scores = np.sort(ranking_score_temp, axis=2)   # shape (54,4,100)
topk_scores = sorted_scores[:, :, -K:]           # shape (54,4,K)

# (4) 상위 K개 점수 합
topk_sum = topk_scores.sum(axis=2)  # shape (54,4)

# (5) Query별로 "Top-K 합이 가장 큰 View"를 선택
best_view_pred = topk_sum.argmax(axis=1)  # shape (54,)

# (6) 선택한 View의 nDCG를 모아서 평균
selected_ndcgs = []
for q in range(topk_sum.shape[0]):  # 54개 Query
    v_pred = best_view_pred[q]
    selected_ndcgs.append(ndcg_array[q, v_pred])

mean_ndcg = np.mean(selected_ndcgs)
print(f"[Top-K Sum 기반 View 선택] 평균 nDCG: {mean_ndcg:.4f}")


ndcg_array shape: (54, 4)
ranking_score shape: (54, 4, 100)
[Top-K Sum 기반 View 선택] 평균 nDCG: 0.7062


In [123]:
lhs_all[0].shape, psg_emb_all[0].shape

(torch.Size([54, 4, 1, 768]), torch.Size([54, 4, 100, 768]))

In [129]:
test = lhs_all[0][0]
print(test.norm(dim=-1))
print(psg_emb_all[0][0].norm(dim=-1).mean(dim=-1))

tensor([[12.7543],
        [20.2297],
        [14.9376],
        [11.3495]])
tensor([2.0990, 2.2739, 1.9247, 2.0057])


In [146]:
view_max_indices = []
for i in range(ranking_score_all[0].size(0)):
    # Print MAX & MiN Indices of LOGITS

    max_index = ranking_score_all[0][i].max(dim=-1).indices
    min_index = ranking_score_all[0][i].min(dim=-1).indices
    print(f"Query {i+1} - Max Index: {max_index}")
    view_max_indices.append(max_index)

    # print(f"Query {i+1} - Max: {ranking_score_all[0][i].max()}, Min: {ranking_score_all[0][i].min()}")

Query 1 - Max Index: tensor([[44],
        [44],
        [44],
        [44]])
Query 2 - Max Index: tensor([[25],
        [25],
        [25],
        [25]])
Query 3 - Max Index: tensor([[11],
        [11],
        [55],
        [55]])
Query 4 - Max Index: tensor([[5],
        [0],
        [0],
        [0]])
Query 5 - Max Index: tensor([[0],
        [0],
        [0],
        [0]])
Query 6 - Max Index: tensor([[0],
        [0],
        [0],
        [0]])
Query 7 - Max Index: tensor([[24],
        [24],
        [24],
        [24]])
Query 8 - Max Index: tensor([[0],
        [0],
        [0],
        [0]])
Query 9 - Max Index: tensor([[0],
        [0],
        [0],
        [0]])
Query 10 - Max Index: tensor([[2],
        [3],
        [2],
        [2]])
Query 11 - Max Index: tensor([[10],
        [10],
        [10],
        [10]])
Query 12 - Max Index: tensor([[3],
        [3],
        [3],
        [3]])
Query 13 - Max Index: tensor([[1],
        [3],
        [1],
        [1]])
Query 14 - Max

In [141]:
test = ranking_score_all[0].sum(dim=1)
print(test.shape)

torch.Size([54, 1, 100])


In [147]:
for i in range(test.size(0)):
    # Print MAX & MiN Indices of LOGITS
    max_index = test[i].max(dim=-1).indices
    # min_index = test[i].min(dim=-1).indices
    print(f"Query {i+1} - Max Index: {max_index}")
    view_max_indices.append(max_index)
    # print(f"Query {i+1} - Max: {ranking_score_all[0][i].max()}, Min: {ranking_score_all[0][i].min()}")

Query 1 - Max Index: tensor([44])
Query 2 - Max Index: tensor([25])
Query 3 - Max Index: tensor([55])
Query 4 - Max Index: tensor([0])
Query 5 - Max Index: tensor([0])
Query 6 - Max Index: tensor([0])
Query 7 - Max Index: tensor([24])
Query 8 - Max Index: tensor([0])
Query 9 - Max Index: tensor([0])
Query 10 - Max Index: tensor([2])
Query 11 - Max Index: tensor([10])
Query 12 - Max Index: tensor([3])
Query 13 - Max Index: tensor([1])
Query 14 - Max Index: tensor([40])
Query 15 - Max Index: tensor([3])
Query 16 - Max Index: tensor([68])
Query 17 - Max Index: tensor([4])
Query 18 - Max Index: tensor([7])
Query 19 - Max Index: tensor([7])
Query 20 - Max Index: tensor([2])
Query 21 - Max Index: tensor([2])
Query 22 - Max Index: tensor([8])
Query 23 - Max Index: tensor([87])
Query 24 - Max Index: tensor([0])
Query 25 - Max Index: tensor([34])
Query 26 - Max Index: tensor([4])
Query 27 - Max Index: tensor([29])
Query 28 - Max Index: tensor([6])
Query 29 - Max Index: tensor([1])
Query 30 - Ma

In [150]:
view_max_indices_tensor = torch.stack(view_max_indices[:54], dim=0)

In [152]:
sum_max_indices_tensor = torch.stack(view_max_indices[54:], dim=0)

In [153]:
view_max_indices_tensor.shape, sum_max_indices_tensor.shape

(torch.Size([54, 4, 1]), torch.Size([54, 1]))

In [154]:
# view_max_indices_tensor: [54, 4, 1] → squeeze to [54, 4]
view_max = view_max_indices_tensor.squeeze(-1)  # [54, 4]

# sum_max_indices_tensor: [54, 1]
sum_max = sum_max_indices_tensor  # [54, 1]

# concat along the last dimension (dim=1)
combined = torch.cat([view_max, sum_max], dim=1)  # [54, 5]

# 보기 좋게 출력하려면
import pandas as pd

df = pd.DataFrame(combined.cpu().numpy(), columns=["View1", "View2", "View3", "View4", "SUM"])
print(df)


    View1  View2  View3  View4  SUM
0      44     44     44     44   44
1      25     25     25     25   25
2      11     11     55     55   55
3       5      0      0      0    0
4       0      0      0      0    0
5       0      0      0      0    0
6      24     24     24     24   24
7       0      0      0      0    0
8       0      0      0      0    0
9       2      3      2      2    2
10     10     10     10     10   10
11      3      3      3      3    3
12      1      3      1      1    1
13     40     40     40     40   40
14     35     35      3      3    3
15     68     68     68     68   68
16      4     14      4      4    4
17      7      9      7      7    7
18      7      7      7      7    7
19      2      2      2      1    2
20      2      2      2      2    2
21      7      8      8      8    8
22     87     87     89     87   87
23      1      0      0      0    0
24     34     34     34     34   34
25      4      4      4      4    4
26     29     29     29     

In [155]:
arguments = ['--input_path', './eval_data/baseline/dl20.jsonl',
             '--output_path', './outputs/listt5-dl20_default.jsonl',
             '--topk', '100',
             '--pooling_type', 'rv',
             '--n_special_tokens', '4',
             '--model_path', '/home/tako/kjun/checkpoints/temp/0.5_0.0/tfmr_7_step2496']
args = parse_arg(arguments)
args.max_gen_length = args.topk + 1
print(args)

model_path = ['/home/tako/kjun/checkpoints/temp/tfmr_0_step25000']

ranking_score_all_2 = []
lhs_all_2 = []
psg_emb_all_2 = []

for path in model_path:
    args.model_path = path
    module = ListT5Evaluator(args)
    for name in BEIR_LENGTH_MAPPING:
        if name in module.args.input_path:
            module.args.max_input_length = BEIR_LENGTH_MAPPING[name]

    torch.cuda.empty_cache()    

    ranking_score = []
    lhs = []
    psg_emb = []
    for instance in tqdm(module.test_file):
        question = instance[module.args.question_text_key]
        items = instance[module.args.firststage_result_key][:module.args.topk]
        topk_ctxs = [x[module.args.text_key] for x in items]
        qrels = instance[module.args.qrels_key]
        
        module.model.n_passages = len(topk_ctxs)
        module.model.encoder.encoder_batch_size = module.args.encoder_batch_size
        # len_question.append(len(question))
        
        full_input_texts = module.make_listwise_text(question, topk_ctxs)
        input_tensors = module.make_input_tensors(full_input_texts)
        outputs = module.model.forward(input_ids=input_tensors['input_ids'], attention_mask=input_tensors['attention_mask'],)
        ranking_score.append(outputs.ranking.cpu())
        lhs.append(outputs.last_hidden_state.cpu())
        psg_emb.append(outputs.passage_embed.cpu())
    
    ranking_score_all_2.append(torch.stack(ranking_score, dim=0))
    lhs_all_2.append(torch.stack(lhs, dim=0))
    psg_emb_all_2.append(torch.stack(psg_emb, dim=0))
    

Namespace(firststage_result_key='bm25_results', docid_key='docid', pid_key='pid', qrels_key='qrels', score_key='bm25_score', question_text_key='q_text', text_key='text', title_key='title', pooling_type='rv', n_special_tokens=4, store_result=False, softmax_temp=1.0, device='cuda:4', model_path='/home/tako/kjun/checkpoints/temp/0.5_0.0/tfmr_7_step2496', topk=100, score_mode='default', max_input_length=-1, padding='max_length', listwise_k=5, rerank_topk=10, decoding_strategy='single', target_seq='token', encoder_batch_size=100, seed=0, input_path='./eval_data/baseline/dl20.jsonl', output_path='./outputs/listt5-dl20_default.jsonl', special_loc=0, initial='origin', measure_flops=False, skip_no_candidate=False, skip_issubset=False, max_gen_length=101)
Input path: ./eval_data/baseline/dl20.jsonl
Loading model..
Loading fid model from /home/tako/kjun/checkpoints/temp/tfmr_0_step25000
Pooling type: rv
Done! took 3.4578142166137695 second


100%|██████████| 54/54 [00:21<00:00,  2.57it/s]


In [156]:
view_max_indices = []
for i in range(ranking_score_all_2[0].size(0)):
    # Print MAX & MiN Indices of LOGITS

    max_index = ranking_score_all_2[0][i].max(dim=-1).indices
    min_index = ranking_score_all_2[0][i].min(dim=-1).indices
    # print(f"Query {i+1} - Max Index: {max_index}")
    view_max_indices.append(max_index)

    # print(f"Query {i+1} - Max: {ranking_score_all_2[0][i].max()}, Min: {ranking_score_all_2[0][i].min()}")

test = ranking_score_all_2[0].sum(dim=1)
# print(test.shape) 
for i in range(test.size(0)):
    # Print MAX & MiN Indices of LOGITS
    max_index = test[i].max(dim=-1).indices
    # min_index = test[i].min(dim=-1).indices
    # print(f"Query {i+1} - Max Index: {max_index}")
    view_max_indices.append(max_index)
    # print(f"Query {i+1} - Max: {ranking_score_all_2[0][i].max()}, Min: {ranking_score_all[0][i].min()}")


view_max_indices_tensor = torch.stack(view_max_indices[:54], dim=0)
sum_max_indices_tensor = torch.stack(view_max_indices[54:], dim=0)


# view_max_indices_tensor: [54, 4, 1] → squeeze to [54, 4]
view_max = view_max_indices_tensor.squeeze(-1)  # [54, 4]

# sum_max_indices_tensor: [54, 1]
sum_max = sum_max_indices_tensor  # [54, 1]

# concat along the last dimension (dim=1)
combined = torch.cat([view_max, sum_max], dim=1)  # [54, 5]

# 보기 좋게 출력하려면
import pandas as pd

df = pd.DataFrame(combined.cpu().numpy(), columns=["View1", "View2", "View3", "View4", "SUM"])
print(df)


    View1  View2  View3  View4  SUM
0      44     13     13     13   13
1      25     25     25     25   25
2      35     30     55     55   55
3       5      0      0      0    5
4       0      0      0      0    0
5      23      0      0     49    0
6      24     24     24     24   24
7       0      0      0      0    0
8       0      0      0      0    0
9       2      3      2      2    2
10      2      0      0      0    0
11      3      3     13      3    3
12      1      2     11      6    1
13      1     40     40     40   40
14     35     35     37     28   35
15     68     68     68     68   68
16      4      4      0      4    4
17      7      9      0      7    7
18     15      7      7      7    7
19      0      0      5      0    0
20      2      4      0      2    2
21      7      1     13     13    7
22     30     89     30     13   30
23      0      0      0      0    0
24     63     34     99     63   63
25      4      4      4      4    4
26     29     29     29     