In [1]:
import os
# Project root that every relative path in this notebook ("./experiments/...")
# resolves against. The historical absolute location stays as the default; set
# the DYNAMICQUERY_HOME environment variable to run from another checkout.
base_path = os.environ.get("DYNAMICQUERY_HOME", "/home/mshlis/Projects/RIET/DynamicQuery")
os.chdir(base_path)

from dynamicquery import utils
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch

In [2]:
class Clef2021RerankedDataset(TensorDataset):
    """Pairs each tokenized tweet with the embeddings of its ranked candidate claims.

    Args:
        encode_fn: Callable applied to every tweet string. ``__getitem__`` reads
            the ``"input_ids"`` and ``"attention_mask"`` entries of its result.
        claims: DataFrame with ``vclaim_id`` and ``vclaim`` columns.
        tweets: DataFrame with ``id`` and ``tweet`` columns.
        connections: DataFrame with a ``tweet_id`` column; the ``claim_id``
            column used for the second join presumably lives here as well.
        claim_embeddings: Array-like indexed with ``ranks[i]``
            (presumably one row per claim, in ``claims`` order -- confirm).
        ranks: Sequence indexed by sample position; ``ranks[i]`` selects the
            candidate rows of ``claim_embeddings`` for sample ``i``.

    Note:
        ``TensorDataset.__init__`` is never invoked, so ``self.tensors`` is not
        set; the class only relies on its own ``__len__`` / ``__getitem__``.
    """

    def __init__(self, 
                 encode_fn, 
                 claims, 
                 tweets, 
                 connections,
                 claim_embeddings,
                 ranks):
        self.claim_embeddings = claim_embeddings
        self.ranks = ranks
        # tweets -> connections -> claims: keep only tweets with a linked claim.
        run_tweets = tweets.join(connections.set_index("tweet_id"), on="id", how="inner")
        run_tweets = run_tweets.join(claims.set_index("vclaim_id"), on="claim_id", how="inner")
        run_tweets = run_tweets[["tweet", "vclaim"]].reset_index()
        run_tweets["encoded_tweet"] = run_tweets.tweet.apply(encode_fn)
        # Position of the first occurrence of every claim text, built in a single
        # pass instead of rebuilding and scanning the claim list for each tweet.
        first_position = {}
        for position, claim_text in enumerate(claims.vclaim.to_list()):
            first_position.setdefault(claim_text, position)
        self.claim_idx = [first_position[t_claim] for t_claim in run_tweets.vclaim.to_list()]
        self.data = run_tweets
        
    def __len__(self):
        """Number of (tweet, claim) pairs that survived the joins."""
        return len(self.data)
    
    def __getitem__(self, index):
        """Return ``((input_ids, attention_mask), candidate_embeddings)`` for one sample."""
        Xt = self.data.encoded_tweet[index]
        Xt = (np.array(Xt["input_ids"]), np.array(Xt["attention_mask"]))
        Xe = self.claim_embeddings[self.ranks[index]]
        # The gold-claim embedding is currently not returned:
        # Ye = self.claim_embeddings[self.claim_idx[index:index+1]]
        return (Xt, Xe)
    
    
def get_clef2021_reranked_dataloader(encode_fn, 
                            claims, 
                            tweets, 
                            connections, 
                            claim_embeddings,
                            ranks,
                            params={'batch_size':32, 'shuffle':True}):
    """Wrap a ``Clef2021RerankedDataset`` in a ``DataLoader``.

    The first six arguments are handed to the dataset constructor unchanged;
    ``params`` is unpacked as keyword arguments for ``DataLoader``.
    """
    reranked_dataset = Clef2021RerankedDataset(
        encode_fn, claims, tweets, connections, claim_embeddings, ranks
    )
    loader = DataLoader(reranked_dataset, **params)
    return loader

In [3]:
# Locations of the precomputed rank / embedding arrays inside the experiment directory.
exp_path = "./experiments/candidate_selection/finetune_st5_large_claims_negs"
train_neg_path, dev_neg_path, emb_path, tweet_emb_path = (
    os.path.join(exp_path, artifact_name)
    for artifact_name in ("ranks_train.npy", "ranks_dev.npy", "claim_embs.npy", "tweet_embs.npy")
)

In [4]:
# Sanity check: show the current working directory and the installed transformers version.
import transformers
os.getcwd(), transformers.__version__

('/home/mshlis/Projects/RIET/DynamicQuery', '4.17.0')

In [5]:
# Load the precomputed candidate ranks (train / dev) and the claim embeddings.
neg_ids = np.load(train_neg_path)
dev_neg_ids = np.load(dev_neg_path)
neg_embs = np.load(emb_path)
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load this
# file from a trusted source. Later cells index the result as
# tweet_embs[()]["train"] / ["dev"], i.e. it is used as a 0-d object array
# wrapping a mapping.
tweet_embs = np.load(tweet_emb_path, allow_pickle=True)

In [6]:
# Shapes of the train rank matrix and of the claim-embedding matrix.
neg_ids.shape, neg_embs.shape

((999, 13825), (13825, 768))

In [7]:
import extended_roberta_v2 as roberta
from transformers import AutoTokenizer
from functools import partial
import importlib
importlib.reload(roberta)

# Every tweet is truncated / padded to exactly this many tokens.
MAX_LENGTH = 192

model_str = "roberta-base"
model = roberta.ExtendedRobertaForExternalClassification.from_pretrained(model_str)
# Overwrite the base weights with the fine-tuned cross-query checkpoint.
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a
# CPU-only machine; the inference cells in this notebook run the model on CPU
# (they call .numpy() directly on its outputs).
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model.load_state_dict(
    torch.load(
        "./experiments/cross_query/base_rndm_large_neg_v2/trained_model.pt",
        map_location="cpu",
    )
)
tokenizer = AutoTokenizer.from_pretrained(model_str)
# Tokenizer with the truncation / padding policy pre-bound, so callers only pass text.
tokenize = partial(
    tokenizer,
    truncation=True,
    max_length=MAX_LENGTH,
    padding="max_length",
    return_attention_mask=True,
)

Some weights of the model checkpoint at roberta-base were not used when initializing ExtendedRobertaForExternalClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing ExtendedRobertaForExternalClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ExtendedRobertaForExternalClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ExtendedRobertaForExternalClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.adapter_layer.bias', 'classifier.weight', 'roberta.adapt

In [8]:
import re
def is_adapter(name):
    """Return True when a module name refers to an adapter layer.

    A name qualifies if it starts with ``roberta.encoder.adapter_layer.`` or is
    exactly ``roberta.adapter_layer``.
    """
    # Raw string so ``\.`` is a regex escape rather than an invalid Python string
    # escape; the dot after "roberta" is escaped too so it only matches a literal ".".
    check_v1 = re.match(r"roberta\.encoder\.adapter_layer\.", name) is not None
    check_v2 = name == "roberta.adapter_layer"
    return check_v1 or check_v2

# Print the name of every adapter module in the model. named_modules() yields
# (name, module) pairs, so the second element is a module, not a parameter.
for name, _module in model.named_modules():
    if is_adapter(name):
        print(name)

roberta.adapter_layer


In [9]:
# Claim Data
# Tweets, the train/dev/test tweet-to-claim links and the claims themselves,
# all loaded through the project's utils module.
tweets, test_tweets = utils.get_tweets()
# NOTE(review): drops the first test tweet -- the reason is not visible here
# (possibly a stray header row); confirm against utils.get_tweets().
test_tweets = test_tweets[1:]
train_conns, dev_conns, test_conns = utils.get_qrels()
claims = utils.get_claims()

In [10]:
# (dev_neg_ids[:, 0] == claim_idx[:]).mean()

In [11]:
import dataloaders
importlib.reload(dataloaders)

BATCH_SIZE = 32
# Number of leading rank columns (candidates per tweet) handed to the reranker.
TOP_K = 5


def _build_eval_dataloader(connections, ranked_ids, split):
    """Build the un-shuffled evaluation DataLoader for one data split.

    Args:
        connections: Tweet-to-claim links of the split.
        ranked_ids: Rank matrix of the split; only its first ``TOP_K`` columns are used.
        split: Key (``"train"`` or ``"dev"``) selecting the split's entry in ``tweet_embs``.
    """
    return dataloaders.get_clef2021_reranked_eval_dataloader(
        tokenize,
        claims,
        tweets,
        connections,
        neg_embs,
        ranked_ids[:, :TOP_K],
        # [()] unwraps the 0-d object array returned by np.load before keying by split.
        tweet_embs[()][split],
        # shuffle=False so batches come out in dataset order.
        params={'batch_size': BATCH_SIZE, 'shuffle': False})


train_dl = _build_eval_dataloader(train_conns, neg_ids, "train")
dev_dl = _build_eval_dataloader(dev_conns, dev_neg_ids, "dev")

In [12]:
# Put the model in evaluation mode before scoring.
model.eval()

# Score the train candidates batch by batch; no gradients are tracked.
probits = []
with torch.no_grad():
    for inputs, external_inputs in train_dl:
        out = model(
            input_ids=inputs[0],
            attention_mask=inputs[1],
            extended_states=external_inputs,
        )
        # Softmax over every logit column except the last one.
        _probits = torch.softmax(out.logits[:, :-1], dim=-1)
        probits.append(_probits.detach().numpy())

probits = np.concatenate(probits, axis=0)

In [14]:
# Score the dev candidates batch by batch; no gradients are tracked.
dev_probits = []
with torch.no_grad():
    for inputs, external_inputs in dev_dl:
        out = model(
            input_ids=inputs[0],
            attention_mask=inputs[1],
            extended_states=external_inputs,
        )
        # Softmax over every logit column except the last one.
        _probits = torch.softmax(out.logits[:, :-1], dim=-1)
        dev_probits.append(_probits.detach().numpy())

dev_probits = np.concatenate(dev_probits, axis=0)

In [16]:
# Per row: candidate positions ordered from highest to lowest score
# (ascending argsort, then reversed along the column axis).
reranks = np.fliplr(np.argsort(probits, axis=-1))
dev_reranks = np.fliplr(np.argsort(dev_probits, axis=-1))

In [17]:
# Shape of the train score matrix.
probits.shape

(999, 5)

In [18]:
# First five reranked candidate orderings: train, then dev.
print(reranks[:5])
print(dev_reranks[:5])

[[1 2 3 4 0]
 [0 4 1 3 2]
 [0 2 4 3 1]
 [0 2 1 4 3]
 [3 0 4 1 2]]
[[2 0 3 1 4]
 [0 4 1 2 3]
 [0 1 4 2 3]
 [4 0 1 2 3]
 [0 2 1 3 4]]


In [24]:
# Display the first five reranked candidate orderings again: train, then dev.
print(reranks[:5])
print(dev_reranks[:5])

[[1 2 3 4 0]
 [0 4 1 3 2]
 [0 2 4 3 1]
 [0 2 1 4 3]
 [3 0 4 1 2]]
[[2 0 3 1 4]
 [0 4 1 2 3]
 [0 1 4 2 3]
 [4 0 1 2 3]
 [0 2 1 3 4]]


In [20]:
def get_idx(connections, claims, tweets):
    """Join tweets to their linked claims and locate each gold claim.

    Args:
        connections: DataFrame with a ``tweet_id`` column (and presumably the
            ``claim_id`` column used for the second join).
        claims: DataFrame with ``vclaim_id`` and ``vclaim`` columns.
        tweets: DataFrame with ``id`` and ``tweet`` columns.

    Returns:
        ``(run_tweets, claim_idx)``: the joined frame reduced to the ``tweet`` and
        ``vclaim`` columns (plus the column added by ``reset_index``), and for
        every row the position of the first occurrence of its claim text in
        ``claims.vclaim``.
    """
    run_tweets = tweets.join(connections.set_index("tweet_id"), on="id", how="inner")
    run_tweets = run_tweets.join(claims.set_index("vclaim_id"), on="claim_id", how="inner")
    run_tweets = run_tweets[["tweet", "vclaim"]].reset_index()
    # Build the text -> first position lookup once instead of rebuilding and
    # scanning the full claim list for every tweet.
    first_position = {}
    for position, claim_text in enumerate(claims.vclaim.to_list()):
        first_position.setdefault(claim_text, position)
    claim_idx = [first_position[t_claim] for t_claim in run_tweets.vclaim.to_list()]
    return run_tweets, claim_idx

def avg_prec(gold, rankings, n):
    """Position-weighted hit score of ``gold`` within the top ``n`` of ``rankings``.

    Every position among the first ``n`` whose entry equals ``gold`` contributes
    ``1 / position`` (1-based). With a single occurrence of ``gold`` this is its
    reciprocal rank; the result is 0.0 when ``gold`` is not in the top ``n``.

    Args:
        gold: The relevant id.
        rankings: Ranked ids for one query (may hold fewer than ``n`` entries).
        n: Cut-off depth.
    """
    is_rel = (np.array(rankings)[:n] == gold).astype(float)
    # Size the positions from what was actually sliced, so a ranking shorter
    # than n does not raise a broadcasting error.
    positions = np.arange(1, len(is_rel) + 1)
    return (is_rel / positions).sum()

def recall(gold, rankings, n):
    """Count, as a float, how many of the first ``n`` entries of ``rankings`` equal ``gold``."""
    top_n = np.array(rankings)[:n]
    hits = top_n == gold
    return hits.sum(dtype=float)

def mean_avg_prec(golds, rankings, n):
    """Average ``avg_prec`` at depth ``n`` over aligned (gold, ranking) pairs."""
    per_query_scores = []
    for gold, ranked in zip(golds, rankings):
        per_query_scores.append(avg_prec(gold, ranked, n))
    return np.mean(np.array(per_query_scores))

def mean_recall(golds, rankings, n):
    """Average ``recall`` at depth ``n`` over aligned (gold, ranking) pairs."""
    per_query_recalls = []
    for gold, ranked in zip(golds, rankings):
        per_query_recalls.append(recall(gold, ranked, n))
    return np.mean(np.array(per_query_recalls))

def get_negative_ranks(ranks, gold):
    """Return the entries of ``ranks`` that differ from ``gold``, in their original order."""
    return [r for r in ranks if r!=gold]

def get_negative_ranks_arr(ranks, gold):
    """Remove each row's gold id from its ranking and stack the rows into an array.

    Args:
        ranks: Iterable of per-query rankings.
        gold: Iterable of gold ids, aligned with ``ranks``.

    Returns:
        ``np.array`` of the filtered rows. Rows only stack into a regular 2-D
        array when every row loses the same number of entries.
    """
    # Pair every row with the gold id that was passed in, so the result does
    # not depend on a notebook-level ``claim_idx`` variable.
    n_ranks = [get_negative_ranks(r, g) for r, g in zip(ranks, gold)]
    return np.array(n_ranks)

map_results = {}
map_recall_results = {}
save_ranks = False
# Cut-off depths at which both metrics are reported.
eval_depths = [1, 5]
for ptn in ["train", "dev"]:
    if ptn == "train":
        run_tweets, claim_idx = get_idx(train_conns, claims, tweets)
        # Reorder each row's candidate ids by the reranker's ordering.
        ranks = np.array([ids[rerank] for ids, rerank in zip(neg_ids, reranks)])
    elif ptn == "dev":
        run_tweets, claim_idx = get_idx(dev_conns, claims, tweets)
        ranks = np.array([ids[rerank] for ids, rerank in zip(dev_neg_ids, dev_reranks)])
    elif ptn == "test":
        # No reranked candidates exist for the test partition; fail loudly rather
        # than scoring it against the previous partition's stale `ranks`.
        raise NotImplementedError("no reranked candidates are available for the test partition")
    else:
        raise ValueError(f"unknown partition: {ptn!r}")

    if save_ranks:
        # NOTE(review): these files go to ./experiments/finetune_st5_large_claims_negs,
        # not to the candidate_selection directory the inputs were loaded from, and
        # "negative_embs_*" holds rank ids with the gold id removed, not embeddings
        # -- confirm that both are intended.
        np.save(f"./experiments/finetune_st5_large_claims_negs/negative_embs_{ptn}.npy",
                get_negative_ranks_arr(ranks, claim_idx))
        np.save(f"./experiments/finetune_st5_large_claims_negs/ranks_{ptn}.npy",
                np.array(ranks))

    map_results[ptn] = [mean_avg_prec(claim_idx, ranks, n) for n in eval_depths]
    map_recall_results[ptn] = [mean_recall(claim_idx, ranks, n) for n in eval_depths]

In [21]:
# First ten rows of reranked claim ids; `ranks` holds the last partition
# processed by the evaluation loop ("dev").
ranks[:10]

array([[ 1184,  9221, 13600,  2843,  6580],
       [ 9221,  4757,  5270,  3998,  3767],
       [ 7808, 12947,  4065,  6213,  6447],
       [ 4388,  1640,  5398, 13760,  8051],
       [ 1640, 13760,  6823,  5309,  8051],
       [11060, 10734,   874,  6134, 10195],
       [ 9380, 12609, 13229,  9611,   456],
       [ 3926, 10537,  1658,  6830,  4077],
       [ 9764,  1721,  9655,  6676,  1476],
       [  177,  1476,  9764,  9655,  3812]])

In [22]:
# Per partition: mean_avg_prec at depths 1 and 5.
map_results

{'train': [0.6916916916916916, 0.8087921254587921],
 'dev': [0.44, 0.6563333333333332]}

In [23]:
# Per partition: mean_recall at depths 1 and 5.
map_recall_results

{'train': [0.6916916916916916, 0.9629629629629629], 'dev': [0.44, 0.975]}

In [30]:
# NOTE(review): scales the constant .063 by the average number of dev samples
# per batch (dataset size / number of batches); the origin of .063 is not shown here.
.063 * len(dev_dl.dataset) / len(dev_dl) 

1.8

In [29]:
# Number of batches in the dev DataLoader.
len(dev_dl)

7

In [17]:
# Sum of the reciprocals 1/1 + 1/2 + ... + 1/5.
ns = list(range(1,6))
sum([1/n for n in ns])

2.283333333333333

In [18]:
# The positions 1..5 used for the reciprocal sum.
ns

[1, 2, 3, 4, 5]

In [20]:
# mult is the mean of 1/rank over ranks 1..5, i.e. the expected reciprocal rank
# when the gold claim sits at a uniformly random position among five candidates.
# NOTE(review): vals look like the rounded train/dev recall@5 figures (.963, .975),
# which would make the products a chance-level score for comparison -- confirm.
mult = sum([1/n for n in ns]) / 5
vals = [.963, .975]
[mult*v for v in vals]

[0.43977, 0.44525]

In [55]:
# Monte-Carlo check: fraction of m random permutations of k items that put
# element 0 in the first slot (expected value 1/k).
k = 5
m = 200
# Seeded generator so the estimate is reproducible across kernel restarts.
rng = np.random.default_rng(0)
A = np.stack([rng.permutation(k) for _ in range(m)], 0)
bA = (A == 0).astype(int)
print(bA[:,0].mean())

0.21


array([2, 1, 0, 3, 4])