In [1]:
import pandas as pd
import json
import os
import numpy as np
import torch
from dynamicquery import utils

In [2]:
# Project root: every relative "./experiments/..." path used in this notebook
# is resolved against it because the working directory is switched here.
# Set the DYNAMICQUERY_ROOT environment variable to run on another machine;
# when it is unset the original hard-coded location is used unchanged.
base_path = os.environ.get("DYNAMICQUERY_ROOT", "/home/mshlis/Projects/RIET/DynamicQuery")
os.chdir(base_path)

In [3]:
# Load tweets, tweet->claim links (qrels) and claims through the project helpers.
tweets, test_tweets_all = utils.get_tweets()
# The first test row is discarded; the reason is not visible in this notebook
# (NOTE(review): possibly a header/placeholder row -- confirm).
test_tweets = test_tweets_all[1:]
train_conns, dev_conns, test_conns = utils.get_qrels()
claims = utils.get_claims()

In [4]:
# Sanity check: ask PyTorch whether a CUDA device is usable (result is shown as the cell output).
torch.cuda.is_available()

True

In [6]:
from sentence_transformers import SentenceTransformer

# NOTE(review): CUDA_VISIBLE_DEVICES is only honoured if it is set before CUDA
# is first initialised in this process. The earlier `torch.cuda.is_available()`
# cell may already have done that, in which case this assignment has no effect
# -- consider moving it into the first configuration cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model_str = "sentence-transformers/sentence-t5-large"
ft_str = "./experiments/candidate_selection/finetune_st5_large_claims_negs/model.pt"

# Build the base architecture, then overwrite its weights with the fine-tuned ones.
model = SentenceTransformer(model_str)
# map_location="cpu" makes loading independent of the device the checkpoint was
# saved from; load_state_dict then copies the tensors into the model's own
# parameters on whatever device they live on.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load(ft_str, map_location="cpu"))

<All keys matched successfully>

In [8]:
# Switches read by the evaluation cell: which artefacts get written to disk.
save_ranks = False
save_tweet_embs = True

# Encode every claim text (the `vclaim` column) once; tweets are scored
# against this matrix later on.
claim_texts = claims.vclaim.to_list()
embs = model.encode(claim_texts)

In [9]:
def get_idx(connections, claims, tweets):
    """Pair every linked tweet with its claim text and the claim's row position.

    Args:
        connections: frame with `tweet_id` and `claim_id` columns (the qrels).
        claims: frame with `vclaim_id` and `vclaim` columns.
        tweets: frame with `id` and `tweet` columns.

    Returns:
        A tuple `(run_tweets, claim_idx)`:
        * `run_tweets` -- frame holding the `tweet` and `vclaim` columns of the
          tweets that have a link and a matching claim (inner joins); the
          pre-reset index is kept as an extra column by `reset_index()`.
        * `claim_idx` -- list with, for each row of `run_tweets`, the position
          of its claim text in `claims.vclaim` (first occurrence if the same
          text appears more than once).
    """
    run_tweets = tweets.join(connections.set_index("tweet_id"), on="id", how="inner")
    run_tweets = run_tweets.join(claims.set_index("vclaim_id"), on="claim_id", how="inner")
    run_tweets = run_tweets[["tweet", "vclaim"]].reset_index()

    # Build the text -> position table once instead of re-materialising the
    # claim list and scanning it linearly for every tweet. `setdefault` keeps
    # the first position for duplicated texts, matching `list.index`.
    first_pos = {}
    for pos, claim_text in enumerate(claims.vclaim.to_list()):
        first_pos.setdefault(claim_text, pos)

    claim_idx = [first_pos[t_claim] for t_claim in run_tweets.vclaim.to_list()]
    return run_tweets, claim_idx

def avg_prec(gold, rankings, n):
    """Precision-weighted hit score of `gold` within the top `n` of `rankings`.

    Each of the first `n` ranked entries equal to `gold` contributes
    `1 / position` (1-based). When `gold` occurs at most once in `rankings`
    this is its reciprocal rank, or 0 if it is not in the top `n`.

    Args:
        gold: the relevant item to look for.
        rankings: sequence of ranked items, best first.
        n: cutoff; only the first `n` entries are considered.

    Returns:
        The score as a NumPy float.
    """
    is_rel = (np.asarray(rankings)[:n] == gold).astype(float)
    # Size the denominator from the slice actually obtained, so a ranking
    # shorter than `n` is scored instead of raising a broadcasting error.
    positions = np.arange(1, is_rel.size + 1)
    return (is_rel / positions).sum()

def recall(gold, rankings, n):
    """Count how many of the first `n` entries of `rankings` equal `gold`.

    Returns the count as a NumPy float (1.0 / 0.0 when `gold` occurs at most
    once in `rankings`).
    """
    head = np.asarray(rankings)[:n]
    return (head == gold).sum(dtype=float)

def mean_avg_prec(golds, rankings, n):
    """Mean of `avg_prec` at cutoff `n` over paired (gold, ranking) queries.

    `golds` and `rankings` are walked in lockstep with `zip`, so the shorter
    of the two decides how many queries are scored.
    """
    per_query = []
    for gold, ranked in zip(golds, rankings):
        per_query.append(avg_prec(gold, ranked, n))
    return np.mean(per_query)

def mean_recall(golds, rankings, n):
    """Mean of `recall` at cutoff `n` over paired (gold, ranking) queries.

    `golds` and `rankings` are walked in lockstep with `zip`, so the shorter
    of the two decides how many queries are scored.
    """
    per_query = []
    for gold, ranked in zip(golds, rankings):
        per_query.append(recall(gold, ranked, n))
    return np.mean(per_query)

def get_negative_ranks(ranks, gold):
    """Return the entries of `ranks`, in order, with every one equal to `gold` dropped."""
    return [r for r in ranks if r != gold]

def get_negative_ranks_arr(ranks, gold):
    """Strip each query's gold item from its ranking and stack the results.

    Args:
        ranks: iterable of rankings, one per query.
        gold: iterable of gold items, aligned with `ranks`.

    Returns:
        `np.array` built from the per-query negative lists. The result is a
        regular 2-D array only if every filtered row has the same length
        (e.g. each ranking contains its gold item exactly once).
    """
    # Pair each ranking with the caller-supplied gold item. The previous code
    # ignored `gold` and read the global `claim_idx`, so it silently depended
    # on whatever the notebook had last assigned to that name.
    n_ranks = [get_negative_ranks(r, g) for r, g in zip(ranks, gold)]
    return np.array(n_ranks)

# Per-partition outputs, keyed by "train" / "dev" / "test".
all_tweet_embs = {}
map_results = {}
map_recall_results = {}

# Each partition pairs its tweet->claim links with the tweet frame they refer to.
partition_inputs = {
    "train": (train_conns, tweets),
    "dev": (dev_conns, tweets),
    "test": (test_conns, test_tweets),
}

for ptn, (ptn_conns, ptn_tweets) in partition_inputs.items():
    # NOTE: run_tweets / claim_idx / ranks remain bound after the loop (to the
    # last partition, "test") and later cells read them.
    run_tweets, claim_idx = get_idx(ptn_conns, claims, ptn_tweets)

    tweet_embs = model.encode(run_tweets.tweet.to_list())
    all_tweet_embs[ptn] = tweet_embs

    # Score every tweet against every claim embedding, then order the claim
    # positions from highest to lowest score for each tweet.
    scores = tweet_embs @ embs.T
    ranks = [score.argsort()[::-1] for score in scores]

    if save_ranks:
        np.save(f"./experiments/candidate_selection/finetune_st5_large_claims_negs/negative_embs_{ptn}.npy",
                get_negative_ranks_arr(ranks, claim_idx))
        np.save(f"./experiments/candidate_selection/finetune_st5_large_claims_negs/ranks_{ptn}.npy",
                np.array(ranks))

    # One entry per cutoff, in the order 1, 5, 10, 20.
    map_results[ptn] = []
    map_recall_results[ptn] = []
    for n in [1,5,10,20]:
        map_results[ptn].append(mean_avg_prec(claim_idx, ranks, n))
        map_recall_results[ptn].append(mean_recall(claim_idx, ranks, n))

if save_tweet_embs:
    # all_tweet_embs is a dict, so np.save stores it as a pickled object array;
    # reading it back needs np.load(..., allow_pickle=True).item().
    np.save("./experiments/candidate_selection/finetune_st5_large_claims_negs/tweet_embs.npy", all_tweet_embs)

In [7]:
# Show the mean_avg_prec scores per partition; each list holds the values at cutoffs 1, 5, 10, 20.
map_results

{'train': [0.8848848848848849,
  0.9181181181181182,
  0.9199461366128032,
  0.92073154938652],
 'dev': [0.95, 0.9616666666666667, 0.9640476190476189, 0.9643809523809523],
 'test': [0.905940594059406,
  0.9364686468646864,
  0.9378830740216878,
  0.9385550869243355]}

In [8]:
# Show the mean_recall scores per partition; each list holds the values at cutoffs 1, 5, 10, 20.
map_recall_results

{'train': [0.8848848848848849,
  0.9629629629629629,
  0.975975975975976,
  0.986986986986987],
 'dev': [0.95, 0.975, 0.99, 0.995],
 'test': [0.905940594059406,
  0.9752475247524752,
  0.9851485148514851,
  0.995049504950495]}

In [8]:
# NOTE: these two helpers are also defined in an earlier cell of this notebook;
# whichever cell ran last wins. Keep the copies in sync or delete one of them.
def get_negative_ranks(ranks, gold):
    """Return the entries of `ranks`, in order, with every one equal to `gold` dropped."""
    return [r for r in ranks if r != gold]

def get_negative_ranks_arr(ranks, gold):
    """Strip each query's gold item from its ranking and stack the results.

    Args:
        ranks: iterable of rankings, one per query.
        gold: iterable of gold items, aligned with `ranks`.

    Returns:
        `np.array` built from the per-query negative lists. The result is a
        regular 2-D array only if every filtered row has the same length
        (e.g. each ranking contains its gold item exactly once).
    """
    # Use the `gold` argument rather than the global `claim_idx`, which the
    # previous code read regardless of what the caller passed in.
    n_ranks = [get_negative_ranks(r, g) for r, g in zip(ranks, gold)]
    return np.array(n_ranks)

In [13]:
# top_k is assigned here but not referenced in any cell visible in this notebook.
top_k = 10
# Hidden state: `ranks` and `claim_idx` are whatever the evaluation loop left
# behind (its last partition), so this cell only works after that loop has run.
n_ranks = [
    get_negative_ranks(rank_row, gold_idx)
    for rank_row, gold_idx in zip(ranks, claim_idx)
]

In [14]:
# Sanity check: gold indices and rankings should have one entry per tweet each
# (both names are left over from the evaluation loop).
len(claim_idx), len(ranks)

(202, 202)

In [21]:
# Shape check of the stacked negatives: rows are tweets, columns are the
# rankings with the gold claim removed.
np.array(n_ranks).shape

(202, 13824)

In [20]:
# Persist the claim embedding matrix.
# NOTE(review): every other path in this notebook lives under
# "./experiments/candidate_selection/..."; this one omits that segment --
# confirm the target directory is intentional.
claim_embs_path = "./experiments/finetune_st5_large_claims_negs/claim_embs.npy"
np.save(claim_embs_path, embs)