In [2]:
import os
# NOTE: the target is relative, so re-running this cell moves the working
# directory one more level up each time; the "./..." paths used later in the
# notebook resolve against whatever directory this leaves us in.
os.chdir("..")

import utils
import utils
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch

In [3]:
class Clef2021RerankedDataset(TensorDataset):
    """Pairs every encoded tweet with its gold claim embedding and ranked candidates.

    Tweets are inner-joined with ``connections`` (``tweets.id`` against
    ``connections.tweet_id``) and then with ``claims`` (``claim_id`` against
    ``claims.vclaim_id``), so only tweets linked to a known claim are kept. A
    tweet linked to several claims yields one sample per link.

    Args:
        encode_fn: Callable applied to each tweet text. Its result must support
            ``["input_ids"]`` and ``["attention_mask"]`` lookups.
        claims: DataFrame with ``vclaim_id`` and ``vclaim`` columns.
        tweets: DataFrame with ``id`` and ``tweet`` columns.
        connections: DataFrame with ``tweet_id`` and ``claim_id`` columns.
        claim_embeddings: Array indexed by claim position. Assumed to be
            aligned row-for-row with ``claims`` -- TODO confirm with the code
            that produced the embeddings.
        ranks: Indexed by sample position; ``ranks[i]`` selects the rows of
            ``claim_embeddings`` used as candidates for sample ``i``. The
            caller is responsible for this alignment.

    Note:
        ``TensorDataset`` stays the base class for compatibility, but no
        tensors are handed to it; ``__len__`` and ``__getitem__`` are
        overridden here.
    """

    def __init__(self,
                 encode_fn,
                 claims,
                 tweets,
                 connections,
                 claim_embeddings,
                 ranks):
        # Initialise the base class (with no tensors) so inherited state such
        # as ``self.tensors`` exists instead of being silently missing.
        super().__init__()
        self.claim_embeddings = claim_embeddings
        self.ranks = ranks

        run_tweets = tweets.join(connections.set_index("tweet_id"), on="id", how="inner")
        run_tweets = run_tweets.join(claims.set_index("vclaim_id"), on="claim_id", how="inner")
        run_tweets = run_tweets[["tweet", "vclaim"]].reset_index()
        run_tweets["encoded_tweet"] = run_tweets.tweet.apply(encode_fn)

        # Build the text -> position table once. ``setdefault`` keeps the first
        # occurrence, which is exactly what ``list.index`` returned before, but
        # without rebuilding and scanning the claim list for every sample.
        first_position = {}
        for position, claim_text in enumerate(claims.vclaim.to_list()):
            first_position.setdefault(claim_text, position)
        self.claim_idx = [first_position[claim_text]
                          for claim_text in run_tweets.vclaim.to_list()]
        self.data = run_tweets

    def __len__(self):
        """Return the number of (tweet, claim) samples left after the joins."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``((input_ids, attention_mask), embeddings)`` for one sample.

        ``embeddings`` stacks the gold claim embedding first, followed by the
        embeddings selected by ``ranks[index]``.
        """
        Xt = self.data.encoded_tweet[index]
        Xt = (np.array(Xt["input_ids"]), np.array(Xt["attention_mask"]))
        Xe = self.claim_embeddings[self.ranks[index]]
        # Slice (not scalar index) so the gold embedding keeps a leading axis
        # and can be concatenated with the candidate block.
        Ye = self.claim_embeddings[self.claim_idx[index:index+1]]
        return (Xt, np.concatenate([Ye, Xe], axis=0))
    
    
def get_clef2021_reranked_dataloader(encode_fn,
                                     claims,
                                     tweets,
                                     connections,
                                     claim_embeddings,
                                     ranks,
                                     params={'batch_size':32, 'shuffle':True}):
    """Create a ``Clef2021RerankedDataset`` and wrap it in a ``DataLoader``.

    The first six arguments are forwarded unchanged to the dataset
    constructor; ``params`` is unpacked as keyword arguments for the
    ``DataLoader``. The default ``params`` dict is only read, never modified.
    """
    reranked = Clef2021RerankedDataset(
        encode_fn=encode_fn,
        claims=claims,
        tweets=tweets,
        connections=connections,
        claim_embeddings=claim_embeddings,
        ranks=ranks,
    )
    loader = DataLoader(reranked, **params)
    return loader

In [4]:
# Files produced by the experiment run; each one sits directly under exp_path.
exp_path = "./experiments/finetune_st5_large_claims_negs"
train_neg_path, dev_neg_path, emb_path = (
    os.path.join(exp_path, file_name)
    for file_name in (
        "negative_embs_train.npy",
        "negative_embs_dev.npy",
        "claim_embs.npy",
    )
)

In [5]:
import extended_roberta_v2 as roberta
from transformers import AutoTokenizer
from functools import partial
import importlib
# Re-import the local module so edits made since the first import take effect.
importlib.reload(roberta)

MAX_LENGTH = 192

model_str = "roberta-base"
model = roberta.ExtendedRobertaForExternalClassification.from_pretrained(model_str)
tokenizer = AutoTokenizer.from_pretrained(model_str)

# Tokenizer pre-bound to fixed-length output: every text is truncated or
# padded to MAX_LENGTH tokens and an attention mask is returned with it.
tokenize = partial(
    tokenizer,
    truncation=True,
    max_length=MAX_LENGTH,
    padding="max_length",
    return_attention_mask=True,
)

Some weights of the model checkpoint at roberta-base were not used when initializing ExtendedRobertaForExternalClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing ExtendedRobertaForExternalClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ExtendedRobertaForExternalClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ExtendedRobertaForExternalClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.adapter_layer.4.bias', 'roberta.encoder.adapter_

In [7]:
# neg_ids / dev_neg_ids are later sliced and passed as `ranks`; neg_embs is
# passed as `claim_embeddings`.
# NOTE(review): the *_neg_path files are named "negative_embs_*" but are used
# as index arrays downstream -- confirm the file contents.
neg_ids = np.load(file=train_neg_path)
dev_neg_ids = np.load(file=dev_neg_path)
neg_embs = np.load(file=emb_path)

FileNotFoundError: [Errno 2] No such file or directory: './experiments/finetune_st5_large_claims_negs/negative_embs_train.npy'

In [1]:
# Quick look at the shapes of the loaded arrays.
tuple(loaded.shape for loaded in (neg_ids, neg_embs))

NameError: name 'neg_ids' is not defined

In [7]:
# Claim Data
# Loaded through the project-local `utils` helpers: tweets, the tweet/claim
# connection tables (later passed as `connections`) and the claim collection.
tweets, test_tweets = utils.get_tweets()
# Drops the first element of test_tweets.
# NOTE(review): the reason is not visible in this notebook (stray header row?) -- confirm.
test_tweets = test_tweets[1:]
train_conns, dev_conns, test_conns = utils.get_qrels()
claims = utils.get_claims()

In [10]:
BATCH_SIZE = 32

# Training loader: candidates come from the first five columns of neg_ids and
# the sample order is shuffled.
train_dl = get_clef2021_reranked_dataloader(
    encode_fn=tokenize,
    claims=claims,
    tweets=tweets,
    connections=train_conns,
    claim_embeddings=neg_embs,
    ranks=neg_ids[:, :5],
    params=dict(batch_size=BATCH_SIZE, shuffle=True),
)

# Dev loader: same construction with the dev connections and dev candidate
# ids, kept in a fixed order.
dev_dl = get_clef2021_reranked_dataloader(
    encode_fn=tokenize,
    claims=claims,
    tweets=tweets,
    connections=dev_conns,
    claim_embeddings=neg_embs,
    ranks=dev_neg_ids[:, :5],
    params=dict(batch_size=BATCH_SIZE, shuffle=False),
)

In [16]:
# Smoke test: run exactly one training batch through the model and keep the
# output in `res`.
for inputs, external_inputs in train_dl:
    inpt_dict = dict(
        input_ids=inputs[0],
        # The mask is widened on the left with ones, one slot per external
        # embedding row, followed by the tokenizer's own mask.
        attention_mask=torch.cat(
            [torch.ones(external_inputs.shape[:2]).long(), inputs[1]],
            dim=1,
        ),
        extended_states=external_inputs,
    )
    res = model(**inpt_dict)
    break

torch.Size([32, 1, 1, 198])


In [18]:
# Loss of the single batch run above. Each sample carries 6 embeddings
# (1 gold + 5 ranked candidates); if this is a cross-entropy over those 6,
# a uniform guess would give ln(6) ~= 1.792 -- NOTE(review): confirm in the model code.
res.loss

tensor(1.7920, grad_fn=<NllLossBackward0>)

In [11]:
import torch.optim as optim
import train
importlib.reload(train)

ModuleNotFoundError: No module named 'train'

In [10]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [11]:
# AdamW over every model parameter, learning rate 1e-4.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)

In [12]:
# Training run through the project-local `train` module.
# NOTE(review): `adapters_only=True` presumably limits updates to the adapter
# layers and `cls_train=True` presumably enables the classification objective;
# both are defined in train.train, which is not visible here -- confirm there.
EPOCHS = 15
train.train(
    model, 
    optimizer, 
    device,
    train_dl,
    dev_dl,
    epochs=EPOCHS,
    print_steps=5,
    adapters_only=True, 
    cls_train=True,
    save_path="./cross_query/test_saved_model_adaps_5.pt"
)

TRAIN [1,     5] loss: 1.792
TRAIN [1,    10] loss: 1.792
TRAIN [1,    15] loss: 1.791
TRAIN [1,    20] loss: 1.791
TRAIN [1,    25] loss: 1.791
TRAIN [1,    30] loss: 1.791
DEV [1,     7] loss: 1.791
TRAIN [2,     5] loss: 1.788
TRAIN [2,    10] loss: 1.788
TRAIN [2,    15] loss: 1.787
TRAIN [2,    20] loss: 1.785
TRAIN [2,    25] loss: 1.785
TRAIN [2,    30] loss: 1.784
DEV [2,     7] loss: 1.789
TRAIN [3,     5] loss: 1.778
TRAIN [3,    10] loss: 1.773
TRAIN [3,    15] loss: 1.769
TRAIN [3,    20] loss: 1.759
TRAIN [3,    25] loss: 1.751
TRAIN [3,    30] loss: 1.746
DEV [3,     7] loss: 1.785
TRAIN [4,     5] loss: 1.716
TRAIN [4,    10] loss: 1.687
TRAIN [4,    15] loss: 1.673
TRAIN [4,    20] loss: 1.683
TRAIN [4,    25] loss: 1.707
TRAIN [4,    30] loss: 1.673
DEV [4,     7] loss: 1.821
TRAIN [5,     5] loss: 1.605
TRAIN [5,    10] loss: 1.603
TRAIN [5,    15] loss: 1.582
TRAIN [5,    20] loss: 1.573
TRAIN [5,    25] loss: 1.631
TRAIN [5,    30] loss: 1.621
DEV [5,     7] loss: 1

In [13]:
# Fresh AdamW over every model parameter, now with learning rate 1e-5.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

In [14]:
# Second training run through the project-local `train` module, this time
# with `adapters_only=False`.
# NOTE(review): `epochs` is EPOCHS+1 (21) here, whereas the earlier call passed
# EPOCHS directly -- confirm the extra epoch is intentional.
# NOTE(review): `save_path` is the same file that a later cell writes with
# torch.save, so one of the two writes replaces the other.
EPOCHS = 20
train.train(
    model, 
    optimizer, 
    device,
    train_dl,
    dev_dl,
    epochs=EPOCHS+1,
    print_steps=5,
    adapters_only=False, 
    cls_train=True,
    save_path="./cross_query/test_saved_model_5.pt"
)

TRAIN [1,     5] loss: 1.621
TRAIN [1,    10] loss: 1.728
TRAIN [1,    15] loss: 1.715
TRAIN [1,    20] loss: 1.714
TRAIN [1,    25] loss: 1.688
TRAIN [1,    30] loss: 1.617
DEV [1,     7] loss: 1.831
TRAIN [2,     5] loss: 1.549
TRAIN [2,    10] loss: 1.360
TRAIN [2,    15] loss: 1.369
TRAIN [2,    20] loss: 1.268
TRAIN [2,    25] loss: 1.354
TRAIN [2,    30] loss: 1.333
DEV [2,     7] loss: 2.165
TRAIN [3,     5] loss: 1.127
TRAIN [3,    10] loss: 1.445
TRAIN [3,    15] loss: 1.388
TRAIN [3,    20] loss: 1.381
TRAIN [3,    25] loss: 1.115
TRAIN [3,    30] loss: 1.270
DEV [3,     7] loss: 1.899
TRAIN [4,     5] loss: 1.196
TRAIN [4,    10] loss: 1.154
TRAIN [4,    15] loss: 1.164
TRAIN [4,    20] loss: 1.131
TRAIN [4,    25] loss: 1.118
TRAIN [4,    30] loss: 1.100
DEV [4,     7] loss: 1.844
TRAIN [5,     5] loss: 1.384
TRAIN [5,    10] loss: 1.442
TRAIN [5,    15] loss: 1.341
TRAIN [5,    20] loss: 1.253
TRAIN [5,    25] loss: 1.075
TRAIN [5,    30] loss: 1.093
DEV [5,     7] loss: 2

In [16]:
# Persist the current weights (state dict only, not the whole module object).
torch.save(obj=model.state_dict(), f="./cross_query/test_saved_model_5.pt")

In [20]:
# Scratch tensors for checking how torch.save treats an already existing file.
x1 = torch.zeros(2, 2)
x2 = torch.ones(2, 2)

torch.save(obj=x1, f="temp-delete.pt")

In [21]:
# Read the scratch file back and display what is stored in it.
x = torch.load(f="temp-delete.pt")
x

tensor([[0., 0.],
        [0., 0.]])

In [22]:
# Save a different tensor to the same path, then reload it to see which
# contents the file holds afterwards.
torch.save(obj=x2, f="temp-delete.pt")
x = torch.load(f="temp-delete.pt")
x

tensor([[1., 1.],
        [1., 1.]])

In [22]:
# Attribute names of the model that contain "mode", case-insensitively.
[attr_name for attr_name in dir(model) if "mode" in attr_name.lower()]

['_load_state_dict_into_model',
 '_load_state_dict_into_model_low_mem',
 '_prepare_model_inputs',
 '_update_model_kwargs_for_generation',
 'base_model',
 'base_model_prefix']