In [1]:
import os
os.chdir("..")

import utils
import utils
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch

In [2]:
class Clef2021PretrainingDataset(TensorDataset):
    """Dataset pairing each tokenized claim with one positive and several random negative embeddings.

    Item ``i`` is ``((input_ids, attention_mask), embs)`` where ``embs`` has
    ``n_negatives + 1`` rows: row 0 is ``claim_embeddings[i]`` (the positive) and
    the remaining rows are sampled uniformly at random, with replacement, from
    the *other* rows of ``claim_embeddings``.

    Args:
        encode_fn: Callable applied to every value of ``claims.vclaim``. Its
            result must support ``["input_ids"]`` and ``["attention_mask"]``.
        claims: DataFrame with a ``vclaim`` column. It is copied, so the
            caller's frame is left untouched.
        claim_embeddings: Array-like indexable by an integer and by an integer
            index array; row ``i`` is treated as the embedding of the i-th
            claim (by position).
        n_negatives: Number of negative embeddings returned per item.

    Note:
        ``TensorDataset.__init__`` is not called; no tensors are registered and
        ``__len__`` / ``__getitem__`` are overridden instead.
    """

    def __init__(self, 
                 encode_fn, 
                 claims,
                 claim_embeddings,
                 n_negatives=5):
        self.n_negatives = n_negatives
        self.claim_embeddings = claim_embeddings
        # Work on a copy so the tokenized column is not added to the caller's frame.
        self.claims = claims.copy()
        self.claims["encoded_vclaim"] = self.claims.vclaim.apply(encode_fn)
        
    def __len__(self):
        """Return the number of claims."""
        return len(self.claims)
    
    def __getitem__(self, index):
        """Return ``((input_ids, attention_mask), embs)`` for the claim at position ``index``."""
        n_total = len(self.claim_embeddings)
        if n_total > 1:
            # Sample from the n_total - 1 rows that are not the positive: draw in
            # [0, n_total - 1) and shift every draw at or past the positive by one.
            positive_pos = index if index >= 0 else index + n_total
            negative_idx = np.random.randint(n_total - 1, size=(self.n_negatives,))
            negative_idx[negative_idx >= positive_pos] += 1
        else:
            # Nothing else to sample from; fall back to the plain draw.
            negative_idx = np.random.randint(n_total, size=(self.n_negatives,))
        negative_embs = self.claim_embeddings[negative_idx]
        positive_emb = self.claim_embeddings[index]
        embs = np.concatenate([np.expand_dims(positive_emb, 0), negative_embs], 0)

        # Positional lookup: the DataLoader hands out positions 0..len-1, which
        # need not coincide with the DataFrame's index labels.
        positive_tokens = self.claims["encoded_vclaim"].iloc[index]
        positive_inpt = (np.array(positive_tokens["input_ids"]), np.array(positive_tokens["attention_mask"]))

        return (positive_inpt, embs)
    
    
def get_clef2021_pretraining_dataloader(encode_fn, 
                            claims, 
                            claim_embeddings,
                            n_negatives=5,
                            params={'batch_size':32, 'shuffle':True}):
    """Build a ``DataLoader`` over a ``Clef2021PretrainingDataset``.

    Args:
        encode_fn: Tokenizing callable forwarded to the dataset.
        claims: Claims DataFrame forwarded to the dataset.
        claim_embeddings: Embedding array forwarded to the dataset.
        n_negatives: Number of random negatives per item.
        params: Keyword arguments expanded into ``DataLoader``. The default
            dict is only read here, never modified.

    Returns:
        A ``DataLoader`` wrapping the freshly built dataset.
    """
    pretraining_data = Clef2021PretrainingDataset(
        encode_fn,
        claims,
        claim_embeddings,
        n_negatives=n_negatives,
    )
    loader = DataLoader(pretraining_data, **params)
    return loader

In [3]:
# Project root. Defaults to the original checkout location, but can be pointed
# elsewhere by setting the DYNAMICQUERY_BASE_PATH environment variable so the
# notebook is runnable on other machines.
base_path = os.environ.get("DYNAMICQUERY_BASE_PATH", "/home/mshlis/Projects/RIET/DynamicQuery")
# Directory holding the arrays saved by the "finetune_st5_large_claims_negs" experiment.
exp_path = os.path.join(base_path, "experiments/finetune_st5_large_claims_negs")
train_neg_path = os.path.join(exp_path, "negative_embs_train.npy")
dev_neg_path = os.path.join(exp_path, "negative_embs_dev.npy")
emb_path = os.path.join(exp_path, "claim_embs.npy")

In [4]:
import importlib
from functools import partial

import extended_roberta as roberta
from transformers import AutoTokenizer

# Re-import the local module so edits made on disk are picked up without a kernel restart.
importlib.reload(roberta)

MAX_LENGTH = 192
model_str = "roberta-base"

# Model and tokenizer are both loaded from the same checkpoint name.
model = roberta.ExtendedRobertaForExternalClassification.from_pretrained(model_str)
tokenizer = AutoTokenizer.from_pretrained(model_str)

# Tokenizer call with fixed settings: truncate and pad every input to
# MAX_LENGTH and always return the attention mask.
tokenize = partial(
    tokenizer,
    truncation=True,
    max_length=MAX_LENGTH,
    padding="max_length",
    return_attention_mask=True,
)

Some weights of the model checkpoint at roberta-base were not used when initializing ExtendedRobertaForExternalClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing ExtendedRobertaForExternalClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ExtendedRobertaForExternalClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ExtendedRobertaForExternalClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.adapter_layer.1.weight', 'roberta.encoder.adapte

In [5]:
# Load the three saved arrays (train negatives, dev negatives, claim embeddings) in one pass.
neg_ids, dev_neg_ids, neg_embs = (
    np.load(array_path)
    for array_path in (train_neg_path, dev_neg_path, emb_path)
)

In [6]:
# Sanity check: display the shapes of the loaded train-negatives and embedding arrays.
neg_ids.shape, neg_embs.shape

((999, 13824), (13825, 768))

In [7]:
# Switch the working directory to the project root before calling the utils
# loaders (presumably they resolve data paths relative to the cwd — confirm).
os.chdir(base_path)

# Claim Data
tweets, test_tweets = utils.get_tweets()
# Drops the first element of test_tweets; the reason is not evident here — TODO confirm.
test_tweets = test_tweets[1:]
train_conns, dev_conns, test_conns = utils.get_qrels()
claims = utils.get_claims()

In [8]:
BATCH_SIZE = 32

# Both loaders are built from the same claims and embeddings with 5 random
# negatives per item; only the shuffle flag differs (train shuffles, dev does not).
# The precomputed negative ids (neg_ids / dev_neg_ids) are currently not passed in.
train_dl, dev_dl = (
    get_clef2021_pretraining_dataloader(
        tokenize,
        claims,
        neg_embs,
        n_negatives=5,
        params={'batch_size': BATCH_SIZE, 'shuffle': shuffle_flag},
    )
    for shuffle_flag in (True, False)
)

In [9]:
import torch.optim as optim
import train
# Reload so that edits to the local train module take effect without restarting the kernel.
importlib.reload(train)

<module 'train' from '/home/mshlis/Projects/RIET/DynamicQuery/src/dynamicquery/cross_query/train.py'>

In [10]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [11]:
# Show which device was selected.
print(device)

cuda:0


In [12]:
# AdamW over every model parameter, learning rate 1e-4 (used for the first training call).
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

In [13]:
# First training call: adapters_only=True, cls_train=True, progress logged every 5 steps.
# NOTE(review): adapters_only presumably restricts updates to the adapter layers even
# though the optimizer was given all parameters — confirm in train.train.
EPOCHS = 15
train.train(
    model, 
    optimizer, 
    device,
    train_dl,
    dev_dl,
    epochs=EPOCHS,
    print_steps=5,
    adapters_only=True, 
    cls_train=True,
    save_path="./pretraining_saved_model_adaps.pt"
)

TRAIN [1,     5] loss: 1.799
TRAIN [1,    10] loss: 1.803
TRAIN [1,    15] loss: 1.799
TRAIN [1,    20] loss: 1.813
TRAIN [1,    25] loss: 1.798
TRAIN [1,    30] loss: 1.791
TRAIN [1,    35] loss: 1.796
TRAIN [1,    40] loss: 1.795
TRAIN [1,    45] loss: 1.788
TRAIN [1,    50] loss: 1.783
TRAIN [1,    55] loss: 1.794
TRAIN [1,    60] loss: 1.797
TRAIN [1,    65] loss: 1.806
TRAIN [1,    70] loss: 1.787
TRAIN [1,    75] loss: 1.792
TRAIN [1,    80] loss: 1.801
TRAIN [1,    85] loss: 1.788
TRAIN [1,    90] loss: 1.809
TRAIN [1,    95] loss: 1.793
TRAIN [1,   100] loss: 1.784
TRAIN [1,   105] loss: 1.785
TRAIN [1,   110] loss: 1.796
TRAIN [1,   115] loss: 1.795
TRAIN [1,   120] loss: 1.791
TRAIN [1,   125] loss: 1.792
TRAIN [1,   130] loss: 1.785
TRAIN [1,   135] loss: 1.788
TRAIN [1,   140] loss: 1.786
TRAIN [1,   145] loss: 1.798
TRAIN [1,   150] loss: 1.808
TRAIN [1,   155] loss: 1.793
TRAIN [1,   160] loss: 1.797
TRAIN [1,   165] loss: 1.806
TRAIN [1,   170] loss: 1.791
TRAIN [1,   17

KeyboardInterrupt: 

In [14]:
# Persist the current model weights under the project tree.
# NOTE(review): the recorded run above was interrupted (KeyboardInterrupt) during
# epoch 1, so the "10ep" in the filename may not match the training actually done — confirm.
torch.save(model.state_dict(), os.path.join(base_path, "src/dynamicquery/cross_query/pretrain_model_10ep.pt"))

In [13]:
# Fresh AdamW over every model parameter with a smaller learning rate (1e-5) for the second training call.
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

In [14]:
# Second training call: adapters_only=False, cls_train=True, progress logged every 5 steps.
# NOTE(review): epochs is passed as EPOCHS + 1 (i.e. 21) — confirm the extra epoch is intended.
EPOCHS = 20
train.train(
    model, 
    optimizer, 
    device,
    train_dl,
    dev_dl,
    epochs=EPOCHS+1,
    print_steps=5,
    adapters_only=False, 
    cls_train=True,
    save_path="./cross_query/test_saved_model_5.pt"
)

TRAIN [1,     5] loss: 1.621
TRAIN [1,    10] loss: 1.728
TRAIN [1,    15] loss: 1.715
TRAIN [1,    20] loss: 1.714
TRAIN [1,    25] loss: 1.688
TRAIN [1,    30] loss: 1.617
DEV [1,     7] loss: 1.831
TRAIN [2,     5] loss: 1.549
TRAIN [2,    10] loss: 1.360
TRAIN [2,    15] loss: 1.369
TRAIN [2,    20] loss: 1.268
TRAIN [2,    25] loss: 1.354
TRAIN [2,    30] loss: 1.333
DEV [2,     7] loss: 2.165
TRAIN [3,     5] loss: 1.127
TRAIN [3,    10] loss: 1.445
TRAIN [3,    15] loss: 1.388
TRAIN [3,    20] loss: 1.381
TRAIN [3,    25] loss: 1.115
TRAIN [3,    30] loss: 1.270
DEV [3,     7] loss: 1.899
TRAIN [4,     5] loss: 1.196
TRAIN [4,    10] loss: 1.154
TRAIN [4,    15] loss: 1.164
TRAIN [4,    20] loss: 1.131
TRAIN [4,    25] loss: 1.118
TRAIN [4,    30] loss: 1.100
DEV [4,     7] loss: 1.844
TRAIN [5,     5] loss: 1.384
TRAIN [5,    10] loss: 1.442
TRAIN [5,    15] loss: 1.341
TRAIN [5,    20] loss: 1.253
TRAIN [5,    25] loss: 1.075
TRAIN [5,    30] loss: 1.093
DEV [5,     7] loss: 2

In [16]:
# Write the current model weights to the same path that was passed as save_path
# to the preceding train.train call, replacing whatever is stored there.
torch.save(model.state_dict(), "./cross_query/test_saved_model_5.pt")

In [20]:
# Scratch experiment: two distinguishable 2x2 tensors, used to check how
# torch.save behaves when writing to the same file twice.
x1, x2 = torch.zeros((2, 2)), torch.ones((2, 2))

# First write: the all-zeros tensor.
torch.save(x1, "temp-delete.pt")

In [21]:
# Read the scratch file back and display it; at this point it holds the all-zeros tensor.
x = torch.load("temp-delete.pt")
x

tensor([[0., 0.],
        [0., 0.]])

In [22]:
# Save the all-ones tensor to the same filename and reload it: the displayed
# result shows that torch.save replaced the earlier contents.
torch.save(x2, "temp-delete.pt")
x = torch.load("temp-delete.pt")
x

tensor([[1., 1.],
        [1., 1.]])

In [22]:
# List every attribute of the model whose name contains "mode" (case-insensitive).
[attr_name for attr_name in dir(model) if "mode" in attr_name.lower()]

['_load_state_dict_into_model',
 '_load_state_dict_into_model_low_mem',
 '_prepare_model_inputs',
 '_update_model_kwargs_for_generation',
 'base_model',
 'base_model_prefix']