In [0]:
import torch, os

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
torch.cuda.empty_cache()

In [0]:
PATH = '/content/gdrive/My Drive/Colab Notebooks/NLP/CommonsenseQA'
import sys
sys.path.append(PATH)

In [0]:
model = torch.load(os.path.join(PATH, 'models', 'gpt-20190227015123.pt'))

In [0]:
model_state_dict = model['model_state_dict']

In [0]:
from finetuning.gpt import *
parser = init_parser(PATH)
args = parser.parse_args('')

In [0]:
files = ['train_rand_split.jsonl', 'dev_rand_split.jsonl', 'test_rand_split_no_answers.jsonl']
files = [os.path.join(PATH,'data', i) for i in files]

In [23]:
n_ctx = args.n_ctx  # max sentence length

encoder_path = os.path.join(PATH, 'gpt/model/encoder_bpe_40000.json')
bpe_path = os.path.join(PATH, 'gpt/model/vocab_40000.bpe')

text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

# assign new tokens
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']

n_special = 3
max_len = n_ctx // 2 - 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)

data_sets = [read_data(f) for f in files]

((trQ, trX1, trX2, trX3, trY),
 (vaQ, vaX1, vaX2, vaX3, vaY),
 (teQ, teX1, teX2, teX3)) = encode_dataset(*data_sets, encoder=text_encoder)

n_ctx = min(max(
    [len(q[:max_len]) + max(len(x1[:max_len]),
                            len(x2[:max_len]),
                            len(x3[:max_len])) for q, x1, x2, x3 in zip(trQ, trX1, trX2, trX3)]
    + [len(q[:max_len]) + max(len(x1[:max_len]),
                              len(x2[:max_len]),
                              len(x3[:max_len])) for q, x1, x2, x3 in zip(vaQ, vaX1, vaX2, vaX3)]
    + [len(q[:max_len]) + max(len(x1[:max_len]),
                              len(x2[:max_len]),
                              len(x3[:max_len])) for q, x1, x2, x3 in zip(teQ, teX1, teX2, teX3)]
) + 3, n_ctx)

vocab = n_vocab + n_special + n_ctx

def transform_qa(Q, X1, X2, X3):
    n_batch = len(Q)
    xmb = np.zeros((n_batch, 3, n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, 3, n_ctx), dtype=np.float32)
    start = encoder['_start_']
    delimiter = encoder['_delimiter_']
    for i, (q, x1, x2, x3), in enumerate(zip(Q, X1, X2, X3)):
        x11 = [start] + q[:max_len] + [delimiter] + x1[:max_len] + [clf_token]
        x12 = [start] + q[:max_len] + [delimiter] + x2[:max_len] + [clf_token]
        x13 = [start] + q[:max_len] + [delimiter] + x3[:max_len] + [clf_token]
        l11 = len(x11)
        l12 = len(x12)
        l13 = len(x13)
        xmb[i, 0, :l11, 0] = x11
        xmb[i, 1, :l12, 0] = x12
        xmb[i, 2, :l13, 0] = x13
        mmb[i, 0, :l11] = 1
        mmb[i, 1, :l12] = 1
        mmb[i, 2, :l13] = 1
    # Position information that is added to the input embeddings in the TransformerModel
    xmb[:, :, :, 1] = np.arange(n_vocab + n_special, n_vocab + n_special + n_ctx)
    return xmb, mmb

trX, trM = transform_qa(trQ, trX1, trX2, trX3)
vaX, vaM = transform_qa(vaQ, vaX1, vaX2, vaX3)

n_train = len(trY)
n_valid = len(vaY)
n_batch_train = 8
n_updates_total = (n_train // n_batch_train) * args.n_iter

dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)

criterion = nn.CrossEntropyLoss(reduce=False)
model_opt = OpenAIAdam(dh_model.parameters(),
                       lr=args.lr,
                       schedule=args.lr_schedule,
                       warmup=args.lr_warmup,
                       t_total=n_updates_total,
                       b1=args.b1,
                       b2=args.b2,
                       e=args.e,
                       l2=args.l2,
                       vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)

compute_loss_fct = MultipleChoiceLossCompute(criterion,
                                             criterion,
                                             args.lm_coef,
                                             model_opt)

# load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special,
#                              path=os.path.join(PATH, 'gpt/model/'), path_names=os.path.join(PATH, 'gpt/'))


dh_model.to(device)
dh_model = nn.DataParallel(dh_model)
dh_model.load_state_dict(model['model_state_dict'])

  0%|                                                  | 0/7610 [00:00<?, ?it/s]

device cuda n_gpu 1




In [0]:
from torch.nn import CrossEntropyLoss

In [0]:
def iter_apply(Xs, Ms, Ys):
    logits = []
    cost = 0
    with torch.no_grad():
        dh_model.eval()
        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
            n = len(xmb)
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
            MMB = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(XMB)
            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
            logits.append(clf_logits.to("cpu").numpy())
            cost += clf_losses.sum().item()
        logits = np.concatenate(logits, 0)
    return logits, cost


In [0]:
from sklearn.metrics import log_loss

In [57]:
va_logits, va_cost = iter_apply(vaX, vaM, vaY)
va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
tr_logits, tr_cost = iter_apply(trX, trM, trY)
tr_acc = accuracy_score(trY, np.argmax(tr_logits, 1)) * 100.





  0%|                                                   | 0/118 [00:00<?, ?it/s][A[A[A[A



  1%|▎                                          | 1/118 [00:00<00:41,  2.82it/s][A[A[A[A



  2%|▋                                          | 2/118 [00:00<00:39,  2.91it/s][A[A[A[A



  3%|█                                          | 3/118 [00:00<00:38,  2.99it/s][A[A[A[A



  3%|█▍                                         | 4/118 [00:01<00:37,  3.02it/s][A[A[A[A



  4%|█▊                                         | 5/118 [00:01<00:37,  3.03it/s][A[A[A[A



  5%|██▏                                        | 6/118 [00:01<00:37,  3.03it/s][A[A[A[A



  6%|██▌                                        | 7/118 [00:02<00:36,  3.05it/s][A[A[A[A



  7%|██▉                                        | 8/118 [00:02<00:35,  3.08it/s][A[A[A[A



  8%|███▎                                       | 9/118 [00:02<00:35,  3.09it/s][A[A[A[A



  8%|███▌                           

In [59]:
print(tr_acc, va_acc)
print(log_loss(trY, tr_logits), log_loss(vaY, va_logits))

96.25492772667543 57.47368421052632
0.49233503769495757 4.896577179484363
