In [5]:
import torch
from pytorch_transformers import *
import numpy as np
# import pandas as pd
from tqdm import tqdm, trange
from torch import nn
import random
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import run_squad as rs
# from utils_squad import (read_squad_examples, convert_examples_to_features,
#                          RawResult, write_predictions,
#                          RawResultExtended, write_predictions_extended)
import utils_squad as us


In [6]:
import json
import logging
import math
import collections
from io import open

from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize

# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores

logger = logging.getLogger(__name__)

In [7]:
MODELS = [(BertForQuestionAnswering,       BertTokenizer,      'bert-base-uncased')]
# MODELS = [(BertForQuestionAnswering,       BertTokenizer,      'bert-large-uncased-whole-word-masking-finetuned-squad')]

In [8]:
# Build a tokenizer, model and config from the pretrained weights named in
# each MODELS entry. Every iteration rebinds the same three names, so only the
# objects created for the last entry remain available to later cells.
for model_class, tokenizer_class, pretrained_weights in MODELS:
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    # NOTE(review): the config class is fixed to BertConfig regardless of
    # model_class — fine while MODELS only lists BERT variants.
    config = BertConfig.from_pretrained(pretrained_weights)


In [None]:
class RL_optimizer(BertForQuestionAnswering):
    """Question-answering wrapper that carries a scalar ``reward``.

    The instance keeps a ``reward`` attribute (initialised to 0) and delegates
    its forward pass to an inner ``BertForQuestionAnswering`` stored in
    ``self.bertqa``.

    NOTE(review): the class also inherits from ``BertForQuestionAnswering``,
    so ``super().__init__`` presumably builds a second copy of the model that
    ``forward`` never uses — confirm whether both are needed.
    """

    def __init__(self, config):
        """Create the wrapper and its inner QA model from ``config``."""
        super(RL_optimizer, self).__init__(config)
        self.reward = 0
        self.bertqa = BertForQuestionAnswering(config)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
                end_positions=None, position_ids=None, head_mask=None):
        """Run the inner QA model on the given inputs and return its outputs.

        All arguments are forwarded unchanged to ``self.bertqa``.
        """
        # Calling the bound module already supplies its own ``self``; passing
        # this wrapper as an extra positional argument shifted every input by
        # one slot. The caller's values are forwarded by keyword instead of
        # being replaced with None.
        outputs = self.bertqa(input_ids,
                              token_type_ids=token_type_ids,
                              attention_mask=attention_mask,
                              start_positions=start_positions,
                              end_positions=end_positions,
                              position_ids=position_ids,
                              head_mask=head_mask)

        return outputs

In [None]:
rl = RL_optimizer(config)

In [9]:
# Parameters whose names contain one of these substrings are placed in their
# own group. Both groups currently use weight_decay 0.0, so the split has no
# effect until a non-zero decay is set on the first group.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
# eps is Adam's numerical-stability term added to the denominator of the
# update. The previous value of 0.9 dominated that denominator and effectively
# disabled the per-parameter adaptive scaling; use a small constant instead.
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)



In [10]:
def calc_rewards():
    """Return a placeholder reward picked at random from -0.5 and 0.5.

    The value is random for now; it only exists to check that the training
    flow runs end to end.
    """
    candidate_rewards = [-0.5, 0.5]
    return random.choice(candidate_rewards)

In [11]:
class args_list:
    """Attribute container holding the notebook's run settings.

    An instance is passed to ``rs.load_and_cache_examples`` and read by the
    helper functions in this notebook (device, model type, output directory,
    n-best and answer-length limits, ...).
    """

    def __init__(self):
        settings = dict(
            # Execution environment
            local_rank=-1,
            n_gpu=1,
            # Data files and evaluation
            train_file='/home/ubuntu/question_generation/data/train-v2.0.json',
            evaluate=0,
            predict_file='/home/ubuntu/question_generation/data/dev-v2.0.json',
            eval_batch_size=1,
            # Model and tokenizer
            model_type='bert',
            model_name_or_path='bert-base-uncased',
            output_dir='./outputs/',
            tokenizer_name='BertTokenizer',
            # Feature construction
            max_seq_length=384,
            version_2_with_negative=True,
            doc_stride=128,
            max_query_length=64,
            device=torch.device('cuda', 0),
            overwrite_cache=False,
            # Prediction post-processing
            null_score_diff_threshold=0.0,
            n_best_size=20,
            max_answer_length=30,
            verbose_logging=False,
            do_lower_case=True,
        )
        # Assign in declaration order so the instance ends up with the same
        # attributes as plain ``self.x = ...`` statements would give it.
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)
        

In [12]:
args = args_list()
args.device

device(type='cuda', index=0)

In [13]:
# torch.cuda.set_device(args.device)
model.to(args.device)
# rl.to(args.device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
        

In [14]:
## For Train
# Load the training split (evaluate=False) and also keep the examples and
# features it was built from (output_examples=True).
dataset, examples, features = rs.load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=True)
# Batches are drawn in dataset order (no shuffling), 8 items at a time.
train_sampler = SequentialSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=8)

KeyboardInterrupt: 

In [None]:
## For Eval
# DistributedSampler is referenced below but is not among the names imported
# from torch.utils.data in the import cell, so bring it into scope here.
from torch.utils.data.distributed import DistributedSampler

# Load the evaluation split together with its examples and features.
dataset, examples, features = rs.load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
# Single-process runs (local_rank == -1) read the data in order; otherwise the
# dataset is sharded across processes.
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

In [31]:
def load_dataset(batch_size=2):
    """Load the training split and wrap it in a sequential DataLoader.

    Reads the notebook-level ``args`` and ``tokenizer``.

    Args:
        batch_size: number of items per batch of the returned dataloader.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        A tuple ``(dataset, examples, features, train_sampler,
        train_dataloader)``.
    """
    ## For Train
    dataset, examples, features = rs.load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=True)
    # Items are served in dataset order (no shuffling).
    train_sampler = SequentialSampler(dataset)
    train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)
    return dataset, examples, features, train_sampler, train_dataloader

In [41]:
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=4)

In [32]:
dataset, examples, features, train_sampler, train_dataloader = load_dataset()

KeyboardInterrupt: 

In [42]:
def train_with_rewards(num_epochs=2):
    """Train the notebook-level ``model`` while exercising the reward flow.

    For every batch of ``train_dataloader`` the model is run, predictions are
    written with ``write_predictions``, a reward from ``calc_rewards`` is
    added to the loss, and one optimizer step is taken. Relies on the
    notebook-level ``model``, ``optimizer``, ``args``, ``examples`` and the
    output file paths.

    Args:
        num_epochs: number of passes over ``train_dataloader``. Defaults to
            2, the value that used to be hard-coded.
    """
    # Progress bars are shown for local_rank -1 or 0 only.
    hide_progress = args.local_rank not in [-1, 0]
    train_iterator = trange(int(num_epochs), desc="Epoch", disable=hide_progress)
    model.zero_grad()

    for tr_iter in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=hide_progress)
        # Stays None when the dataloader yields no batch, so the report below
        # never touches an unbound name.
        loss = None
        for step, batch in enumerate(epoch_iterator):
            torch.cuda.empty_cache()
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            outputs = model(**form_inputs(batch))
            loss = outputs[0]

            # NOTE(review): get_all_results reads batch[3] as feature indices,
            # while form_inputs feeds the same tensor to the model as
            # start_positions — confirm which meaning holds for this dataset.
            results, used_features = get_all_results(batch, outputs)
            used_examples = get_required_examples(used_features, examples)
            all_predictions = write_predictions(used_examples, used_features, results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)

            rewards = calc_rewards()
            # NOTE(review): adding a constant shifts the loss value but leaves
            # its gradients unchanged, so the reward does not yet influence
            # the weight update.
            loss = loss + rewards
            model.zero_grad()
            loss.backward()
            optimizer.step()

        if loss is None:
            print(f'No batches were processed in step {tr_iter}')
        else:
            # .item() reports a plain number instead of a graph-attached tensor.
            print(f'The loss in step {tr_iter} is {loss.item()}')

In [40]:
train_with_rewards()















Epoch:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 0/65972 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 1/65972 [00:00<9:08:52,  2.00it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 2/65972 [00:01<9:15:16,  1.98it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 3/65972 [00:01<9:19:08,  1.97it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 4/65972 [00:02<9:21:31,  1.96it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 5/65972 [00:02<9:23:54,  1.95it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 6/65972 [00:03<9:26:16,  1.94it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 7/65972 [00:

Iteration:   0%|          | 65/65972 [00:33<9:33:44,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 66/65972 [00:34<9:34:34,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 67/65972 [00:34<9:33:14,  1.92it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 68/65972 [00:35<9:33:57,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 69/65972 [00:35<9:35:26,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 70/65972 [00:36<9:35:22,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 71/65972 [00:36<9:33:06,  1.92it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 72/65972 [00:37<9:33:20,  1.92it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|

Iteration:   0%|          | 131/65972 [01:08<9:36:17,  1.90it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 132/65972 [01:08<9:35:39,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 133/65972 [01:09<9:35:53,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 134/65972 [01:09<9:36:25,  1.90it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 135/65972 [01:10<9:35:14,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 136/65972 [01:10<9:35:33,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 137/65972 [01:11<9:34:16,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteration:   0%|          | 138/65972 [01:12<9:33:10,  1.91it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














Iteratio

KeyboardInterrupt: 

In [None]:
def get_naive_rewards(all_predictions):
    """Placeholder for a reward computed from ``all_predictions``.

    The reward rule has not been written yet. The function raises instead of
    silently returning ``None`` so that accidental use is noticed.

    Args:
        all_predictions: predictions to score (currently unused).

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("get_naive_rewards is not implemented yet")

In [None]:
# Grab the first training batch for interactive inspection.
# ``train_iterator`` only exists as a local inside train_with_rewards, so the
# batch is taken straight from the dataloader instead of looping over it.
model.train()
batch = next(iter(train_dataloader))
batch = tuple(t.to(args.device) for t in batch)

In [None]:
batch

In [20]:
def form_inputs(batch):
    """Turn a batch of tensors into keyword arguments for the model.

    Positions 0, 1, 3 and 4 of ``batch`` become ``input_ids``,
    ``attention_mask``, ``start_positions`` and ``end_positions``. Position 2
    becomes ``token_type_ids`` unless ``args.model_type`` is ``'xlm'``, in
    which case ``token_type_ids`` is ``None``.
    """
    input_ids, attention_mask = batch[0], batch[1]

    if args.model_type == 'xlm':
        token_type_ids = None
    else:
        token_type_ids = batch[2]

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'start_positions': batch[3],
        'end_positions': batch[4],
    }

In [None]:
outputs = model(**form_inputs(batch))

In [None]:
outputs

In [21]:
def get_all_results(batch, outputs, feature_indices=None):
    """Wrap the model outputs of one batch as per-item raw results.

    Reads the notebook-level ``features`` and ``args``.

    Args:
        batch: sequence of tensors. When ``feature_indices`` is not given,
            ``batch[3]`` is read as the index of each item in ``features``.
        outputs: model outputs; ``outputs[k][i]`` is taken for item ``i``.
        feature_indices: optional iterable with one index into ``features``
            per batch item. Supply it when ``batch[3]`` does not hold feature
            indices.

    Returns:
        A tuple ``(all_results, used_features)`` with one entry per item.
    """
    if feature_indices is None:
        # NOTE(review): form_inputs passes this same batch[3] to the model as
        # start_positions, so for such batches these values are answer
        # positions rather than feature indices — pass feature_indices then.
        feature_indices = [index.item() for index in batch[3]]

    all_results = []
    used_features = []
    for i, feature_index in enumerate(feature_indices):
        eval_feature = features[feature_index]
        used_features.append(eval_feature)
        unique_id = int(eval_feature.unique_id)
        if args.model_type in ['xlnet', 'xlm']:
            # XLNet uses a more complex post-processing procedure
            result = us.RawResultExtended(unique_id            = unique_id,
                                          start_top_log_probs  = outputs[0][i].tolist(),
                                          start_top_index      = (outputs[1][i]).tolist(),
                                          end_top_log_probs    = (outputs[2][i]).tolist(),
                                          end_top_index        = (outputs[3][i]).tolist(),
                                          cls_logits           = (outputs[4][i]).tolist())
        else:
            # outputs[0] is skipped here; the training loop reads it as the loss.
            result = us.RawResult(unique_id    = unique_id,
                                  start_logits = outputs[1][i].tolist(),
                                  end_logits   = outputs[2][i].tolist())
        all_results.append(result)
    return all_results, used_features

In [None]:
# get_all_results needs the model outputs as well as the batch.
results, used_features = get_all_results(batch, outputs)

In [22]:
def get_required_examples(used_features, all_examples):
    """Return, for each feature, the example found at its ``example_index``.

    The result has one entry per feature, in the order of ``used_features``;
    an example is repeated when several features point at it.
    """
    return [all_examples[feature.example_index] for feature in used_features]

In [None]:
used_examples = get_required_examples(used_features, examples)

In [37]:
import os

# write_predictions opens its output files in write mode, which fails when the
# directory is missing; create it up front so a fresh run works.
os.makedirs(args.output_dir, exist_ok=True)

# File names are suffixed with ``prefix`` (empty here).
prefix = ''
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
# The null-odds file is only produced when "no answer" predictions are enabled.
if args.version_2_with_negative:
    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
else:
    output_null_log_odds_file = None

In [None]:
all_predictions = write_predictions(used_examples, used_features, results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)

In [None]:
# One manual optimisation step on the current batch's outputs.
model.zero_grad()
loss = outputs[0]
r = calc_rewards()
print(r)
loss = loss + r
print(loss)
loss.backward()
# ``update_weights`` is not defined in the visible notebook cells; gradients
# are applied with the optimizer, exactly as the training loop does.
optimizer.step()

In [25]:
def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, verbose_logging,
                      version_2_with_negative, null_score_diff_threshold):
    """Write final predictions to the json file and log-odds of null if needed.

    Features are paired with examples by position: ``all_features[i]`` is
    treated as the only feature of ``all_examples[i]``.

    Args:
        all_examples: examples to predict for; ``qas_id`` and ``doc_tokens``
            are read from each.
        all_features: features aligned index-for-index with ``all_examples``;
            ``unique_id``, ``tokens``, ``token_to_orig_map`` and
            ``token_is_max_context`` are read from each.
        all_results: objects exposing ``unique_id``, ``start_logits`` and
            ``end_logits``; they are looked up by ``unique_id``.
        n_best_size: passed to ``us._get_best_indexes`` for the start and end
            logits, and used as the cap on n-best entries kept per example.
        max_answer_length: spans longer than this many tokens are discarded.
        do_lower_case: forwarded to ``us.get_final_text``.
        output_prediction_file: path the chosen answer per ``qas_id`` is
            written to.
        output_nbest_file: path the n-best lists are written to.
        output_null_log_odds_file: path for the null-minus-best score
            differences; only written when ``version_2_with_negative`` is true.
        verbose_logging: forwarded to ``us.get_final_text``.
        version_2_with_negative: when true, an empty ("no answer") prediction
            competes with the extracted spans.
        null_score_diff_threshold: the empty answer is chosen when the null
            score exceeds the best non-null score by more than this value.

    Returns:
        An ``OrderedDict`` mapping ``example.qas_id`` to the predicted text.
    """
    logger.info("Writing predictions to: %s" % (output_prediction_file))
    logger.info("Writing nbest to: %s" % (output_nbest_file))

    # Key each feature by its position in all_features, so that it lines up
    # with the example at the same position in all_examples.
    example_index_to_features = collections.defaultdict(list)
    count_f = 0
    for feature in all_features:
        example_index_to_features[count_f].append(feature)
        count_f += 1

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):

            # Candidate start/end positions come from this feature's logits.
            result = unique_id_to_result[feature.unique_id]
            start_indexes = us._get_best_indexes(result.start_logits, n_best_size)
            end_indexes = us._get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))
        # The "no answer" candidate is represented by the span (0, 0).
        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        # Highest combined start+end logit first.
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])

        # Texts already emitted, used to drop duplicate non-null answers.
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = us.get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))
        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))

            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
            if len(nbest)==1:
                nbest.insert(0,
                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        # Collect the combined score of every entry and remember the first
        # entry with non-empty text (the best one, since nbest is score-ordered).
        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = us._compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text
        # Entries are keyed by qas_id, so a later example with the same id
        # replaces an earlier one.
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions

In [None]:
!ls outputs

In [None]:
!mkdir outputs

In [None]:
used_examples

In [None]:
list(all_predictions.items())

In [None]:
tokenizer.decode(batch[0][0].flatten().tolist())

In [None]:
used_features

In [None]:
f = used_features[0]

In [None]:
f.example_index

In [18]:
def get_required_examples(used_features, all_examples):
    """Look up the example that each feature points to.

    Every feature's ``example_index`` is used as a position in
    ``all_examples``; the matches are returned in feature order.

    NOTE(review): a function with this name and the same body is also defined
    in an earlier cell of this notebook; one of the two can be removed.
    """
    wanted_positions = (feat.example_index for feat in used_features)
    return [all_examples[position] for position in wanted_positions]

In [None]:
# Scratch reproduction of the mapping built at the top of write_predictions:
# every feature is stored under its position in used_features (0, 1, 2, ...),
# not under its own example_index.
example_index_to_features = collections.defaultdict(list)
c = 0
for f in used_features:
        example_index_to_features[c].append(f)
        c += 1

In [None]:
example_index_to_features

In [None]:
for (example_index, example) in enumerate(used_examples):
        final_f = example_index_to_features[example_index]
        print(final_f)

In [4]:
import os
# Expose only GPU 0 to this process.
# NOTE(review): this cell has the lowest execution count in the notebook
# (In [4]) although it sits near the end; presumably it has to run before
# CUDA is initialised, so consider moving it to the top of the notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print(os.environ['CUDA_VISIBLE_DEVICES'])

0


In [46]:
# Collect the first 10 batches of the training dataloader for quick experiments.
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=-1 not in [-1, 0])
# A plain list is used: np.array() cannot be called without data, and numpy
# arrays have no append() method.
small_data = []
for step, batch in enumerate(epoch_iterator):
    if step < 10:
        small_data.append(batch)
    else:
        break



















Iteration:   0%|          | 0/32986 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

In [48]:
len(small_data)

10

In [50]:
type(dataset)

torch.utils.data.dataset.TensorDataset

In [58]:
# NOTE(review): this raises the ValueError recorded in the cell output.
# small_data[0] is a list of multi-element tensors with differing shapes
# (see the dump of small_data[0] below), which torch.tensor() cannot turn
# into a single tensor.
a = torch.tensor(small_data[0])

ValueError: only one element tensors can be converted to Python scalars

In [59]:
small_data[0]

[tensor([[ 101, 2043, 2106,  ...,    0,    0,    0],
         [ 101, 2054, 2752,  ...,    0,    0,    0],
         [ 101, 2043, 2106,  ...,    0,    0,    0],
         [ 101, 1999, 2054,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([ 75,  68, 143,  58]),
 tensor([ 78,  70, 143,  60]),
 tensor([0, 0, 0, 0]),
 tensor([[0., 1., 1.,  ..., 1., 1., 1.],
         [0., 1., 1.,  ..., 1., 1., 1.],
         [0., 1., 1.,  ..., 1., 1., 1.],
         [0., 1., 1.,  ..., 1., 1., 1.]])]