In [6]:
import os

import numpy as np
import torch
import transformers
from tqdm import tqdm
from transformers import default_data_collator
from transformers import TrainingArguments
from transformers import InputFeatures
from transformers import glue_compute_metrics, glue_output_modes
from transformers.data.datasets.glue import FileLock

from experiments import constants, misc_utils


# Load the tokenizer and model from the MNLI checkpoint path configured in
# `constants`.
tokenizer, model = misc_utils.create_tokenizer_and_model(constants.MNLI_MODEL_PATH)

# Build the MNLI train / eval datasets with that tokenizer.
mnli_train_dataset, mnli_eval_dataset = misc_utils.create_datasets(
    task_name="mnli", tokenizer=tokenizer
)

# One example per batch so each training instance can be relabeled on its own.
# NOTE(review): random=False presumably keeps the dataset order, which the
# index-aligned sanity check later in this notebook depends on -- confirm
# against misc_utils.get_dataloader.
train_instance_data_loader = misc_utils.get_dataloader(
    dataset=mnli_train_dataset, batch_size=1, random=False
)

def build_compute_metrics_fn(task_name: str):
    """Build the `compute_metrics` callback for a GLUE task.

    The task's output mode ("classification" or "regression") is looked up in
    `glue_output_modes` when the callback is built, so an unknown task fails
    here instead of in the middle of an evaluation run.

    Args:
        task_name: GLUE task key, e.g. "mnli".

    Returns:
        A function taking a prediction object with `predictions` and
        `label_ids` attributes and returning the result of
        `glue_compute_metrics` for this task.

    Raises:
        ValueError: if `task_name` has no entry in `glue_output_modes`, or its
            output mode is neither "classification" nor "regression".
    """
    try:
        output_mode = glue_output_modes[task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % task_name)

    def compute_metrics_fn(p):
        if output_mode == "classification":
            # Pick the highest-scoring class for each row of predictions.
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            # Drop singleton dimensions from the raw predictions.
            preds = np.squeeze(p.predictions)
        else:
            # Without this branch `preds` would be unbound below.
            raise ValueError("Unsupported output mode: %s" % output_mode)
        return glue_compute_metrics(task_name, preds, p.label_ids)

    return compute_metrics_fn

# Trainer wrapping the loaded MNLI model; it is handed to misc_utils.predict
# in the relabeling loop.
training_args = TrainingArguments(
    output_dir="./tmp-output",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=5e-5,
    logging_steps=100,
)
mnli_metrics_fn = build_compute_metrics_fn("mnli")

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=default_data_collator,
    train_dataset=mnli_train_dataset,
    eval_dataset=mnli_eval_dataset,
    compute_metrics=mnli_metrics_fn,
)

# Relabel every MNLI training example with the model's own prediction.
# The dataloader yields one example per batch, so index 0 is that example.
mnli_predicted_label = []
with tqdm(total=len(mnli_train_dataset)) as progress_bar:
    for batch in train_instance_data_loader:
        # Only the first returned value is used here; the other two are the
        # label ids and the evaluation loss reported by misc_utils.predict.
        scores, _gold_label_ids, _eval_loss = misc_utils.predict(trainer, model, batch)
        predicted_class = int(scores.argmax(axis=-1)[0])

        # Same inputs as the original example, but with the predicted label.
        mnli_predicted_label.append(
            InputFeatures(
                input_ids=batch["input_ids"][0].tolist(),
                attention_mask=batch["attention_mask"][0].tolist(),
                token_type_ids=batch["token_type_ids"][0].tolist(),
                label=predicted_class,
            )
        )
        progress_bar.update()

## Saving File
# Destination for the relabeled ("imitator") MNLI training features. The file
# name matches the cached-features file of the original training data so the
# sanity check can compare the two.
cached_features_file = "export/home/Data/Glue/MNLI/imitator/cached_train_BertTokenizer_128_mnli"
lock_path = cached_features_file + ".lock"

# Create the output directory before locking or writing. Otherwise a missing
# "imitator" directory makes the save fail and the long inference run above
# has to be repeated.
os.makedirs(os.path.dirname(cached_features_file), exist_ok=True)

# Hold the lock while writing so the file is not written concurrently.
with FileLock(lock_path):
    torch.save(mnli_predicted_label, cached_features_file)
    
# This whole thing takes around 1 hour

Params Trainable: 14768643
	bert.encoder.layer.10.attention.self.query.weight
	bert.encoder.layer.10.attention.self.query.bias
	bert.encoder.layer.10.attention.self.key.weight
	bert.encoder.layer.10.attention.self.key.bias
	bert.encoder.layer.10.attention.self.value.weight
	bert.encoder.layer.10.attention.self.value.bias
	bert.encoder.layer.10.attention.output.dense.weight
	bert.encoder.layer.10.attention.output.dense.bias
	bert.encoder.layer.10.attention.output.LayerNorm.weight
	bert.encoder.layer.10.attention.output.LayerNorm.bias
	bert.encoder.layer.10.intermediate.dense.weight
	bert.encoder.layer.10.intermediate.dense.bias
	bert.encoder.layer.10.output.dense.weight
	bert.encoder.layer.10.output.dense.bias
	bert.encoder.layer.10.output.LayerNorm.weight
	bert.encoder.layer.10.output.LayerNorm.bias
	bert.encoder.layer.11.attention.self.query.weight
	bert.encoder.layer.11.attention.self.query.bias
	bert.encoder.layer.11.attention.self.key.weight
	bert.encoder.layer.11.attention.self.ke

  1%|          | 3690/392702 [00:31<55:34, 116.65it/s]  


KeyboardInterrupt: 

In [58]:
# Sanity check: all non-label fields should match the previously cached training data,
#               and the prediction accuracy should also be about 80%.
import torch
# Load the original cached features and the relabeled ("imitator") copy.
reference_features = torch.load("export/home/Data/Glue/MNLI/cached_train_BertTokenizer_128_mnli")
imitator_features = torch.load("export/home/Data/Glue/MNLI/imitator/cached_train_BertTokenizer_128_mnli")
assert len(reference_features) == len(imitator_features)

# Compare example by example: the inputs must be identical, and only the
# label may differ. Count how often the predicted label equals the gold one.
count_same_label = 0
for reference, imitated in zip(reference_features, imitator_features):
    assert reference.input_ids == imitated.input_ids, "Mismatch input_ids"
    assert reference.attention_mask == imitated.attention_mask, "Mismatch attention mask"
    assert reference.token_type_ids == imitated.token_type_ids, "Mismatch token_type_ids"
    if reference.label == imitated.label:
        count_same_label += 1

total_examples = len(reference_features)
print("Total of ", count_same_label, " correct predictions out of ", total_examples,
      " equivalent to training accuracy of ", count_same_label / total_examples)
# 0.848281903326186
# I think this training accuracy makes sense given our eval accuracy of ~80%

Total of  333122  correct predictions out of  392702  equivalent to training accuracy of  0.8482819033261862


In [59]:
# Test run (confirmed working): fine-tune bert-base-cased on MNLI for 10 epochs
# by shelling out to run_glue.py.
# --data_dir is the "imitator" directory, i.e. the directory the relabeled
# cached_train_BertTokenizer_128_mnli file was saved to earlier in this notebook.
# NOTE(review): presumably run_glue.py reuses that cached feature file instead
# of re-tokenizing raw MNLI data -- confirm against run_glue.py.
# No eval batch size is passed; the logged TrainingArguments show
# per_device_eval_batch_size=8.
!python run_glue.py \
    --model_name_or_path bert-base-cased \
    --task_name mnli \
    --do_train \
    --do_eval \
    --data_dir export/home/Data/Glue/MNLI/imitator \
    --max_seq_length 128 \
    --per_device_train_batch_size 128 \
    --learning_rate 2e-5 \
    --num_train_epochs 10.0 \
    --output_dir tuned_param/mnli-imitator-10ep/ \
    --weight_decay 0.005 \
    --save_steps 5000 \
    --logging_steps 100 \
    --save_total_limit 1

10/16/2021 11:12:11 - INFO - transformers.training_args -   PyTorch: setting up devices
10/16/2021 11:12:11 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='tuned_param/mnli-imitator-10ep/', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=128, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=2e-05, weight_decay=0.005, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Oct16_11-12-11_d1022', logging_first_step=False, logging_steps=100, save_steps=5000, save_total_limit=1, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=1000, past_index=-1)
10/16/2021 11:12:11 - INFO - transformers.configuration_utils -   l