## Set up

### install packages

In [3]:
!pip install transformers

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


### import packages

In [4]:
import sys, os, copy, json, random, shutil

from tqdm.notebook import tqdm

import numpy as np

import transformers
from transformers import BertModel, BertTokenizer, AdamW, BertForMultipleChoice
from transformers import AutoModelForMultipleChoice, AutoTokenizer, TrainingArguments, Trainer
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler

#### Connect to gdrive

In [5]:
# from google.colab import drive
# drive.mount("/content/gdrive")

## Config settings

#### Global params

In [6]:
# PROJ_DIR = 'gdrive/MyDrive/595finalproj/PIQA/'
PROJ_DIR = '/home/ec2-user/SageMaker/test/PIQA/'
MAX_LENGTH = 50

In [7]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

TUNED_LAYERS_NUM = '3' # number of layers that we will update

#### Network params

In [8]:
args_epoch = 8
args_lr = 1e-4

In [9]:
args_seeds = 728

In [10]:
args_batch_size = 6

In [11]:
args_workers = 0

In [12]:
args_num_classes = 2

#### Set up device

In [13]:
def set_device():
  if torch.cuda.is_available():
      device = torch.device('cuda:0')
  else:
      device = torch.device('cpu')
  print('using device:', device)
  return device

print(torch.cuda.device_count())
device = set_device()

4
using device: cuda:0


Set up random seeds

In [14]:
def seed_torch(seed=728, is_worker=False):
  if not is_worker:
    os.environ['PYTHONHASHSEED'] = str(seed)
  
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

        
def worker_init_fn(worker_id):
    worker_seed = torch.initial_seed() % 2**30 + worker_id
    seed_torch(worker_seed, True)

#### Util settings

In [15]:
np.set_printoptions(threshold=sys.maxsize)
torch.set_printoptions(threshold=sys.maxsize)
torch.set_printoptions(precision=10)
torch.autograd.set_detect_anomaly(True)
torch.cuda.empty_cache()

## Preprocess

### step 1. Wrap data
input: data files \\
output: required format for the data

In [16]:
def wrap_data(data_name, label_name, max_length_threshold=512, is_test=False):
  data_list = []
  label_list = []
  max_length = -1

  with open(PROJ_DIR+data_name, 'r') as f:
    datas = list(f)
    total_num_sample = len(datas)
    print('num of sample: ', total_num_sample)

  if not is_test:
    with open(PROJ_DIR+label_name, 'r') as f:
      labels = list(f)
      total_num_label = len(labels)
      print('num of label: ', total_num_label)

  total_num = total_num_sample

  for i in range(total_num):
    data = json.loads(datas[i])

    if not is_test:
      ans = labels[i][:-1]

    this_question = data['goal']

    this_sol1 = data['sol1']
    this_sol2 = data['sol2']


    sample = [this_question, this_sol1, this_sol2]

    if len(sample[0]) + len(sample[1]) > max_length_threshold:
      continue
    
    if len(sample[0]) + len(sample[2]) > max_length_threshold:
      continue 

    # max_length = max(max_length, len(sample))

    if not is_test:
#       print(int(ans))
      sample.append(int(ans))
    else: # if test_dataset, set any fake labels
      sample.append(-1)
    data_list.append(sample)

  return data_list, total_num


In [17]:
train_data_list, train_num = wrap_data('train.jsonl', 'train-labels.lst', MAX_LENGTH)
val_data_list, val_num = wrap_data('valid.jsonl', 'valid-labels.lst', MAX_LENGTH)
test_data_list, test_num = wrap_data('tests.jsonl', 'tests-labels.lst', MAX_LENGTH, True)

num of sample:  16113
num of label:  16113
num of sample:  1838
num of label:  1838
num of sample:  3084


In [18]:
len(train_data_list)

2070

### step 2. Tokernizer
input: required format for the data \\
output: tokenize input for the data

In [19]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [20]:
train_data_token = []
val_data_token = []
test_data_token = []

In [21]:

for sample in tqdm(train_data_list):
  prompt, choice0, choice1, labels = sample[0], sample[1], sample[2], sample[3]
  encoding = tokenizer([prompt, prompt], [choice0, choice1], truncation=True)
  encoding['labels'] = labels
  train_data_token.append(encoding)

for sample in tqdm(val_data_list):
  prompt, choice0, choice1, labels = sample[0], sample[1], sample[2], sample[3]
  encoding = tokenizer([prompt, prompt], [choice0, choice1], truncation=True)
  encoding['labels'] = labels
  print(encoding)
  val_data_token.append(encoding)

for sample in tqdm(test_data_list):
  prompt, choice0, choice1, labels = sample[0], sample[1], sample[2], sample[3]
  encoding = tokenizer([prompt, prompt], [choice0, choice1], truncation=True)
  encoding['labels'] = labels
  test_data_token.append(encoding)

  0%|          | 0/2070 [00:00<?, ?it/s]

  0%|          | 0/239 [00:00<?, ?it/s]

{'input_ids': [[101, 16641, 102, 5672, 13065, 2007, 6173, 9231, 102], [101, 16641, 102, 3926, 1010, 3536, 17643, 2378, 2007, 6173, 9231, 102]], 'token_type_ids': [[0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': 1}
{'input_ids': [[101, 14113, 2121, 102, 2064, 2022, 2109, 2004, 1037, 11446, 2006, 1037, 2547, 102], [101, 14113, 2121, 102, 2064, 2022, 2109, 2004, 1037, 11446, 2006, 1037, 28407, 102]], 'token_type_ids': [[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': 0}
{'input_ids': [[101, 6081, 4524, 102, 2064, 4287, 17910, 102], [101, 6081, 4524, 102, 2064, 4287, 6536, 102]], 'token_type_ids': [[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]

  0%|          | 0/399 [00:00<?, ?it/s]

In [22]:
print(val_data_token[0])

{'input_ids': [[101, 16641, 102, 5672, 13065, 2007, 6173, 9231, 102], [101, 16641, 102, 3926, 1010, 3536, 17643, 2378, 2007, 6173, 9231, 102]], 'token_type_ids': [[0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': 1}


In [23]:
len(train_data_token), len(val_data_token), len(test_data_token)

(2070, 239, 399)

### step 3. Make dataset

In [24]:
datasets = {}
datasets['train'] = train_data_token
datasets['validation'] = val_data_token

In [25]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        
        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        
        for idx, _ in enumerate(features):
            features[idx]['labels'] = labels[idx]
        
        return batch

In [26]:
accepted_keys = ["input_ids", "token_type_ids", "attention_mask", "labels"]
features = [{k: v for k, v in val_data_token[i].items() if k in accepted_keys} for i in range(10)]
print(features[0])
batch = DataCollatorForMultipleChoice(tokenizer)(features)
print(features[0])

{'input_ids': [[101, 16641, 102, 5672, 13065, 2007, 6173, 9231, 102], [101, 16641, 102, 3926, 1010, 3536, 17643, 2378, 2007, 6173, 9231, 102]], 'token_type_ids': [[0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': 1}
{'input_ids': [[101, 16641, 102, 5672, 13065, 2007, 6173, 9231, 102], [101, 16641, 102, 3926, 1010, 3536, 17643, 2378, 2007, 6173, 9231, 102]], 'token_type_ids': [[0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': 1}


In [27]:
batch['input_ids'].shape

torch.Size([10, 2, 15])

## Training

### Model

In [28]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

In [29]:
model = AutoModelForMultipleChoice.from_pretrained(PRE_TRAINED_MODEL_NAME).to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [30]:
# model = torch.nn.DataParallel(model).cuda()

In [31]:
model_name = 'ziyan-595finalproj'
args = TrainingArguments(
    f"{model_name}-finetuned-piqa",
    evaluation_strategy="epoch",
    learning_rate=args_lr,
    per_device_train_batch_size=args_batch_size,
    per_device_eval_batch_size=args_batch_size,
    num_train_epochs=args_epoch,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy='epoch'
)

### Optimizer

In [32]:
nonfreeze_layers = []

for name, _ in model.named_parameters():
#   print(name)
  if '.' + TUNED_LAYERS_NUM + '.' in name:
    break
  nonfreeze_layers.append(name)

nonfreeze_layers.append("module.classifier.weight")
nonfreeze_layers.append("module.classifier.bias")
print(nonfreeze_layers)
# double-check
# non_freeze_names = [name for name, param in model.named_parameters() if name in nonfreeze_layers]
# print(non_freeze_names)
# optimizer = optim.Adam(model.parameters(), lr=args_lr)
optimizer = AdamW([{'params':[param for name, param in model.named_parameters() if name in nonfreeze_layers]}], lr=args_lr)

['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.

In [33]:
optimizer

AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    lr: 0.0001
    weight_decay: 0.0
)

### Run

In [34]:
def compute_metrics(eval_predictions):
    predictions, label_ids = eval_predictions
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

In [35]:
trainer = Trainer(
    model,
    args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)
)

In [36]:
trainer.train()

***** Running training *****
  Num examples = 2070
  Num Epochs = 8
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 696


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6936,0.688572,0.560669
2,0.6925,0.688282,0.594142
3,0.6794,0.686858,0.564854
4,0.6435,0.669616,0.610879
5,0.5785,0.695661,0.573222
6,0.4986,0.724081,0.610879
7,0.4367,0.803475,0.548117
8,0.3902,0.816629,0.548117


***** Running Evaluation *****
  Num examples = 239
  Batch size = 24
Saving model checkpoint to ziyan-595finalproj-finetuned-piqa/checkpoint-87
Configuration saved in ziyan-595finalproj-finetuned-piqa/checkpoint-87/config.json
Model weights saved in ziyan-595finalproj-finetuned-piqa/checkpoint-87/pytorch_model.bin
tokenizer config file saved in ziyan-595finalproj-finetuned-piqa/checkpoint-87/tokenizer_config.json
Special tokens file saved in ziyan-595finalproj-finetuned-piqa/checkpoint-87/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 239
  Batch size = 24
Saving model checkpoint to ziyan-595finalproj-finetuned-piqa/checkpoint-174
Configuration saved in ziyan-595finalproj-finetuned-piqa/checkpoint-174/config.json
Model weights saved in ziyan-595finalproj-finetuned-piqa/checkpoint-174/pytorch_model.bin
tokenizer config file saved in ziyan-595finalproj-finetuned-piqa/checkpoint-174/tokenizer_config.json
Special tokens file saved in ziyan-595finalproj-finetuned-p

TrainOutput(global_step=696, training_loss=0.5766338973209776, metrics={'train_runtime': 818.0337, 'train_samples_per_second': 20.244, 'train_steps_per_second': 0.851, 'total_flos': 278804770958304.0, 'train_loss': 0.5766338973209776, 'epoch': 8.0})

In [37]:
!mkdir /home/ec2-user/SageMaker/test/saved_model/tune_3_layers/

mkdir: cannot create directory ‘/home/ec2-user/SageMaker/test/saved_model/tune_3_layers/’: File exists


In [41]:
trainer.save_model("/home/ec2-user/SageMaker/test/saved_model/tune_3_layers/")

Saving model checkpoint to /home/ec2-user/SageMaker/test/saved_model/tune_3_layers/
Configuration saved in /home/ec2-user/SageMaker/test/saved_model/tune_3_layers/config.json
Model weights saved in /home/ec2-user/SageMaker/test/saved_model/tune_3_layers/pytorch_model.bin
tokenizer config file saved in /home/ec2-user/SageMaker/test/saved_model/tune_3_layers/tokenizer_config.json
Special tokens file saved in /home/ec2-user/SageMaker/test/saved_model/tune_3_layers/special_tokens_map.json


In [43]:
!zip -r ziyan_tune_3_layers.zip '/home/ec2-user/SageMaker/test/saved_model/tune_3_layers/' 

  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/ (stored 0%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/tokenizer_config.json (deflated 39%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/training_args.bin (deflated 48%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/special_tokens_map.json (deflated 40%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/vocab.txt (deflated 53%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/config.json (deflated 47%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/tokenizer.json (deflated 59%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/pytorch_model.bin (deflated 7%)
  adding: home/ec2-user/SageMaker/test/saved_model/tune_3_layers/.ipynb_checkpoints/ (stored 0%)


In [44]:
assert False

AssertionError: 

## Evaluate

## Test a sample