In [None]:
from datasets import load_dataset
import glob

In [None]:
# Load the "train" split of the 'bookcorpusopen' dataset through datasets.load_dataset.
# The cache directory is an absolute, machine-specific path; adjust it on other hosts.
book_dataset = load_dataset('bookcorpusopen', split="train", cache_dir='/mounts/data/proj/xinpeng/huggingface/bookcorpusopen')

In [None]:
# Display the first record to inspect its fields.
book_dataset[0]

In [None]:
class BookscorpusTextFormatting:
    """Merge a directory of plain-text books into one output file.

    Every book is flattened to a single line: its non-blank lines are stripped
    and joined with single spaces (a trailing space is kept). Each book is
    followed by two newline characters, so consecutive books are separated by
    an empty line.

    Args:
        books_path: Directory that contains the ``*.txt`` book files.
        output_filename: Path of the merged file; it is overwritten.
        recursive: When true, ``*.txt`` files in sub-directories of
            ``books_path`` are merged as well. When false (the default) only
            the files directly inside ``books_path`` are read.
    """

    def __init__(self, books_path, output_filename, recursive=False):
        self.books_path = books_path
        self.recursive = recursive
        self.output_filename = output_filename

    def merge(self):
        """Write all books found under ``books_path`` to ``output_filename``."""
        import os  # local import: only needed to recognise the output file below

        # glob only descends into sub-directories when the pattern contains
        # "**" AND recursive=True is passed, so honour the constructor flag here.
        pattern = self.books_path + ("/**/*.txt" if self.recursive else "/*.txt")
        output_real = os.path.realpath(self.output_filename)

        # Collect the inputs BEFORE opening the output file and drop the output
        # file itself, otherwise a merged file living inside books_path would be
        # read back as if it were a book. Sorting makes the book order stable.
        filenames = sorted(
            name
            for name in glob.glob(pattern, recursive=self.recursive)
            if os.path.realpath(name) != output_real
        )

        # Explicit UTF-8 so the result does not depend on the locale encoding
        # (the inputs are decoded as UTF-8 with an optional BOM).
        with open(self.output_filename, mode="w", encoding="utf-8", newline="\n") as ofile:
            for filename in filenames:
                with open(filename, mode="r", encoding="utf-8-sig", newline="\n") as file:
                    for line in file:
                        stripped = line.strip()
                        if stripped:
                            ofile.write(stripped + " ")
                ofile.write("\n\n")

In [None]:
### Create a toy book file ###
# Absolute, machine-specific path of the small sample file written and read by the next cells.
book_toy_path = '/mounts/data/proj/xinpeng/toy/book_1.txt'

In [None]:
### Write the first Hugging Face book-corpus documents to the toy file ###
# The loop stops after index 100, so 101 documents are written, back to back,
# with no separator added between them.
# The file is only written here, so plain 'w' is enough; the explicit UTF-8
# encoding keeps the result independent of the machine's locale.
with open(book_toy_path, 'w', encoding='utf-8', newline='\n') as f:
    for idx, record in enumerate(book_dataset):
        doc_raw = record['text']
        f.write(doc_raw)
        if idx == 100:
            break

In [None]:
# Print the first line of the toy file (nothing is printed if the file is empty).
# The context manager closes the handle even if printing raises, which the
# previous manual open()/close() pair did not guarantee.
with open(book_toy_path, mode='r', encoding='utf-8', newline='\n') as book_toy_wrapper:
    for i in book_toy_wrapper:
        print(i)
        break

In [None]:
# Read only the first line of the merged corpus file into `a`
# (`a` is the empty string when the file is empty).
book_budget = '/mounts/data/proj/xinpeng/budget/bookcorpus_one_article_per_line.txt'
with open(book_budget, mode='r') as f:
    a = next(f, '')

### Strong Teacher

In [None]:

import torch.nn as nn


def cosine_similarity(a, b, eps=1e-8):
    """Cosine similarity between ``a`` and ``b`` taken along dimension 1.

    ``eps`` is added to the product of the two norms so that all-zero vectors
    yield 0 instead of a division by zero.
    """
    dot_product = (a * b).sum(1)
    norm_product = a.norm(dim=1) * b.norm(dim=1)
    return dot_product / (norm_product + eps)


def pearson_correlation(a, b, eps=1e-8):
    """Pearson correlation along dimension 1.

    Computed as the cosine similarity of the two inputs after subtracting
    their means along dimension 1.
    """
    a_centered = a - a.mean(1, keepdim=True)
    b_centered = b - b.mean(1, keepdim=True)
    return cosine_similarity(a_centered, b_centered, eps)


def inter_class_relation(y_s, y_t):
    """Return one minus the mean Pearson correlation between ``y_s`` and ``y_t``."""
    correlation = pearson_correlation(y_s, y_t)
    return 1 - correlation.mean()


def intra_class_relation(y_s, y_t):
    """Apply ``inter_class_relation`` after swapping dimensions 0 and 1 of both inputs."""
    y_s_swapped = y_s.transpose(1, 0)
    y_t_swapped = y_t.transpose(1, 0)
    return inter_class_relation(y_s_swapped, y_t_swapped)


class DIST(nn.Module):
    """Correlation-based distillation loss between student and teacher logits.

    Both inputs are softmax-normalised over dimension 1. The loss is
    ``beta * inter + gamma * intra``, where each term is one minus the mean
    Pearson correlation between the student and teacher probabilities:
    ``inter`` correlates along dimension 1, ``intra`` along dimension 0
    (dimensions 0 and 1 are swapped before correlating).

    The correlation is computed by a private helper instead of the
    notebook-level ``inter_class_relation`` / ``intra_class_relation``
    functions. Those names are looked up at call time and a later cell
    redefines the helpers they depend on with a different reduction axis,
    which silently changed this loss for inputs with more than two
    dimensions depending on which cells had been executed.

    Args:
        beta: Weight of the inter term.
        gamma: Weight of the intra term.
    """

    def __init__(self, beta=1.0, gamma=1.0):
        super().__init__()
        self.beta = beta
        self.gamma = gamma

    @staticmethod
    def _relation_loss(y_s, y_t, eps=1e-8):
        """Return ``1 - mean Pearson correlation`` of the inputs along dimension 1."""
        s_centered = y_s - y_s.mean(1, keepdim=True)
        t_centered = y_t - y_t.mean(1, keepdim=True)
        # eps keeps the division finite when a centred vector is all zeros
        norm_product = s_centered.norm(dim=1) * t_centered.norm(dim=1) + eps
        correlation = (s_centered * t_centered).sum(1) / norm_product
        return 1 - correlation.mean()

    def forward(self, z_s, z_t):
        """Return the scalar loss for student logits ``z_s`` and teacher logits ``z_t``."""
        y_s = z_s.softmax(dim=1)
        y_t = z_t.softmax(dim=1)
        inter_loss = self._relation_loss(y_s, y_t)
        # swapping dims 0 and 1 makes the helper correlate along the original dim 0
        intra_loss = self._relation_loss(y_s.transpose(0, 1), y_t.transpose(0, 1))
        kd_loss = self.beta * inter_loss + self.gamma * intra_loss
        return kd_loss

In [None]:
import torch
# Instantiate the DIST loss with its default weights (beta=1.0, gamma=1.0).
dist = DIST()

In [None]:
# Random stand-ins for the student / teacher inputs of the loss modules.
# A dedicated, seeded generator makes the values reproducible after a kernel
# restart without touching the global RNG state.
# NOTE(review): the 4-D shape looks like (batch, heads, seq, seq) attention maps — confirm.
_toy_rng = torch.Generator().manual_seed(0)
a = torch.rand([32, 16, 128, 128], generator=_toy_rng)
b = torch.rand([32, 16, 128, 128], generator=_toy_rng)

In [None]:
# Smoke test: evaluate the DIST loss on the random tensors `a` and `b` and display the result.
dist(a,b)

In [None]:
def cosine_similarity(a, b, eps=1e-8):
    """Cosine similarity between ``a`` and ``b`` taken along the last dimension.

    This redefines the earlier notebook function of the same name, which
    reduced along dimension 1. ``eps`` is added to the product of the norms
    to avoid a division by zero.
    """
    dot_product = (a * b).sum(-1)
    norm_product = a.norm(dim=-1) * b.norm(dim=-1)
    return dot_product / (norm_product + eps)


def pearson_correlation(a, b, eps=1e-8):
    """Pearson correlation along the last dimension.

    Computed as the cosine similarity of the two inputs after subtracting
    their means along the last dimension.
    """
    a_centered = a - a.mean(-1, keepdim=True)
    b_centered = b - b.mean(-1, keepdim=True)
    return cosine_similarity(a_centered, b_centered, eps)


def inter_class_relation(y_s, y_t):
    """Return one minus the mean Pearson correlation between ``y_s`` and ``y_t``."""
    correlation = pearson_correlation(y_s, y_t)
    return 1 - correlation.mean()



class DIST_ATT(nn.Module):
    def __init__(self, beta=1.0, gamma=1.0):
        super(DIST_ATT, self).__init__()
        self.beta = beta
        self.gamma = gamma

    def forward(self, z_s, z_t):
        y_s = z_s.softmax(dim=-1)
        y_t = z_t.softmax(dim=-1)
        inter_token_1 = inter_class_relation(y_s, y_t)
        inter_token_2 = inter_class_relation(y_s.transpose(2, 3), y_t.transpose(2, 3))
        inter_head = inter_class_relation(y_s.transpose(1, 3), y_t.transpose(1, 3))
        inter_sentence = inter_class_relation(y_s.transpose(0, 3), y_t.transpose(0, 3))
        kd_loss = inter_token_1 + inter_token_2 + inter_head + inter_sentence
        return kd_loss

In [None]:
# Instantiate the four-axis correlation loss with its default arguments.
loss=DIST_ATT()

In [None]:
# Smoke test: evaluate DIST_ATT on `a` and `b` and display the result.
loss(a,b)

### Save the teacher for fine-tuning

In [None]:
# Pre-training arguments. A later cell wraps this mapping in an AttrDict
# (`AttrDict(dict)`) and passes it to the model constructors.
# NOTE(review): binding the mapping to the name `dict` shadows the built-in
# `dict` type for the rest of the kernel session. The name is left unchanged
# here because that later cell reads it.
dict={
  "add_nsp": False,
  "async_worker": True,
  "attention_dropout_checkpoint": False,
  "current_run_id": "",
  "data_loader_type": "dist",
  "dataset_path": "/mounts/Users/student/xinpeng/data/budget/masked",
  "deepspeed": False,
  "deepspeed_config": "training-out/pretraining_experiment-/epoch1000000_step14998/deepspeed_config.json",
  "deepspeed_transformer_kernel": False,
  "do_validation": False,
  "ds_config": {
    "fp16": {
      "enabled": True,
      "hysteresis": 2,
      "loss_scale": 0,
      "loss_scale_window": 1000,
      "min_loss_scale": 1
    },
    "gradient_clipping": 0.0,
    "steps_per_print": 100,
    "train_batch_size": 4096,
    "train_micro_batch_size_per_gpu": 64,
    "wall_clock_breakdown": False
  },
  "early_exit_time_marker": 24.0,
  "early_stop_eval_loss": 6.0,
  "early_stop_time": 180,
  "exp_start_marker": 10749109.734136138,
  "finetune_checkpoint_at_end": True,
  "fp16": True,
  "fp16_backend": "ds",
  "fp16_opt": "O2",
  "gelu_checkpoint": False,
  "gradient_accumulation_steps": 8,
  "gradient_clipping": 0.0,
  "job_name": "pretraining_experiment-",
  "learning_rate": 0.001,
  "local_rank": 0,
  "log_throughput_every": 20,
  "lr": 0.001,
  "max_predictions_per_seq": 20,
  "max_steps": 9223372036854775807,
  "max_steps_per_epoch": 9223372036854775807,
  # Architecture settings: 12 layers, 12 attention heads, hidden size 768.
  "model_config": {
    "attention_probs_dropout_prob": 0.1,
    "encoder_ln_mode": "post-ln",
    "fused_linear_layer": False,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_type": "pytorch",
    "layernorm_embedding": True,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "sparse_mask_prediction": True,
    "type_vocab_size": 2,
    "vocab_size": 30522,
  },
  "model_type": "bert-mlm",
  "no_nsp": True,
  "normalize_invertible": False,
  "num_epochs": 1000000,
  "num_epochs_between_checkpoints": 10000,
  "num_workers": 4,
  "output_dir": "training-out",
  "prescale_gradients": False,
  "print_steps": 100,
  "project_name": "budget-bert-pretraining",
  "saved_model_path": "training-out/pretraining_experiment-/",
  "scale_cnt_limit": 100,
  "seed": 42,
  "steps_per_print": 100,
  "stochastic_mode": False,
  "tokenizer_name": "bert-base-uncased",
  "total_training_time": 24.0,
  "train_batch_size": 4096,
  "train_micro_batch_size_per_gpu": 64,
  "use_early_stopping": True,
  "validation_begin_proportion": 0.05,
  "validation_end_proportion": 0.01,
  "validation_epochs": 3,
  "validation_epochs_begin": 1,
  "validation_epochs_end": 1,
  "validation_micro_batch": 16,
  "vocab_size": 30522,
  "wall_clock_breakdown": False
}

: 

In [3]:

%load_ext autoreload
%autoreload 2
from attrdict import AttrDict
import os
os.chdir('/mounts/Users/student/xinpeng/code/academic-budget-bert')
from pretraining.base import BasePretrainModel, PretrainedBertConfig, BertForMaskedLM
from pretraining.modeling import BertLMHeadModel
from transformers import BertModel
import torch
# from pretraining.utils import budget_to_huggingface

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load a saved toy batch; the relative path is resolved against the working
# directory set by os.chdir in the import cell.
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
batch = torch.load('notebooks/data/batch_toy.pt')


In [6]:
# Wrap the argument mapping for attribute access and build the pre-training
# wrapper, then replace its network with the one loaded from the local
# bert-base-uncased teacher directory.
model_args = AttrDict(dict)
student = BasePretrainModel(model_args)
student.network = BertLMHeadModel.from_pretrained_customized(
    'models/teachers/bert-base-uncased',
    args=model_args,
)

# Inference only: evaluation mode, on the GPU.
student.network.eval()
student.network.to("cuda")

# One forward pass without gradient tracking. The call result is unpacked into
# three values; their meaning is taken from the target names.
with torch.no_grad():
    (
        attentions_teacher,
        qkv_teacher,
        prediction_score_teacher,
    ) = student.network(
        batch,
        output_attentions=True,
        output_qkv=True,
        output_loss=False,
    )

07/28/2022 21:02:29 - INFO - pretraining.base -   Loading default tokenizer bert-base-uncased
07/28/2022 21:02:31 - INFO - pretraining.base -   Loading config from args


()
{}
()
{}


07/28/2022 21:02:32 - INFO - pretraining.modeling -   Init BERT pretrain model
07/28/2022 21:02:34 - INFO - pretraining.modeling -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "encoder_ln_mode": "post-ln",
  "fused_linear_layer": false,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "hugging_face": true,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "layer_norm_type": "pytorch",
  "layernorm_embedding": true,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "sparse_mask_prediction": false,
  "transformers_version": "4.21.0.dev0",
  "type_vocab_size": 2,
  "useLN": true,
  "use_cache": true,
  "vocab_size": 30522
}



()
{}
()
{}


07/28/2022 21:02:34 - INFO - pretraining.modeling -   Init BERT pretrain model
07/28/2022 21:02:36 - INFO - pretraining.modeling -   Loading model models/teachers/bert-base-uncased/pytorch_model.bin
07/28/2022 21:02:37 - INFO - pretraining.modeling -   loading model...
07/28/2022 21:02:37 - INFO - pretraining.modeling -   done!
07/28/2022 21:02:37 - INFO - pretraining.modeling -   Weights of BertLMHeadModel not initialized from pretrained model: ['bert.encoder.FinalLayerNorm.weight', 'bert.encoder.FinalLayerNorm.bias', 'cls.predictions.decoder.bias']
07/28/2022 21:02:37 - INFO - pretraining.modeling -   Weights from pretrained model not used in BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']


In [18]:
# Load the teacher checkpoint without project arguments and run the toy batch once.
# The forward pass is wrapped in torch.no_grad(), like the teacher forward pass
# in the previous model cell, so no autograd graph is built for a result that
# is only displayed.
# NOTE(review): `a` rebinds the name used earlier for a random tensor; cells that
# expect that tensor (e.g. `loss(a,b)`) need the tensor cell re-run first.
a=BertLMHeadModel.from_pretrained_customized('models/teachers/bert-base-uncased/', args=None)
a.to("cuda")
a.eval()
with torch.no_grad():
    teacher_only_output = a(batch)
# A bare last expression keeps the cell displaying the model output.
teacher_only_output

07/28/2022 17:24:54 - INFO - pretraining.modeling -   Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "encoder_ln_mode": "post-ln",
  "fused_linear_layer": false,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "hugging_face": true,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "layer_norm_type": "pytorch",
  "layernorm_embedding": true,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "sparse_mask_prediction": false,
  "transformers_version": "4.21.0.dev0",
  "type_vocab_size": 2,
  "useLN": true,
  "use_cache": true,
  "vocab_size": 30522
}



()
{}
()
{}


07/28/2022 17:24:55 - INFO - pretraining.modeling -   Init BERT pretrain model
07/28/2022 17:24:56 - INFO - pretraining.modeling -   Loading model models/teachers/bert-base-uncased/pytorch_model.bin
07/28/2022 17:24:57 - INFO - pretraining.modeling -   loading model...
07/28/2022 17:24:57 - INFO - pretraining.modeling -   done!
07/28/2022 17:24:57 - INFO - pretraining.modeling -   Weights of BertLMHeadModel not initialized from pretrained model: ['bert.encoder.FinalLayerNorm.weight', 'bert.encoder.FinalLayerNorm.bias', 'cls.predictions.decoder.bias']
07/28/2022 17:24:57 - INFO - pretraining.modeling -   Weights from pretrained model not used in BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']


In [1]:
# Save the wrapped network under the "bert_base" checkpoint id.
# This cell needs `student` and `model_args` from the teacher-loading cell; the
# recorded output shows a NameError from running it on a kernel where `student`
# was not defined yet.
checkpoint_id = "bert_base"
student.save_weights(
    checkpoint_id=checkpoint_id,
    output_dir="training-out/bert_base/",
    is_deepspeed=model_args.deepspeed,
)


NameError: name 'student' is not defined

In [7]:
# Display the deepspeed flag that is passed to save_weights as `is_deepspeed`.
model_args.deepspeed

False

In [None]:
# Take a copy of the network's state dict mapping.
# NOTE(review): `.copy()` on the returned mapping is presumably shallow (new mapping,
# same tensor objects) — confirm before mutating tensors in place.
state_dict=student.network.state_dict().copy()

In [None]:
# NOTE(review): `budget_to_huggingface` is not imported anywhere visible in this
# notebook — the `from pretraining.utils import budget_to_huggingface` line in the
# import cell is commented out — so this cell raises NameError on a fresh kernel
# unless that import is restored.
state_dict = budget_to_huggingface(state_dict)

In [None]:
# Display the parameter names of the converted state dict.
state_dict.keys()

In [None]:
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)

In [None]:
# Build a Hugging Face config, tokenizer and sequence-classification model for
# a two-label task ('rte').
# NOTE(review): this cell reads config_name, model_name_or_path, cache_dir,
# model_revision, use_auth_token, use_fast_tokenizer and ignore_mismatched_sizes
# from `model_args`. The argument mapping defined earlier in this notebook only
# provides tokenizer_name out of these, so `model_args` presumably has to be
# replaced by a fine-tuning argument object before this cell can run — confirm.
num_labels = 2

# Prefer an explicit config name; otherwise fall back to the model path.
config_source = model_args.config_name or model_args.model_name_or_path
config = AutoConfig.from_pretrained(
    config_source,
    num_labels=num_labels,
    finetuning_task='rte',
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)

# Same fallback rule for the tokenizer.
tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    cache_dir=model_args.cache_dir,
    use_fast=model_args.use_fast_tokenizer,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    # paths containing ".ckpt" are loaded as TensorFlow checkpoints
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
    ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
)

In [None]:
from transformers import  BertModel

In [None]:
# BertModel's constructor requires a configuration object, so `BertModel()`
# raises TypeError. Build the model from the library's default BertConfig instead;
# this gives randomly initialised weights.
# NOTE(review): if pretrained weights were intended, use BertModel.from_pretrained(...) instead.
from transformers import BertConfig

model = BertModel(BertConfig())