In [180]:
!nvidia-smi

Thu Mar 18 04:56:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 460.27.04    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  On   | 00000000:1B:00.0 Off |                  N/A |
| 38%   40C    P8    19W / 250W |   9292MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [117]:
import logging
import os
import math
import copy
import torch
import torch.nn as nn
from argparse import Namespace
from dataclasses import dataclass, field
from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM, 
    AutoTokenizer, 
    TextDataset, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    PretrainedConfig, 
    AutoConfig, 
    MODEL_FOR_MASKED_LM_MAPPING
)
from fairseq.random_feature_attention.causal_attention import CausalAttention
from fairseq.random_feature_attention.utils import load_random_matrices, sample_random_matrices
from transformers import TrainingArguments, HfArgumentParser

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [157]:
class RFASelfAttention(CausalAttention):
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False
    ):
        random_matrix = sample_random_matrices(
            num_layers=1,
            num_heads=self.num_heads,
            random_matrices=self.random_matrices,
            is_training=True)[0] if self.training else self.random_matrix_eval
        
        return (super().forward(hidden_states, random_matrix),)
    
class AutoModelRFAForMaskedLM(AutoModelForMaskedLM):
    
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, rfa_args=None, *model_args, **kwargs):
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            config, kwargs = AutoConfig.from_pretrained(
                pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs
            )

        if type(config) in MODEL_FOR_MASKED_LM_MAPPING.keys():
            model = MODEL_FOR_MASKED_LM_MAPPING[type(config)].from_pretrained(
                pretrained_model_name_or_path, *model_args, config=config, **kwargs
            )
            
            rfa_args.embed_dim = config.hidden_size
            rfa_args.num_heads = config.num_attention_heads
            rfa_args.head_dim = int(config.hidden_size / config.num_attention_heads)
            rfa_args.attn_act = config.hidden_act

            model.random_matrices = load_random_matrices(
                head_dim=rfa_args.head_dim,
                proj_dim=rfa_args.causal_proj_dim,
                dtype=torch.float32)
            
            random_matrices_eval = sample_random_matrices(
                num_layers=config.num_hidden_layers,
                num_heads=rfa_args.num_heads,
                random_matrices=model.random_matrices,
                is_training=False)
            model.random_matrices_eval = nn.Parameter(random_matrices_eval)
            
            for idx, layer in enumerate(model._modules[config.model_type].encoder.layer):
                layer.attention.self = RFASelfAttention(args=rfa_args, embed_dim=rfa_args.embed_dim, num_heads=rfa_args.num_heads, 
                                                               head_dim=rfa_args.head_dim)
                layer.attention.self.random_matrices = model.random_matrices
                layer.attention.self.random_matrix_eval = model.random_matrices_eval
            
            return model
            
        raise ValueError(
            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
            "Model type should be one of {}.".format(
                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_MASKED_LM_MAPPING.keys())
            )
        )

In [176]:
block_size = 512

def tokenize_function(examples):
    return tokenizer(examples["text"])

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
    
def pretrain_and_evaluate_load_data(args, model, tokenizer, dataset, eval_only=True, model_path="."):
    datasets = load_dataset(*dataset)
    tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["validation"],
        data_collator=data_collator,
    )
        
    eval_results = trainer.evaluate()
    logger.info(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    
    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_results = trainer.evaluate()
        logger.info(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=args.val_datapath,
                              block_size=512)
    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=args.train_datapath,
                                    block_size=512)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    trainer = Trainer(model=model, args=args, data_collator=data_collator,
                      train_dataset=train_dataset, eval_dataset=val_dataset)

    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')
    
    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')

In [177]:
@dataclass
class RFAArguments:        
    random_feature: str = field(default="rrf")
    init_scale: float = field(default=1.0)
    causal_proj_dim: int = field(default=64, metadata={"help": "projection size for rfa"})
    learned_tau: bool = field(default=False)
    norm_rescale: bool = field(default=False)
    cuda_causal_rfa: bool = field(default=False)
    use_input_gate: bool = field(default=False)

parser = HfArgumentParser((TrainingArguments, RFAArguments,))

training_args, rfa_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[
    '--output_dir', 'tmp',
    '--warmup_steps', '500',
    '--learning_rate', '0.00003',
    '--weight_decay', '0.01',
    '--adam_epsilon', '1e-6',
    '--max_steps', '3000',
    '--logging_steps', '500',
    '--save_steps', '500',
    '--max_grad_norm', '5.0',
    '--per_gpu_eval_batch_size', '8',
    '--per_gpu_train_batch_size', '2',  
    '--gradient_accumulation_steps', '32',
    '--evaluation_strategy', 'epoch',
    '--prediction_loss_only', 'True',
    '--do_train',
    '--do_eval',
])
# training_args.val_datapath = 'wikitext-103-raw/wiki.valid.raw'
# training_args.train_datapath = 'wikitext-103-raw/wiki.train.raw'

# Choose GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [178]:
model = AutoModelRFAForMaskedLM.from_pretrained('roberta-base', rfa_args=rfa_args)
tokenizer = AutoTokenizer.from_pretrained('roberta-base', use_fast=True)
config = model.config

In [179]:
pretrain_and_evaluate_load_data(training_args, model, tokenizer, ('wikitext', 'wikitext-2-raw-v1'), eval_only=False, model_path=training_args.output_dir)

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


INFO:__main__:Perplexity: 3200635.37
Some weights of the model checkpoint at tmp were not used when initializing RobertaForMaskedLM: ['random_matrices', 'random_matrices_eval', 'roberta.encoder.layer.0.attention.self.tau', 'roberta.encoder.layer.0.attention.self.random_matrices', 'roberta.encoder.layer.0.attention.self.random_matrix_eval', 'roberta.encoder.layer.0.attention.self.q_proj.weight', 'roberta.encoder.layer.0.attention.self.q_proj.bias', 'roberta.encoder.layer.0.attention.self.k_proj.weight', 'roberta.encoder.layer.0.attention.self.k_proj.bias', 'roberta.encoder.layer.0.attention.self.v_proj.weight', 'roberta.encoder.layer.0.attention.self.v_proj.bias', 'roberta.encoder.layer.0.attention.self.out_proj.weight', 'roberta.encoder.layer.0.attention.self.out_proj.bias', 'roberta.encoder.layer.1.attention.self.tau', 'roberta.encoder.layer.1.attention.self.random_matrices', 'roberta.encoder.layer.1.attention.self.random_matrix_eval', 'roberta.encoder.layer.1.attention.self.q_proj.we

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from transformers import RobertaForMaskedLM, RobertaTokenizerFast

roberta_base = RobertaForMaskedLM.from_pretrained('roberta-base')
roberta_base_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
logger.info('Evaluating roberta-base (seqlen: 512) for reference ...')
pretrain_and_evaluate_load_data(training_args, roberta_base, roberta_base_tokenizer, ('wikitext', 'wikitext-2-raw-v1'), model_path=training_args.output_dir)

In [None]:
pretrain_and_evaluate_load_data(training_args, roberta_base, roberta_base_tokenizer, ('wikitext', 'wikitext-2-raw-v1'), model_path=training_args.output_dir)