<a href="https://colab.research.google.com/github/NotDachun/huggingface-ark/blob/main/convert_model_to_RFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Uncomment to run this notebook in Google Colab
import os
from getpass import getpass
import urllib

user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format

cmd_string = 'git clone https://{0}:{1}@github.com/haopeng-uw/RFA.git'.format(user, password)

!$cmd_string
cmd_string, password = "", "" # removing the password from the variable

User name: notdachun
Password: ··········
fatal: destination path 'RFA' already exists and is not an empty directory.


In [3]:
# Uncomment to run this notebook in Google Colab
# Need to manually remove dataclasses from setup.py, otherwise errors out
%cd /content/RFA
!pip install --editable ./

%cd /content/RFA/fairseq/random_feature_attention
!python setup.py install

%cd /content/RFA/fairseq/linfa
!python setup.py install

/content/RFA
Obtaining file:///content/RFA
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Installing collected packages: fairseq
  Found existing installation: fairseq 1.0.0a0+740a608
    Can't uninstall 'fairseq'. No files were found to uninstall.
  Running setup.py develop for fairseq
Successfully installed fairseq


In [1]:
# Uncomment to run this notebook in Google Colab
!pip install transformers datasets



In [6]:
!nvidia-smi

Sun Jun 20 16:03:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
import logging
import os
import math
import copy
import torch
import torch.nn as nn
from argparse import Namespace
from dataclasses import dataclass, field
from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM, 
    AutoTokenizer, 
    RobertaForMaskedLM,
    TextDataset, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    PretrainedConfig, 
    AutoConfig, 
    MODEL_FOR_MASKED_LM_MAPPING
)
from fairseq.random_feature_attention.causal_attention import CausalAttention
from fairseq.random_feature_attention.utils import load_random_matrices, sample_random_matrices
from transformers import TrainingArguments, HfArgumentParser

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [13]:
class RFASelfAttention(CausalAttention):
    def __init__(self, args, embed_dim, num_heads, head_dim):
        super().__init__(args=args, embed_dim=embed_dim, num_heads=num_heads, head_dim=head_dim)
        self.random_matrices = None
        self.random_matrix_eval = None
    
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False
    ):
        random_matrix = sample_random_matrices(
            num_layers=1,
            num_heads=self.num_heads,
            random_matrices=self.random_matrices,
            is_training=True)[0] if self.training else self.random_matrix_eval
        
        return (super().forward(hidden_states, random_matrix),)
    
# class RobertaRFAForMaskedLM(RobertaForMaskedLM):
#     def __init__(self, config, rfa_args=None):
#         super().__init__(config)
#         rfa_args.embed_dim = config.hidden_size
#         rfa_args.num_heads = config.num_attention_heads
#         rfa_args.head_dim = int(config.hidden_size / config.num_attention_heads)
#         rfa_args.attn_act = config.hidden_act
        
#         for layer in self.roberta.encoder.layer:                
#             layer.attention.self = RFASelfAttention(args=rfa_args, embed_dim=rfa_args.embed_dim, num_heads=rfa_args.num_heads, 
#                                                                head_dim=rfa_args.head_dim)
    
class AutoModelRFAForMaskedLM(AutoModelForMaskedLM):
    
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, rfa_args=None, *model_args, **kwargs):
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            config, kwargs = AutoConfig.from_pretrained(
                pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs
            )
        
            model = MODEL_FOR_MASKED_LM_MAPPING[type(config)]
    
            rfa_args.embed_dim = config.hidden_size
            rfa_args.num_heads = config.num_attention_heads
            rfa_args.head_dim = int(config.hidden_size / config.num_attention_heads)
            rfa_args.attn_act = config.hidden_act

            model.random_matrices = load_random_matrices(
                head_dim=rfa_args.head_dim,
                proj_dim=rfa_args.causal_proj_dim,
                dtype=torch.float32)
            
            random_matrices_eval = sample_random_matrices(
                num_layers=config.num_hidden_layers,
                num_heads=rfa_args.num_heads,
                random_matrices=model.random_matrices,
                is_training=False)
            model.random_matrices_eval = nn.Parameter(random_matrices_eval)
            
            for idx, layer in enumerate(model._modules[config.model_type].encoder.layer):
                rfa_self_attn = RFASelfAttention(args=rfa_args, embed_dim=rfa_args.embed_dim, num_heads=rfa_args.num_heads, 
                                                               head_dim=rfa_args.head_dim)
                rfa_self_attn.random_matrices = model.random_matrices
                rfa_self_attn.random_matrix_eval = model.random_matrices_eval
                
                layer.attention.self = rfa_self_attn
            
            if type(config) in MODEL_FOR_MASKED_LM_MAPPING.keys():
                model.from_pretrained(
                    pretrained_model_name_or_path, *model_args, config=config, **kwargs
                )
            return model
            
        raise ValueError(
            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
            "Model type should be one of {}.".format(
                config.__class__, cls.__name__, ", ".join(c.__name__ for c in MODEL_FOR_MASKED_LM_MAPPING.keys())
            )
        )

In [9]:
block_size = 512

def tokenize_function(examples):
    return tokenizer(examples["text"])

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
    
def pretrain_and_evaluate_load_data(args, model, tokenizer, dataset, eval_only=True, model_path="."):
    datasets = load_dataset(*dataset)
    tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["validation"],
        data_collator=data_collator,
    )
        
    eval_results = trainer.evaluate()
    logger.info(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    
    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_results = trainer.evaluate()
        logger.info(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=args.val_datapath,
                              block_size=512)
    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=args.train_datapath,
                                    block_size=512)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    trainer = Trainer(model=model, args=args, data_collator=data_collator,
                      train_dataset=train_dataset, eval_dataset=val_dataset)

    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')
    
    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')

In [10]:
@dataclass
class RFAArguments:        
    random_feature: str = field(default="rrf")
    init_scale: float = field(default=1.0)
    # experiment with different dim
    causal_proj_dim: int = field(default=64, metadata={"help": "projection size for rfa"}) 
    learned_tau: bool = field(default=False)
        
    # experiment with true
    norm_rescale: bool = field(default=False)
    cuda_causal_rfa: bool = field(default=False)
        
    # experiment with true
    use_input_gate: bool = field(default=False)

parser = HfArgumentParser((TrainingArguments, RFAArguments,))

training_args, rfa_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[
    '--output_dir', 'tmp',
    '--warmup_steps', '500',
    '--learning_rate', '0.00003',
    '--weight_decay', '0.01',
    '--adam_epsilon', '1e-6',
    '--max_steps', '3000',
    '--logging_steps', '500',
    '--save_steps', '500',
    '--max_grad_norm', '5.0',
    '--per_gpu_eval_batch_size', '8',
    '--per_gpu_train_batch_size', '2',  
    '--gradient_accumulation_steps', '32',
    '--evaluation_strategy', 'epoch',
    '--prediction_loss_only', 'True',
    '--do_train',
    '--do_eval',
])
# training_args.val_datapath = 'wikitext-103-raw/wiki.valid.raw'
# training_args.train_datapath = 'wikitext-103-raw/wiki.train.raw'

# Choose GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [11]:
model = AutoModelRFAForMaskedLM.from_pretrained('roberta-base', rfa_args=rfa_args, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained('roberta-base', use_fast=True, local_files_only=True)
config = model.config

Cannot find the requested files in the cached path and outgoing traffic has been disabled. To enable model look-ups and downloads online, set 'local_files_only' to False.


OSError: ignored

In [None]:
model.state_dict()

In [None]:
model.save_pretraine

In [None]:
ls

In [None]:
model.save_pretrained("roberta-test")

In [None]:
model.__dict__

In [None]:
AutoModelRFAForMaskedLM.from_pretrained("roberta-test")

In [None]:
from transformers import RobertaConfig
model = RobertaRFAForMaskedLM.from_pretrained("roberta-test", rfa_args=rfa_args)

In [None]:
model.state_dict()

In [None]:
model.__dict__

In [None]:
model.from_pretrained("roberta-test", rfa_args=rfa_args)

In [None]:
model.load_state_dict(torch.load("roberta-test/pytorch_model.bin"))

In [None]:
pretrain_and_evaluate_load_data(training_args, model, tokenizer, ('wikitext', 'wikitext-2-raw-v1'), eval_only=False, model_path="roberta-rfa")

In [None]:
from transformers import RobertaForMaskedLM, RobertaTokenizerFast

roberta_base = RobertaForMaskedLM.from_pretrained('roberta-base', local_files_only=True)
roberta_base_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', local_files_only=True)
logger.info('Evaluating roberta-base (seqlen: 512) for reference ...')
pretrain_and_evaluate_load_data(training_args, roberta_base, roberta_base_tokenizer, ('wikitext', 'wikitext-2-raw-v1'), model_path=training_args.output_dir)

In [None]:
pretrain_and_evaluate_load_data(training_args, roberta_base, roberta_base_tokenizer, ('wikitext', 'wikitext-2-raw-v1'), eval_only=False, model_path="roberta-baseline")

1. Get the roberta data
2. Figure out how to fix the configuration to use RFA