In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Restrict this process to GPU 0. This is set before the cell that imports torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import os
import pandas as pd
import torch
from dataset_preprocessing import TokenInfo
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import itertools
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Importances

In [4]:
def get_importances(directory="./new_importances_data"):
    """Load every ``.pkl`` file in ``directory`` and merge the results into one dict.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the winner of a key collision between
    two pickles would differ from run to run.

    Args:
        directory: Folder containing the pickled mappings. Defaults to the path
            that used to be hard-coded in this function.

    Returns:
        dict: the union of all unpickled mappings; on duplicate keys the file
        that sorts last wins. Empty if the folder holds no ``.pkl`` files.

    Raises:
        FileNotFoundError: if ``directory`` does not exist (from ``os.listdir``).

    Note:
        ``pd.read_pickle`` can execute arbitrary code embedded in the file;
        only point this at pickles you produced yourself.
    """
    # NOTE(review): the original body carried a commented-out
    # `print("this is wrong")` here; what was considered wrong is not recorded.
    pickle_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".pkl")
    )
    importances = {}
    for pickle_name in tqdm(pickle_names):
        importances.update(pd.read_pickle(os.path.join(directory, pickle_name)))
    return importances

In [5]:
def get_avg_imporances(importances):
    """Average the per-layer importance tensors over every entry of ``importances``.

    Args:
        importances: Mapping whose values are sequences of tensors; the i-th
            tensor of every value must be addable to the i-th tensor of the
            first value. The keys are not used.

    Returns:
        list: one tensor per position, holding the arithmetic mean of that
        position across all entries. An empty mapping yields an empty list
        (previously this raised ``IndexError``).
    """
    if not importances:
        return []

    n_entries = len(importances)
    per_entry = importances.values()
    # Peek at the first entry for the accumulator shapes without copying all values.
    avg_imps = [torch.zeros_like(imp) for imp in next(iter(per_entry))]
    for imps in tqdm(per_entry):
        for i, layer_imps in enumerate(imps):
            # Each term is pre-scaled, so the running total is already the mean
            # once the loop finishes.
            avg_imps[i] += layer_imps / n_entries
    # TODO think harder about averaging method
    return avg_imps

In [6]:
# Load the averaged importances from a pickle on disk; this notebook does not call
# get_importances / get_avg_imporances itself.
# NOTE: unpickling can execute code from the file, so only load a trusted file.
avg_importances = pd.read_pickle("./avg_importances.pkl")

In [7]:
len(avg_importances)

24

## Model

In [8]:
# Hub repository and the exact commit hash passed as `revision` when loading the model.
model_id = "microsoft/phi-1_5"
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st (year not recorded - TODO add)

In [9]:
# Load the tokenizer from the same pinned commit as the model weights. With
# trust_remote_code=True an unpinned load would download and run whatever code is
# on the repository's default branch at that moment, and the tokenizer files could
# drift away from the pinned model.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)

In [10]:
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [11]:
# Load the causal LM at the pinned commit. trust_remote_code=True allows Python code
# shipped in the model repository to run; pinning `revision` keeps that code fixed.
# No torch_dtype is passed (the half-precision / flash-attention options below are
# left disabled).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    # be careful with this?
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
)

## Prune Model

In [12]:
from prunners import prune_mlps_individually
from importances import get_mlps

In [13]:
mlps = get_mlps(model)

In [14]:
len(mlps), len(avg_importances)

(24, 24)

In [15]:
# Pair each MLP module with its importance entry (position i of `avg_importances`
# goes to position i of `mlps`; assumes both are in layer order - TODO confirm).
# The isinstance guard makes the cell safe to re-run: zipping `mlps` with an
# already-built dict would iterate the dict's keys and map every MLP to an MLP.
if not isinstance(avg_importances, dict):
    if len(mlps) != len(avg_importances):
        # zip() would silently drop the unmatched tail.
        raise ValueError(
            f"expected one importance entry per MLP, got {len(avg_importances)} "
            f"entries for {len(mlps)} MLPs"
        )
    avg_importances = dict(zip(mlps, avg_importances))

In [16]:
prune_mlps_individually(avg_importances, 0.2)

In [17]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2048, out_features=6554, bias=True)
          (fc2): Linear(in_features=6554, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2048,), e

## Replace model modules

In [18]:
from experts import Experts, mark_adapters_and_routers_as_trainable, prepare_as_if_peft_model, prepare_model_for_gradient_checkpointing
from importances import get_mlps
from post_training import get_lora_config, get_training_arguments

In [19]:
lora_config = get_lora_config()
training_arguments = get_training_arguments("./tmp")

In [20]:
training_arguments = prepare_as_if_peft_model(model, training_arguments, lora_config)

In [21]:
def get_layers(model):
    """Return the submodule registered at ``model.model.layers``.

    For the model printed in this notebook that is the ``ModuleList`` of
    decoder layers.
    """
    return model.get_submodule("model.layers")

In [22]:
def init_experts(model, K=2, output_name='moe_20_mlp', store_outputs=True):
    """Replace every decoder layer's ``mlp`` with an ``Experts`` module built from it.

    The function mutates ``model`` in place and reads ``lora_config`` from the
    notebook's global namespace. Layers whose ``mlp`` is already an ``Experts``
    instance are skipped, so re-running the cell does not wrap an ``Experts``
    module inside another one.

    Args:
        model: Model whose layers are reachable through ``get_layers``.
        K: Forwarded to ``Experts`` as ``K`` (presumably the number of experts
            the router selects per token - confirm against ``Experts``).
        output_name: Forwarded to ``Experts`` as ``output_name``.
        store_outputs: Forwarded to ``Experts`` as ``store_outputs``. Original
            note: store outputs in eval, need to revert for test.

    Returns:
        None.
    """
    for layer_idx, layer in enumerate(get_layers(model)):
        original_mlp = layer.mlp
        if isinstance(original_mlp, Experts):
            # Already converted by an earlier call; leave it untouched.
            continue
        layer.mlp = Experts(
            model,
            original_mlp,
            lora_config,
            layer_idx,
            original_mlp.config,
            K=K,
            output_name=output_name,
            store_outputs=store_outputs,
        )

In [23]:
init_experts(model)

In [24]:
#layers = get_layers(model)

In [25]:
mark_adapters_and_routers_as_trainable(model, lora_config)

In [26]:
prepare_model_for_gradient_checkpointing(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (router): TopKPerceptronRouter(
            (fc): Linear(in_features=2048, out_features=8, bias=True)
          )
          (experts_fc1): ModuleList(
            (0-7): 8 x lora.Linear(
              (base_layer): Linear(in_features=2048, out_features=6554, bias=True)
              (lora_dropout): ModuleDict(
                (default)

In [27]:
examples = ["hi this is an example", "hi this is an example"]

In [28]:
# Tokenize the sentences as a batch: calling the tokenizer on a list of strings
# returns one row of ids per sentence, so the result no longer depends on reshaping
# with a hard-coded length of 5 tokens (the previous `encode(list)` call is not a
# batch API). The two sentences are identical, hence equally long, so no padding is
# needed here (the pad token is only assigned later, in the training-setup cell).
examples = tokenizer(examples, return_tensors="pt")["input_ids"]

In [29]:
model.cuda()

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (router): TopKPerceptronRouter(
            (fc): Linear(in_features=2048, out_features=8, bias=True)
          )
          (experts_fc1): ModuleList(
            (0-7): 8 x lora.Linear(
              (base_layer): Linear(in_features=2048, out_features=6554, bias=True)
              (lora_dropout): ModuleDict(
                (default)

## Train Model

In [30]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer
from peft import LoraConfig
import transformers
from trl import SFTTrainer

In [31]:
dataset = get_baseline_dataset()

reading pickle


In [32]:
# batch_size = 60
# micro_batch_size = 6
# gradient_accumulation_steps = batch_size // micro_batch_size
# training_arguments = transformers.TrainingArguments(
#     per_device_train_batch_size=micro_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     warmup_steps=100,
#     num_train_epochs=2,
#     learning_rate=1e-4,
#     fp16=True,
#     logging_steps=10,
#     logging_first_step=True,
#     # optim=torch.optim,
#     evaluation_strategy="steps",
#     save_strategy="steps",
#     eval_steps=100,
#     save_steps=200,
#     output_dir="./tmp",
#     save_total_limit=20,
#     load_best_model_at_end=True,
#     ddp_find_unused_parameters=None,
#     group_by_length=False,
#     # metric_for_best_model="{}_loss".format(args.data_path),
# )

In [33]:
# model.enable_input_require_grads()

In [34]:
# Override the save strategy on the arguments object returned by the helpers above;
# the model's state dict is written explicitly in the "Save" section instead.
training_arguments.save_strategy="no"

In [35]:
# Setup model for training: turn off the generation KV cache and switch on
# gradient checkpointing.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Setup tokenizer for training: reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Build the trainer on the "train" / "test" splits of the dataset loaded above.
train_data, eval_data = dataset["train"], dataset["test"]
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    # peft_config is left disabled; presumably because the adapters were already
    # attached by the "Replace model modules" cells - confirm.
    # peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    dataset_text_field="text",
    max_seq_length=1024, # tweak this
    # TODO: think harder about the datacollator
    # data_collator=transformers.DataCollatorForSeq2Seq(
    #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    # ),
)

Map: 100%|██████████| 50000/50000 [00:22<00:00, 2249.34 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2225.12 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [36]:
"""print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.numel()} parameters")"""

'print("Trainable parameters:")\nfor name, param in model.named_parameters():\n    if param.requires_grad:\n        print(f"{name}: {param.numel()} parameters")'

In [37]:
trainer.evaluate()

[34m[1mwandb[0m: Currently logged in as: [33mandriv[0m ([33mandriai[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 3.497535228729248,
 'eval_runtime': 294.8665,
 'eval_samples_per_second': 6.783,
 'eval_steps_per_second': 0.848}

In [38]:
def print_expert_stats(model):
    """Print, layer by layer, whatever each ``mlp.expand_expert_stats()`` returns."""
    for layer_idx, decoder_layer in enumerate(get_layers(model)):
        stats = decoder_layer.mlp.expand_expert_stats()
        print(f'layer:{layer_idx}, initial_distribution of embeddings to experts:\n {stats}\n')

In [39]:
print_expert_stats(model)

layer:0, initial_distribution of embeddings to experts:
 {0: (376471, 0.3165251300274457, 0.32638070104866007), 1: (14467, 0.19200466924574935, 0.012542133662542306), 2: (382543, 0.5960781258468969, 0.3316448080230816), 3: (24089, 0.2440519366536918, 0.020883905287687955), 4: (31445, 0.2530766414636875, 0.027261173223103813), 5: (23027, 0.20391130103639182, 0.01996320673583754), 6: (720210, 0.22261037494570812, 0.6243844670698552), 7: (734692, 0.1955460749252302, 0.6369396049492315)}

layer:1, initial_distribution of embeddings to experts:
 {0: (934085, 0.1956401278465879, 0.809802925428619), 1: (32457, 0.24338382158033486, 0.02813852438550741), 2: (82140, 0.2318312082611095, 0.07121109138323253), 3: (32285, 0.6012468226871306, 0.027989409365810354), 4: (42425, 0.25234384911019575, 0.03678025994562503), 5: (1040899, 0.6934197072749734, 0.9024050865560672), 6: (21955, 0.4099594221547316, 0.019033838706097764), 7: (120698, 0.4332449520376379, 0.10463886422904067)}

layer:2, initial_distr

In [40]:
def reset_all_expert_stats(model):
    """Call ``reset_expert_stats()`` on the ``mlp`` of every layer, in layer order."""
    for decoder_layer in get_layers(model):
        decoder_layer.mlp.reset_expert_stats()

In [41]:
def dump_all_expert_stats(model):
    """Call ``dump_expert_stats()`` on the ``mlp`` of every layer, in layer order."""
    for decoder_layer in get_layers(model):
        decoder_layer.mlp.dump_expert_stats()

In [42]:
dump_all_expert_stats(model)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# Checkpoint saving is disabled for this run, so nothing guarantees that ./tmp
# exists yet; create it first so a finished training run cannot fail on the write.
os.makedirs("./tmp", exist_ok=True)
pd.to_pickle(trainer.state, "./tmp/trainer_state_20.pkl")

### Evaluation

In [None]:
from evaluation import evaluate_on_nlp_tasks

In [None]:
model.cuda();

In [None]:
model.eval();

In [None]:
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=300)

In [None]:
eval_res["results"]

In [None]:
# Evaluation only needs forward passes; disable autograd as the limit=300 run does,
# so no graph is kept for the trainable adapter/router parameters.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000)

In [None]:
eval_res["results"]

## Save

In [None]:
model.cpu();

In [None]:
# Make sure the target folder exists before writing the weights (checkpoint saving
# is disabled for this run, so ./tmp may not have been created).
os.makedirs("./tmp", exist_ok=True)
torch.save(model.state_dict(), "./tmp/model_state_dict_20")

## Misc

In [None]:
model.model.layers[0].mlp.experts_fc1[0].lora_A.default.weight

In [None]:
model.model.layers[1].mlp.experts_fc1[0].lora_A.default.weight

In [None]:
model.cpu()