In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import torch
from dataset_preprocessing import TokenInfo
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import itertools
import pandas as pd
from tqdm import tqdm

## Importances

In [3]:
def get_importances():
    # print("this is wrong")
    dir = "./new_importances_data"
    imp_files = os.listdir(dir)
    imp_files = [file for file in imp_files if file.endswith(".pkl")]
    importances = {}
    for imp_file in tqdm(imp_files):
        importances.update(pd.read_pickle(f"{dir}/{imp_file}"))
    return importances

In [4]:
# imps = get_importances()

In [5]:
def get_avg_imporances(importances):
    avg_imps = [torch.zeros_like(imp) for imp in list(importances.values())[0]]
    for token, imps in tqdm(importances.items()):
        for i, layer_imps in enumerate(imps):
            avg_imps[i] += layer_imps / len(importances)
    # TODO think harder about averaging method
    return avg_imps

In [6]:
# avg_importances = get_avg_imporances(imps)

In [7]:
# pd.to_pickle(avg_importances, "./avg_importances.pkl")

In [8]:
avg_importances = pd.read_pickle("./avg_importances.pkl")

In [9]:
len(avg_importances)

24

## Model

In [10]:
model_id = "microsoft/phi-1_5"
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

In [12]:
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [13]:
# tokenizer.decode(token_info.get_prefixes(top_tokens[1000][0], 9, 10)[0])

In [14]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    # be careful with this?
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
)

## Prune Model

In [15]:
from prunners import prune_mlps_individually
from importances import get_mlps

In [16]:
mlps = get_mlps(model)

In [17]:
len(mlps), len(avg_importances)

(24, 24)

In [18]:
avg_importances = dict(zip(mlps, avg_importances))

In [19]:
prune_mlps_individually(avg_importances, 0.2)

In [20]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2048, out_features=6554, bias=True)
          (fc2): Linear(in_features=6554, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2048,), e

## Replace model modules

In [21]:
from experts import Experts, EmbeddingTokenIdxTracker, mark_only_adapters_as_trainable, prepare_as_if_peft_model, prepare_model_for_gradient_checkpointing
from importances import get_mlps
from post_training import get_lora_config, get_training_arguments



In [22]:
lora_config = get_lora_config()
training_arguments = get_training_arguments("./tmp")

In [23]:
training_arguments = prepare_as_if_peft_model(model, training_arguments, lora_config)

In [24]:
embed_tokens_new = EmbeddingTokenIdxTracker(model.get_submodule("model").get_submodule("embed_tokens"))

In [25]:
def get_layers(model):
    return model.get_submodule("model").get_submodule("layers")

In [26]:
layers = get_layers(model)

In [27]:
model.get_submodule("model").embed_tokens = embed_tokens_new

In [28]:
for i, layer in enumerate(layers):
    layer.mlp = Experts(
        model,
        layer.mlp,
        lora_config,
        i,
        embed_tokens_new.idx_tracker,
        layer.mlp.config,
    )

In [29]:
mark_only_adapters_as_trainable(model, lora_config)

In [30]:
prepare_model_for_gradient_checkpointing(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): EmbeddingTokenIdxTracker(
      (embed): Embedding(51200, 2048)
    )
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (cluster_router): ClusterRouter()
          (experts_fc1): ModuleList(
            (0-7): 8 x lora.Linear(
              (base_layer): Linear(in_features=2048, out_features=6554, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)


In [31]:
examples = ["hi this is an example", "hi this is an example"]

In [32]:
examples = torch.tensor(tokenizer.encode(examples)).view(-1, 5)

In [33]:
model.cuda()

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): EmbeddingTokenIdxTracker(
      (embed): Embedding(51200, 2048)
    )
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (cluster_router): ClusterRouter()
          (experts_fc1): ModuleList(
            (0-7): 8 x lora.Linear(
              (base_layer): Linear(in_features=2048, out_features=6554, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)


In [34]:
# _ = model(examples.cuda())

## Train Model

In [35]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer
from peft import LoraConfig
import transformers
from trl import SFTTrainer

In [36]:
dataset = get_baseline_dataset()

reading pickle


In [37]:
# batch_size = 60
# micro_batch_size = 6
# gradient_accumulation_steps = batch_size // micro_batch_size
# training_arguments = transformers.TrainingArguments(
#     per_device_train_batch_size=micro_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     warmup_steps=100,
#     num_train_epochs=2,
#     learning_rate=1e-4,
#     fp16=True,
#     logging_steps=10,
#     logging_first_step=True,
#     # optim=torch.optim,
#     evaluation_strategy="steps",
#     save_strategy="steps",
#     eval_steps=100,
#     save_steps=200,
#     output_dir="./tmp",
#     save_total_limit=20,
#     load_best_model_at_end=True,
#     ddp_find_unused_parameters=None,
#     group_by_length=False,
#     # metric_for_best_model="{}_loss".format(args.data_path),
# )

In [38]:
# model.enable_input_require_grads()

In [39]:
# Setup model for training
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Setup tokenizer for trainign
tokenizer.pad_token = tokenizer.eos_token

train_data, eval_data = dataset["train"], dataset["test"]
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    # peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    dataset_text_field="text",
    max_seq_length=1024, # tweak this
    # TODO: think harder about the datacollator
    # data_collator=transformers.DataCollatorForSeq2Seq(
    #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    # ),
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [40]:
trainer.evaluate()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 3.416672468185425,
 'eval_runtime': 213.6344,
 'eval_samples_per_second': 9.362,
 'eval_steps_per_second': 1.17}

In [41]:
trainer.train()

Step,Training Loss,Validation Loss
100,3.236,3.190022
200,3.1573,3.144821
300,3.1706,3.128407
400,3.0869,3.118995
500,3.0625,3.111629
600,3.14,3.106526
700,3.0884,3.103107
800,3.1516,3.100001
900,3.0527,3.098915
1000,3.0421,3.09885




SafetensorError: Error while serializing: IoError(Os { code: 5, kind: Uncategorized, message: "Input/output error" })

In [51]:
pd.to_pickle(trainer.state, "./tmp/trainer_state.pkl")

### Evaluation

In [60]:
from evaluation import evaluate_on_nlp_tasks

In [99]:
model.cuda();

In [101]:
model.eval();

In [102]:
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=300)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|██████████████████| 3000/3000 [20:32<00:00,  2.43it/s]
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [62]:
eval_res["results"]

{'hellaswag': {'acc,none': 0.45666666666666667,
  'acc_norm,none': 0.57,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.73,
  'acc_norm,none': 0.7666666666666667,
  'alias': 'piqa'},
 'boolq': {'acc,none': 0.6566666666666666, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.72, 'alias': 'winogrande'}}

In [63]:
eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000)

Using the latest cached version of the dataset since winogrande couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'winogrande_xl' at /home/research/robgarct/.cache/huggingface/datasets/winogrande/winogrande_xl/1.1.0/85ac5b5a3b7a930e22d590176e39460400d19e41 (last modified on Thu Feb 15 14:43:29 2024).
Using the latest cached version of the dataset since super_glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'boolq' at /home/research/robgarct/.cache/huggingface/datasets/super_glue/boolq/1.0.3/b051de3f07b5fd5ab80398a4836458db56234e24 (last modified on Thu Feb 15 14:43:34 2024).
Using the latest cached version of the dataset since piqa couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at /home/research/robgarct/.cache/huggingface/datasets/piqa/plain_text/1.1.0/2e8ac2dffd59bac8c3c6714948f4c551a0848bb0 (last modified on Thu Feb 15 14:41:08 2024).
Using the late

In [64]:
eval_res["results"]

{'hellaswag': {'acc,none': 0.428,
  'acc_norm,none': 0.545,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.743, 'acc_norm,none': 0.751, 'alias': 'piqa'},
 'boolq': {'acc,none': 0.649, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.691, 'alias': 'winogrande'}}

## Save

In [112]:
model.cpu();

In [113]:
torch.save(model.state_dict(), "./tmp/model_state_dict")

## Misc

In [95]:
model.model.layers[0].mlp.experts_fc1[0].lora_A.default.weight

Parameter containing:
tensor([[-7.8082e-03, -1.4024e-02, -6.4943e-03,  ...,  9.5486e-03,
         -4.2409e-03, -1.8877e-02],
        [-6.5831e-03, -1.5125e-02, -7.5041e-03,  ..., -1.1292e-02,
         -1.7146e-02, -1.2725e-03],
        [-1.4931e-02, -1.4116e-02, -2.2513e-02,  ...,  9.7545e-05,
          1.6355e-02, -1.1077e-02],
        ...,
        [ 9.7663e-03, -1.4400e-02, -1.8612e-02,  ...,  2.1575e-02,
          1.2617e-03, -1.5609e-02],
        [-1.0753e-02,  1.6160e-02, -1.1062e-02,  ..., -1.1271e-02,
         -1.5020e-02,  2.0665e-02],
        [ 9.4092e-03,  1.4645e-02, -1.2627e-02,  ...,  1.4613e-02,
         -1.5689e-02,  1.8180e-02]], device='cuda:0', requires_grad=True)

In [94]:
model.model.layers[1].mlp.experts_fc1[0].lora_A.default.weight

Parameter containing:
tensor([[ 0.0465, -0.0024,  0.0215,  ..., -0.0051,  0.0354, -0.0269],
        [ 0.0023, -0.0248, -0.0047,  ..., -0.0262,  0.0047, -0.0208],
        [-0.0111, -0.0166,  0.0031,  ...,  0.0067, -0.0110,  0.0097],
        ...,
        [-0.0154, -0.0031, -0.0254,  ..., -0.0294, -0.0067,  0.0031],
        [ 0.0088,  0.0296, -0.0017,  ...,  0.0020, -0.0049,  0.0049],
        [ 0.0237,  0.0075,  0.0305,  ...,  0.0317,  0.0244, -0.0186]],
       device='cuda:0', requires_grad=True)

In [96]:
model.cpu()

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): EmbeddingTokenIdxTracker(
      (embed): Embedding(51200, 2048)
    )
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (cluster_router): ClusterRouter()
          (experts_fc1): ModuleList(
            (0-7): 8 x lora.Linear(
              (base_layer): Linear(in_features=2048, out_features=6554, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)


In [98]:
with torch.no_grad():
    pd.to_pickle(model, "moe-v1.pkl")

AttributeError: Can't pickle local object 'PreTrainedModel.enable_input_require_grads.<locals>.make_inputs_require_grads'

In [58]:
list(eval_res.keys())

['results', 'configs', 'versions', 'n-shot', 'config', 'git_hash']