Installing dependencies and cloning the GitHub repository

In [1]:
# !rm -rf cs229-project-light
# !git clone https://github.com/Roberto09/cs229-project-light.git

In [2]:
# !mkdir tmp

In [3]:
# !mv cs229-project-light/cluster_pkl/ ./

In [4]:
# !huggingface-cli login

In [5]:
# # ! rm -rf lm-evaluation-harness
# ! cd cs229-project-light; git clone https://github.com/EleutherAI/lm-evaluation-harness
# ! cd cs229-project-light/lm-evaluation-harness; git reset --hard 4d7d2f64576205105318fd12a622b6f0b7c70464
# ! cd cs229-project-light/lm-evaluation-harness; pip install -e .

In [6]:
# !pip install datasets

In [7]:
# !pip install trl

In [8]:
# !pip install loguru

In [9]:
# !pip install accelerate -U --force-reinstall

In [10]:
# !pip install peft

In [11]:
# !pip install peft==0.7.1

In [12]:
# from peft.tuners.lora.layer import Linear
# Linear??

# Imports

In [13]:
%load_ext autoreload
%autoreload 2

In [14]:
import sys
sys.path.append("./cs229-project-light/")

In [15]:
import os
import pandas as pd
import torch
from dataset_preprocessing import TokenInfo
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import itertools
import pandas as pd
from tqdm import tqdm

# Importances

In [16]:
avg_importances = pd.read_pickle("./cs229-project-light/avg_importances.pkl")

In [17]:
len(avg_importances)

24

# Model

In [18]:
# Hugging Face Hub id of the base checkpoint loaded by this notebook.
model_id = "microsoft/phi-1_5"
# Hub commit hash passed as `revision` when the model weights are loaded, so the
# checkpoint stays fixed between runs.
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of Feb 1st (year not recorded; presumably 2024 — TODO confirm)

In [19]:
# Pin the tokenizer to the same Hub commit as the model weights. Without
# `revision`, the tokenizer files (and, because trust_remote_code=True, any
# remote code) come from whatever is latest on the Hub and can drift away
# from the pinned model.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [20]:
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [21]:
# Load the base causal LM at the pinned Hub revision. `torch_dtype` and
# `attn_implementation` are not overridden (both options are commented out below).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    # be careful with this?
    # NOTE(review): the note above refers to the two disabled options below; the
    # reason for the caution is not recorded — confirm before enabling either.
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
)

# Prune Model

In [22]:
from prunners import prune_mlps_individually
from importances import get_mlps

In [23]:
mlps = get_mlps(model)

In [24]:
len(mlps), len(avg_importances)

(24, 24)

In [25]:
# Key each importance entry by the MLP module it belongs to.
# The name `avg_importances` is reused for the resulting dict (the pruning cell
# reads it), so this cell is guarded against being run twice: zipping a second
# time would pair the modules with the dict's *keys* instead of the importances.
# NOTE(review): assumes the pickle holds a sequence (one entry per MLP), not a
# dict — confirm.
if not isinstance(avg_importances, dict):
    if len(mlps) != len(avg_importances):
        # zip() truncates silently, which would drop entries without any error.
        raise ValueError(
            f"expected one importance entry per MLP, got {len(mlps)} MLPs "
            f"and {len(avg_importances)} importance entries"
        )
    avg_importances = dict(zip(mlps, avg_importances))

In [26]:
# Prune using the module -> importance mapping built in the previous cell.
# NOTE(review): 0.2 is presumably the fraction pruned from each MLP — confirm
# against prunners.prune_mlps_individually.
prune_mlps_individually(avg_importances, 0.2)

# Dataset

In [27]:
from other_datasets import get_minipile, get_c4, get_wikitext2_filtered, get_bookcorpus, get_alpaca, QADataCollator, to_dataset
from dataset import get_baseline_dataset

In [28]:
# Baseline dataset; its "train" and "test" splits are selected further down
# as the training set and one of the evaluation sets.
tiny_text = get_baseline_dataset()
# Alpaca data (n=2000, no split), used further down as a second evaluation set.
alpaca = get_alpaca(tokenizer, n=2000, do_split=False)
# eval datasets
# Other candidate evaluation datasets, currently disabled:
# tiny_text = get_baseline_dataset()["test"]
# c4 = get_c4(n=2000, do_split=False)
# minipile = get_minipile(n=2000, do_split=False)
# wikitext = get_wikitext2_filtered(n=2000, do_split=False)
# bookcorpus = get_bookcorpus(n=2000, do_split=False)

reading pickle


[32m2024-03-09 00:51:16.210[0m | [1mINFO    [0m | [36mother_datasets[0m:[36m__init__[0m:[36m309[0m - [1mMean length of tokens per window: 114.453[0m


# Metric Callback

In [29]:
from transformers import TrainerCallback

In [30]:
from evaluation import evaluate_on_nlp_tasks

In [31]:
class AccEvalCallback(TrainerCallback):
    """Log downstream-task accuracy into the trainer state after an evaluation.

    On each `on_evaluate` event the callback runs `evaluate_on_nlp_tasks` on the
    model (with the notebook-level `tokenizer`, `limit=100`, shuffled), extracts
    the "acc,none" value of every task and appends one entry per task to
    `state.log_history`. The benchmark is run at most once per global step.
    """

    def __init__(self):
        super().__init__()
        # Global step of the most recent benchmark run; -1 means it has not run yet.
        self.last_step = -1

    def on_evaluate(self, args, state, control, model, **kwargs):
        """Run the task benchmark once per global step and log the accuracies.

        Args:
            args: training arguments (unused).
            state: trainer state; `global_step`, `epoch` and `log_history` are used.
            control: trainer control object (unused).
            model: the model to benchmark.
            **kwargs: remaining callback arguments (unused).

        Raises:
            KeyError: if a task result has no "acc,none" entry.
        """
        # Skip repeated events for the same step.
        # NOTE(review): presumably needed because the trainer is given several
        # eval datasets and fires this event once per dataset — confirm.
        if state.global_step == self.last_step:
            return
        self.last_step = state.global_step

        was_training = model.training
        previous_tqdm = os.environ.get("TQDM_DISABLE")
        model.eval()
        # Silence progress bars for the duration of the benchmark.
        os.environ["TQDM_DISABLE"] = "1"
        try:
            with torch.no_grad():
                eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=100, do_shuffle=True)["results"]
            eval_res = {k: v["acc,none"] for k, v in eval_res.items()}
            for k, v in eval_res.items():
                state.log_history.append(
                    {
                        k: v,
                        "epoch": state.epoch,
                        "step": state.global_step,
                    }
                )
            print(eval_res)
        finally:
            # Always undo the temporary changes, even if the benchmark raised:
            # put TQDM_DISABLE back to whatever it was before (or remove it) and
            # return the model to its previous train/eval mode.
            if previous_tqdm is None:
                os.environ.pop("TQDM_DISABLE", None)
            else:
                os.environ["TQDM_DISABLE"] = previous_tqdm
            model.train(was_training)

In [32]:
class EnableMLPBias(TrainerCallback):
    """Make base-layer bias parameters trainable once the trainer is initialised.

    Every parameter whose name contains both "base_layer" and "bias" gets
    `requires_grad = True`. The match is purely on the parameter name; it is
    not restricted to MLP modules.
    """

    def on_init_end(self, args, state, control, model, **kwargs):
        """Flip `requires_grad` on for all matching parameters of `model`."""
        for param_name, param in model.named_parameters():
            is_base_layer_bias = "base_layer" in param_name and "bias" in param_name
            if is_base_layer_bias:
                param.requires_grad = True

# Replace model modules

In [33]:
from experts import Experts, EmbeddingTokenIdxTracker, mark_adapters_and_routers_as_trainable, prepare_as_if_peft_model, prepare_model_for_gradient_checkpointing
from importances import get_mlps
from post_training import get_lora_config, get_training_arguments

In [34]:
lora_config = get_lora_config(r=64)
training_arguments = get_training_arguments("./tmp")

In [35]:
training_arguments = prepare_as_if_peft_model(model, training_arguments, lora_config)

In [36]:
embed_tokens_new = EmbeddingTokenIdxTracker(model.get_submodule("model").get_submodule("embed_tokens"))

In [37]:
def get_layers(model):
    """Return the `layers` submodule nested under the model's `model` submodule.

    Args:
        model: a module exposing `get_submodule`, with a "model" child that in
            turn has a "layers" child.

    Returns:
        The module found at `model.model.layers`.
    """
    backbone = model.get_submodule("model")
    layer_stack = backbone.get_submodule("layers")
    return layer_stack

In [38]:
layers = get_layers(model)

In [39]:
model.get_submodule("model").embed_tokens = embed_tokens_new

In [40]:
# Replace every decoder layer's MLP with an `Experts` module that is built
# around the MLP currently installed in that layer.
# NOTE(review): not re-runnable — executing this cell a second time would pass
# the already-created Experts module back in as `layer.mlp`. Reload the model
# before re-running.
for i, layer in enumerate(layers):
    layer.mlp = Experts(
        model,
        layer.mlp,
        lora_config,
        i,
        layer.mlp.config,
        K=2, # use mlp router (NOTE(review): K is presumably the router's top-k — confirm in experts.Experts)
        # curr_token_idx_tracker=embed_tokens_new.idx_tracker,
        cluster_init_router=False, # do not initialize mlp router
    )

In [41]:
mark_adapters_and_routers_as_trainable(model, lora_config)

In [42]:
prepare_model_for_gradient_checkpointing(model);

In [43]:
model.cuda();

In [44]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): EmbeddingTokenIdxTracker(
      (embed): Embedding(51200, 2048)
    )
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (router): TopKPerceptronRouter(
            (fc): Linear(in_features=2048, out_features=8, bias=True)
          )
          (experts_fc1): ModuleList(
            (0-7): 8 x OnlyLowRankLora(
              (orig_lora): lora.Linear(
                (base_layer): Linear(in_features=2048, out_

# Train Model

In [45]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer
from peft import LoraConfig
import transformers
from trl import SFTTrainer
from other_datasets import SFTTrainer_

In [46]:
# Setup model for training
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [47]:
# Train and evaluate on the baseline dataset's own splits.
# (Disabled alternative: minipile["train"] / minipile["test"].)
train_data = tiny_text["train"]
eval_data = tiny_text["test"]

# Named evaluation sets handed to the trainer.
# Disabled candidates: minipile, c4, wikitext, tiny_text (whole dataset), bookcorpus.
eval_datasets = {}
eval_datasets["tiny_text"] = eval_data
eval_datasets["alpaca"] = alpaca

In [48]:
callbacks = [AccEvalCallback(), EnableMLPBias()]

In [49]:
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Do not write checkpoints during training; the state dict is saved explicitly
# in the "Save" section instead.
training_arguments.save_strategy="no"
# Evaluation interval, in steps.
# NOTE(review): only effective if the evaluation strategy configured by
# get_training_arguments is step-based — confirm.
training_arguments.eval_steps = 100

In [50]:
# Build the trainer with the project-specific `SFTTrainer_` (imported from
# other_datasets). `eval_dataset` is the dict of named evaluation sets, and the
# callbacks add task-accuracy logging and enable base-layer biases.
trainer = SFTTrainer_(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_datasets,
    # peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    dataset_text_field="text",
    max_seq_length=1024, # tweak this
    # TODO: think harder about the datacollator
    # Disabled alternative collator, kept for reference:
    # data_collator=transformers.DataCollatorForSeq2Seq(
    #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    # ),
    callbacks=callbacks,
    data_collator=QADataCollator(tokenizer),
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [51]:
# trainer.evaluate(eval_datasets["tiny_text"].select(range(200)))

In [52]:
# model.model.layers[0].mlp.router.cnt

In [53]:
trainer.evaluate(eval_datasets["tiny_text"])



will shuffle dataset


Downloading data: 100%|██████████| 2.06M/2.06M [00:02<00:00, 791kB/s]
Downloading data: 100%|██████████| 118k/118k [00:00<00:00, 440kB/s]
Downloading data: 100%|██████████| 85.9k/85.9k [00:00<00:00, 318kB/s]


Generating train split:   0%|          | 0/40398 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Downloading data: 100%|██████████| 3.85M/3.85M [00:00<00:00, 7.32MB/s]
Downloading data: 100%|██████████| 1.31M/1.31M [00:00<00:00, 3.54MB/s]
Downloading data: 100%|██████████| 1.31M/1.31M [00:07<00:00, 180kB/s]


Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

Downloading data: 100%|██████████| 2.66M/2.66M [00:03<00:00, 863kB/s]
Downloading data: 100%|██████████| 502k/502k [00:00<00:00, 1.79MB/s]
Downloading data: 100%|██████████| 301k/301k [00:00<00:00, 1.04MB/s]


Generating train split:   0%|          | 0/16113 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3084 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1838 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.53k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.04M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/39905 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10042 [00:00<?, ? examples/s]

Map:   0%|          | 0/39905 [00:00<?, ? examples/s]

Map:   0%|          | 0/10042 [00:00<?, ? examples/s]

100%|██████████| 1000/1000 [06:12<00:00,  2.69it/s]


{'hellaswag': 0.39, 'piqa': 0.73, 'boolq': 0.6, 'winogrande': 0.66}


{'eval_loss': 3.4151501655578613,
 'eval_runtime': 160.8074,
 'eval_samples_per_second': 12.437,
 'eval_steps_per_second': 1.555}

In [54]:
train_res = trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
'a'

'a'

In [None]:
pd.to_pickle(trainer.state, "./tmp/trainer_state.pkl")

In [None]:
trainer_state = trainer.state
pd.DataFrame(trainer_state.log_history).tail()

# Evaluation

In [None]:
from evaluation import evaluate_on_nlp_tasks

In [None]:
model.eval();

In [None]:
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=300, do_shuffle=True)

In [None]:
eval_res["results"]

In [None]:
# Benchmark run with limit=1000, 1000 bootstrap iterations and no shuffling.
# Wrapped in torch.no_grad() for consistency with the other evaluation calls in
# this notebook, so no autograd graph is kept during inference.
with torch.no_grad():
    eval_res_orig = evaluate_on_nlp_tasks(model, tokenizer, limit=1000, bootstrap_iters=1000, do_shuffle=False)

In [None]:
eval_res_orig["results"]

In [None]:
# Benchmark run with limit=1000, 1000 bootstrap iterations and shuffling enabled.
# Wrapped in torch.no_grad() for consistency with the other evaluation calls in
# this notebook, so no autograd graph is kept during inference.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000, bootstrap_iters=1000, do_shuffle=True)

In [None]:
eval_res["results"]

# Save

In [None]:
model.cpu();

In [None]:
torch.save(model.state_dict(), "./tmp/model_state_dict")

# Stats

In [None]:
df = pd.DataFrame(trainer_state.log_history)

In [None]:
metrics_df = df[["step", "hellaswag", "piqa", "boolq", "winogrande"]]

In [None]:
metrics_df[["step", "boolq"]].dropna().set_index("step").plot()

In [None]:
metrics_df[["step", "hellaswag"]].dropna().set_index("step").plot()

In [None]:
# Plot piqa accuracy per step. The result is deliberately not assigned: binding
# the Axes returned by .plot() to `metrics_df` would overwrite the metrics
# DataFrame that the other plot cells read.
df[["step", "piqa"]].dropna().set_index("step").plot()

In [None]:
# Plot winogrande accuracy per step. The result is deliberately not assigned:
# binding the Axes returned by .plot() to `metrics_df` would overwrite the
# metrics DataFrame that the other plot cells read.
df[["step", "winogrande"]].dropna().set_index("step").plot()