In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Expose only the GPU with index 1 to this process; this is set here, before torch is imported in a later cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
# Experiment knobs for this run; both values also appear in the output file names under ./tmp.
# pruning_ratio is a percentage: it is divided by 100 before being handed to prune_mlps.
pruning_ratio = 20 # edit
# Selects which pruning function is applied to the MLPs further down.
pruning_method = "magnitude" # edit, can also be "random" 

In [4]:
import os
import pandas as pd
import torch
from dataset_preprocessing import TokenInfo
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import itertools
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Importances

## Model

In [5]:
# Hugging Face Hub identifier of the model to prune, plus the exact commit passed as `revision` when loading it.
model_id = "microsoft/phi-1_5"
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st

In [6]:
# Pin the tokenizer to the same commit as the model: with trust_remote_code=True an
# unpinned load would fetch (and run) whatever the repository's latest revision contains.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)

In [7]:
# Size of the tokenizer's token -> id mapping. The displayed value (50295) is smaller than the
# 51200-row embedding table shown in the model printout further down.
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [8]:
# Load the causal LM at the pinned revision. trust_remote_code=True lets the repository's own
# Python code run during loading. No dtype or attention implementation is requested: the
# half-precision and flash-attention options below are deliberately left disabled.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    # be careful with this?
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
)

## Prune Model

In [9]:
from baseline_pruners import prune_mlps, prune_mlp_magnitude, prune_mlp_random
from importances import get_mlps

In [10]:
# Collect the MLP sub-modules to prune via the project helper
# (see importances.get_mlps for exactly what is returned).
mlps = get_mlps(model)

In [11]:
# Map each supported pruning_method name to its pruning function and fail loudly on an
# unknown name, instead of silently falling back to magnitude pruning on a typo.
pruning_fns = {"random": prune_mlp_random, "magnitude": prune_mlp_magnitude}
if pruning_method not in pruning_fns:
    raise ValueError(
        f"Unknown pruning_method {pruning_method!r}; expected one of {sorted(pruning_fns)}"
    )
pruning_fn = pruning_fns[pruning_method]

In [12]:
# pruning_ratio is a percentage, so convert it to a fraction for prune_mlps.
# NOTE(review): this appears to modify the model's MLPs in place (the printout below shows
# fc1/fc2 with 6554 hidden units); re-running this cell without reloading the model would
# presumably prune a second time — confirm.
prune_mlps(mlps, pruning_ratio/100, pruning_fn)

In [13]:
# Print the module tree to inspect the layer shapes after pruning.
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2048, out_features=6554, bias=True)
          (fc2): Linear(in_features=6554, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2048,), e

## Metric Callback

In [14]:
from transformers import TrainerCallback

In [15]:
from evaluation import evaluate_on_nlp_tasks

In [16]:
class AccEvalCallback(TrainerCallback):
    """Log downstream-task accuracy into the trainer's log history on each evaluation.

    On every `on_evaluate` event for a new global step, the model is evaluated with
    `evaluate_on_nlp_tasks` (limit=100, shuffled) and each task's "acc,none" value is
    appended to `state.log_history` as its own entry, tagged with the epoch and step.
    Relies on the notebook-level `tokenizer` and `evaluate_on_nlp_tasks`.
    """

    def __init__(self):
        super().__init__()
        # Global step of the last evaluation handled; -1 means none has run yet.
        self.last_step=-1

    def on_evaluate(self, args, state, control, model, **kwargs):
        """Evaluate `model` on the NLP tasks once per global step and log the accuracies.

        The model's training/eval mode and the TQDM_DISABLE environment variable are
        restored to their previous values even when the evaluation raises.
        """
        # Skip repeated evaluate events for a step that was already handled.
        if state.global_step == self.last_step:
            return
        self.last_step = state.global_step

        was_training = model.training
        previous_tqdm_disable = os.environ.get("TQDM_DISABLE")
        model.eval()
        # Silence progress bars for the duration of the task evaluation.
        os.environ["TQDM_DISABLE"] = "1"
        try:
            with torch.no_grad():
                eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=100, do_shuffle=True)["results"]
            eval_res = {k:v["acc,none"] for k,v in eval_res.items()}
            # One log entry per task, so each metric ends up in its own log_history row.
            for k, v in eval_res.items():
                state.log_history.append(
                    {
                        k:v,
                        "epoch":state.epoch,
                        "step":state.global_step,
                    }
                )
            print(eval_res)
        finally:
            # Put the environment variable back exactly as it was found.
            if previous_tqdm_disable is None:
                os.environ.pop("TQDM_DISABLE", None)
            else:
                os.environ["TQDM_DISABLE"] = previous_tqdm_disable
            model.train(was_training)

In [17]:
class SaveCallback(TrainerCallback):
    """Write the model's state dict to a fixed path on each evaluation.

    The same file is overwritten every time, so it always holds the most recent snapshot.
    Saving is best-effort: a failure is printed and training continues.
    """

    def __init__(self, save_path):
        """Remember `save_path`, the file the state dict is written to."""
        super().__init__()
        self.save_path = save_path
        # Global step of the last save attempt; -1 means none has happened yet.
        self.last_step=-1

    def on_evaluate(self, args, state, control, model, **kwargs):
        """Save `model.state_dict()` to `self.save_path`, at most once per global step."""
        if state.global_step == self.last_step:
            return
        self.last_step = state.global_step
        try:
            # torch.save does not create missing directories, so make sure the parent exists.
            parent_dir = os.path.dirname(self.save_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            torch.save(model.state_dict(), self.save_path)
        except Exception as e:
            # Deliberately non-fatal: a failed snapshot must not abort the training run.
            print(f"error saving {e}")

## Train model

In [18]:
from peft import LoraConfig, PeftConfig
import transformers

In [19]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer

In [20]:
# Build the LoRA config and the training arguments from the project helpers.
# NOTE(review): "./tmp" is presumably used as the trainer's output directory — confirm in post_training.
lora_config = get_lora_config()
training_arguments = get_training_arguments("./tmp")

In [21]:
# Move the model to the GPU; the trailing ';' suppresses the cell's output.
model.cuda();

In [22]:
# Model settings applied before fine-tuning: switch off config.use_cache, set pretraining_tp to 1,
# and enable gradient checkpointing.
# NOTE(review): use_cache is typically disabled alongside gradient checkpointing — confirm that is the intent.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [23]:
# Load the project's baseline dataset and take its "train" and "test" splits;
# the "test" split is what the trainer uses as its evaluation set.
dataset = get_baseline_dataset()
train_data, eval_data = dataset["train"], dataset["test"]

reading pickle


In [24]:
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Override two training arguments after construction: turn off the trainer's own checkpoint
# saving (SaveCallback writes the state dict instead) and set eval_steps to 100.
# NOTE(review): eval_steps presumably only matters if the evaluation strategy set in
# get_training_arguments is step-based — confirm.
training_arguments.save_strategy="no"
training_arguments.eval_steps = 100

In [25]:
# File that SaveCallback overwrites with the model state dict on each evaluation;
# the final save cell near the end of the notebook writes to the same path.
save_path = f"./tmp/{pruning_method}_{pruning_ratio}_state_dict"
callbacks = [AccEvalCallback(), SaveCallback(save_path)]

In [26]:
# Supervised fine-tuning of the pruned model with the LoRA config passed as peft_config.
# Training text comes from the "text" field, without sequence packing and with
# max_seq_length set to 1024. The two callbacks add task-accuracy logging and
# state-dict saving on each evaluation.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    dataset_text_field="text",
    max_seq_length=1024, # tweak this
    # TODO: think harder about the datacollator
    # data_collator=transformers.DataCollatorForSeq2Seq(
    #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    # ),
    callbacks=callbacks,
)

Map: 100%|██████████| 50000/50000 [00:27<00:00, 1806.43 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1743.68 examples/s]


## Before training performance evaluation

In [27]:
from evaluation import evaluate_on_nlp_tasks

In [None]:
# Baseline evaluation on the eval split before any training step.
# NOTE(review): this presumably also fires the on_evaluate callbacks registered on the trainer — confirm.
trainer.evaluate()

[34m[1mwandb[0m: Currently logged in as: [33mandriv[0m ([33mandriai[0m). Use [1m`wandb login --relogin`[0m to force relogin




will shuffle dataset


In [None]:
# Switch to eval mode for the manual benchmark runs that follow.
model.eval()

In [None]:
# Pre-training benchmark on the unshuffled sample (limit=1000, 1000 bootstrap iterations).
# Run under no_grad, matching AccEvalCallback and the limit=300 evaluation cell, so that
# no autograd state is kept during inference.
with torch.no_grad():
    eval_res_orig = evaluate_on_nlp_tasks(model, tokenizer, limit=1000, bootstrap_iters=1000, do_shuffle=False)

In [None]:
# Per-task results of the unshuffled pre-training benchmark.
eval_res_orig["results"]

In [None]:
# Pre-training benchmark on the shuffled sample (limit=1000, 1000 bootstrap iterations).
# Run under no_grad, matching AccEvalCallback and the limit=300 evaluation cell, so that
# no autograd state is kept during inference.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000, bootstrap_iters=1000, do_shuffle=True)

In [None]:
# Per-task results of the shuffled pre-training benchmark.
eval_res["results"]

## Training

In [None]:
# Back to training mode before fine-tuning (the evaluation cells above put the model in eval mode).
model.train()

In [None]:
# Run the fine-tuning; whatever trainer.train() returns is kept in train_res.
train_res = trainer.train()

In [None]:
# Persist the whole trainer state (its log_history holds the accuracies appended by
# AccEvalCallback) so the curves can be analysed later without retraining.
pd.to_pickle(trainer.state, f"./tmp/{pruning_method}_{pruning_ratio}.pkl")

In [None]:
# Keep a handle on the trainer state and show the most recent log entries.
trainer_state = trainer.state
pd.DataFrame(trainer_state.log_history).tail()

## Post-training performance evaluation:

In [None]:
# Validation-loss curve: keep only the log rows that carry an eval_loss, indexed by step.
valid_loss = (
    pd.DataFrame(trainer_state.log_history)
    [["step", "eval_loss"]]
    .set_index("step")
    .dropna()
)

In [None]:
# Show the eval_loss table (one row per logged evaluation step).
valid_loss

In [None]:
# Plot eval_loss against training step.
valid_loss.plot()

In [None]:
from evaluation import evaluate_on_nlp_tasks

In [None]:
# Eval mode for the post-training benchmarks; the trailing ';' suppresses the cell's output.
model.eval();

In [None]:
# Quick post-training check on a smaller shuffled sample (limit=300), without gradient tracking.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=300, do_shuffle=True)

In [None]:
# Per-task results of the quick post-training check.
eval_res["results"]

In [None]:
# Post-training benchmark on the unshuffled sample (limit=1000, 1000 bootstrap iterations).
# NOTE: this rebinds eval_res_orig, so the pre-training results stored under that name are replaced.
# Run under no_grad, matching AccEvalCallback and the limit=300 evaluation cell.
with torch.no_grad():
    eval_res_orig = evaluate_on_nlp_tasks(model, tokenizer, limit=1000, bootstrap_iters=1000, do_shuffle=False)

In [None]:
# Per-task results of the unshuffled post-training benchmark.
eval_res_orig["results"]

In [None]:
# Post-training benchmark on the shuffled sample (limit=1000, 1000 bootstrap iterations).
# Run under no_grad, matching AccEvalCallback and the limit=300 evaluation cell.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000, bootstrap_iters=1000, do_shuffle=True)

In [None]:
# Per-task results of the shuffled post-training benchmark.
eval_res["results"]

# Save

In [None]:
# Move the weights to the CPU before the final save; the trailing ';' suppresses the cell's output.
model.cpu();

In [None]:
# Final save to the same path SaveCallback writes to, replacing the last in-training snapshot.
# NOTE(review): if the trainer attached LoRA adapters to `model` in place, this state dict
# presumably includes the adapter weights — confirm before reloading it into a plain model.
torch.save(model.state_dict(), save_path)

# Stats

In [None]:
import pandas as pd

In [None]:
# Reload the trainer state pickled after training. It is written under ./tmp, so read it
# from that same location (a different directory here breaks a top-to-bottom run).
# NOTE: unpickling executes code from the file — only load pickles produced by this notebook.
trainer_state = pd.read_pickle(f"./tmp/{pruning_method}_{pruning_ratio}.pkl")

In [None]:
# One row per log entry; columns are the union of the keys found across all entries.
df = pd.DataFrame(trainer_state.log_history)

In [None]:
# Keep the step plus the four task-accuracy columns.
# NOTE(review): the column names are presumably the task names returned by evaluate_on_nlp_tasks — confirm.
metrics_df = df[["step", "hellaswag", "piqa", "boolq", "winogrande"]]

In [None]:
# AccEvalCallback logs each task in its own row, so forward-fill to line the metrics up
# before looking at the latest values.
metrics_df.ffill().tail()

In [None]:
# BoolQ accuracy per step (rows without a boolq value are dropped).
metrics_df[["step", "boolq"]].dropna().set_index("step").plot()

In [None]:
# HellaSwag accuracy per step (rows without a hellaswag value are dropped).
metrics_df[["step", "hellaswag"]].dropna().set_index("step").plot()

In [None]:
# PIQA accuracy per step. The result is intentionally not assigned: .plot() returns an Axes,
# and binding it to metrics_df would overwrite the metrics table used by the cells above.
df[["step", "piqa"]].dropna().set_index("step").plot();

In [None]:
# WinoGrande accuracy per step. The result is intentionally not assigned: .plot() returns an
# Axes, and binding it to metrics_df would overwrite the metrics table used by the cells above.
df[["step", "winogrande"]].dropna().set_index("step").plot();