In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import torch
from dataset_preprocessing import TokenInfo
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import itertools
import pandas as pd
from tqdm import tqdm

## Importances

In [3]:
def get_importances(data_dir="./new_importances_data"):
    """Load and merge every pickled importance shard found in ``data_dir``.

    Each ``*.pkl`` file is expected to unpickle to a mapping; the shards are
    merged into a single dict with ``dict.update``. Shards are read in sorted
    filename order so that, when two shards contain the same key, the winner
    is deterministic (the alphabetically last file) instead of depending on
    the arbitrary order returned by ``os.listdir``.

    Args:
        data_dir: Directory containing the ``.pkl`` shards. Defaults to the
            directory used by this notebook.

    Returns:
        dict: The union of all shard mappings.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code; only point this at
    # shards produced by this project.
    shard_names = sorted(
        name for name in os.listdir(data_dir) if name.endswith(".pkl")
    )
    importances = {}
    for shard_name in tqdm(shard_names):
        importances.update(pd.read_pickle(os.path.join(data_dir, shard_name)))
    return importances

In [4]:
# imps = get_importances()

In [5]:
def get_avg_imporances(importances):
    """Average the per-layer importance tensors over all entries.

    Args:
        importances: Mapping whose values are sequences of tensors, one tensor
            per layer. Only the values are used; every entry is expected to
            have the same number of layers and matching tensor shapes as the
            first entry.

    Returns:
        list: One tensor per layer holding the arithmetic mean of that layer's
        tensors across all entries. Floating-point (and complex) inputs keep
        their dtype; integer/bool inputs are averaged in torch's default
        floating dtype.

    Raises:
        ValueError: If ``importances`` is empty (there is nothing to average).
    """
    n_entries = len(importances)
    if n_entries == 0:
        raise ValueError("cannot average an empty importances mapping")

    def _accumulator_dtype(tensor):
        # An integer accumulator cannot absorb the fractional contributions
        # added below (the in-place add would raise), so fall back to float.
        if tensor.is_floating_point() or tensor.is_complex():
            return tensor.dtype
        return torch.get_default_dtype()

    first_entry = next(iter(importances.values()))
    avg_imps = [
        torch.zeros_like(layer_imp, dtype=_accumulator_dtype(layer_imp))
        for layer_imp in first_entry
    ]
    for entry_imps in tqdm(importances.values()):
        for i, layer_imps in enumerate(entry_imps):
            # Divide each contribution before adding so the running value
            # stays on the scale of a single entry.
            avg_imps[i] += layer_imps / n_entries
    # TODO: revisit the averaging method (currently a plain unweighted mean).
    return avg_imps

In [6]:
# avg_importances = get_avg_imporances(imps)

In [7]:
# pd.to_pickle(avg_importances, "./avg_importances.pkl")

In [8]:
# Reuse the cached per-layer averages when the pickle exists; otherwise rebuild
# them from the raw importance shards and cache the result, so the notebook
# still runs top-to-bottom on a machine that does not have the pickle yet.
avg_importances_path = "./avg_importances.pkl"
if os.path.exists(avg_importances_path):
    avg_importances = pd.read_pickle(avg_importances_path)
else:
    avg_importances = get_avg_imporances(get_importances())
    pd.to_pickle(avg_importances, avg_importances_path)

In [9]:
len(avg_importances)

24

## Model

In [10]:
# Hugging Face Hub identifier of the base model, plus the exact commit hash
# that is passed as `revision` when the model is loaded below.
model_id = "microsoft/phi-1_5"
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st

In [11]:
# Pin the tokenizer to the same revision as the model: with
# trust_remote_code=True, an unpinned load fetches (and may execute) whatever
# is currently at the head of the hub repository.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)

In [12]:
# Fetch the tokenizer's vocabulary and display its size. The recorded size
# (50295) is smaller than the 51200-row embedding table shown in the model
# printout further down.
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [13]:
# tokenizer.decode(token_info.get_prefixes(top_tokens[1000][0], 9, 10)[0])

In [14]:
# Load the base causal LM at the pinned revision. No torch_dtype or attention
# implementation is passed, so the library defaults apply; the half-precision
# and flash-attention options are kept below, commented out.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    # be careful with this?
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
)

## Prune Model

In [15]:
from prunners import prune_mlps_individually
from importances import get_mlps

In [16]:
# get_mlps is a project helper (not shown here); presumably it returns the
# per-layer MLP modules of the model. The length check in the next cell
# reports 24, matching the 24 decoder layers in the model printout.
mlps = get_mlps(model)

In [17]:
len(mlps), len(avg_importances)

(24, 24)

In [18]:
# Pair each MLP module with its averaged importances. The guard keeps this cell
# idempotent: `avg_importances` is rebound from a list to a dict here, so
# re-running an unguarded zip would pair the modules with the dict's keys
# (the modules themselves) instead of with the importance tensors.
if not isinstance(avg_importances, dict):
    # zip() silently truncates on a length mismatch, so fail loudly instead.
    assert len(mlps) == len(avg_importances), (
        f"expected one importance entry per MLP, got {len(mlps)} MLPs and "
        f"{len(avg_importances)} importance entries"
    )
    avg_importances = dict(zip(mlps, avg_importances))

In [19]:
# Prune the MLPs using their averaged importances. The return value is
# ignored, so this presumably mutates the modules in place.
# NOTE(review): 0.2 is presumably the fraction pruned from each MLP (the
# printout below shows fc1/fc2 with 6554 hidden features) — confirm against
# prunners.prune_mlps_individually. Re-running this cell would presumably
# prune the already-pruned model again.
prune_mlps_individually(avg_importances, 0.2)

In [20]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2048, out_features=6554, bias=True)
          (fc2): Linear(in_features=6554, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2048,), e

## Train model

In [22]:
from peft import LoraConfig, PeftConfig
import transformers

In [38]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer

In [25]:
# Build the LoRA config and the training arguments from the project helpers.
# "./tmp" is presumably the output directory: the training log further down
# reports checkpoints under ./tmp/checkpoint-*.
lora_config = get_lora_config()
training_arguments = get_training_arguments("./tmp")

In [26]:
# Override the helper's checkpoint interval: save every 400 steps (the training
# log below mentions checkpoint-400, checkpoint-800 and checkpoint-1200).
training_arguments.save_steps = 400

In [21]:
# Move the model to the GPU; the trailing ';' suppresses the module repr that
# the notebook would otherwise display.
model.cuda();

In [27]:
# Training-time configuration tweaks applied before the trainer is built.
# NOTE(review): use_cache=False is presumably set because the generation cache
# is not needed for training and is commonly disabled together with gradient
# checkpointing — confirm. pretraining_tp=1 looks carried over from other
# fine-tuning recipes; verify it has any effect for this model.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [30]:
# Load the project's baseline dataset and use its "train" and "test" splits as
# the training and evaluation sets (the trainer's map step below reports
# 50000 and 2000 examples respectively).
dataset = get_baseline_dataset()
train_data, eval_data = dataset["train"], dataset["test"]

reading pickle


In [31]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the tokenizer ships without a pad token, which
# batched padding needs — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [39]:
# Supervised fine-tuning of the pruned model with the LoRA config passed as
# peft_config, reading each example from its "text" field, with sequence
# packing disabled.
# NOTE(review): max_seq_length presumably caps the tokenized length of each
# example at 1024 tokens — confirm against the installed trl version.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    dataset_text_field="text",
    max_seq_length=1024, # tweak this
    # TODO: think harder about the datacollator
    # data_collator=transformers.DataCollatorForSeq2Seq(
    #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    # ),
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [57]:
# Baseline: evaluation loss on the eval split before fine-tuning. As recorded,
# this ran immediately before training (In [57] -> In [58]) and reported an
# eval_loss of about 3.4165.
trainer.evaluate()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 3.416516065597534,
 'eval_runtime': 160.8941,
 'eval_samples_per_second': 12.431,
 'eval_steps_per_second': 1.554}

In [58]:
# Run fine-tuning. The recorded log shows validation loss falling from 3.189
# at step 100 to 3.107 at step 1000.
train_res = trainer.train()

Step,Training Loss,Validation Loss
100,3.2353,3.18918
200,3.157,3.145626
300,3.1731,3.131249
400,3.0917,3.124124
500,3.0686,3.118918
600,3.1467,3.115085
700,3.0962,3.112436
800,3.162,3.109956
900,3.0896,3.108312
1000,3.0821,3.107


Checkpoint destination directory ./tmp/checkpoint-400 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./tmp/checkpoint-800 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./tmp/checkpoint-1200 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [76]:
# Persist the trainer state (which carries the log history inspected in the
# next cell) so the training curves can be analysed later.
pd.to_pickle(trainer.state, "./tmp/trainer_state_llm_pruner_style.pkl")

In [74]:
# Show the last logged rows. In the recorded output the final row is the run
# summary (train_runtime, total_flos, train_loss) rather than a per-step loss.
trainer_state = trainer.state
pd.DataFrame(trainer_state.log_history).tail()

Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
179,3.1262,2.298851e-06,1.96,1630,,,,,,,,,
180,3.1172,1.660281e-06,1.97,1640,,,,,,,,,
181,3.147,1.021711e-06,1.98,1650,,,,,,,,,
182,3.1517,3.831418e-07,1.99,1660,,,,,,,,,
183,,,2.0,1666,,,,,26039.1643,3.84,0.064,3.480373e+17,3.138039


### Evaluation

In [63]:
from evaluation import evaluate_on_nlp_tasks

In [64]:
# Put the model in evaluation mode before benchmarking; the trailing ';'
# suppresses the module repr.
model.eval();

In [65]:
# Benchmark the fine-tuned model with limit=300, with autograd disabled since
# no gradients are needed. The per-task results (hellaswag, piqa, boolq,
# winogrande) are displayed in the next cell.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=300)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|██████████████████| 3000/3000 [01:32<00:00, 32.57it/s]
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [66]:
eval_res["results"]

{'hellaswag': {'acc,none': 0.44333333333333336,
  'acc_norm,none': 0.56,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.72, 'acc_norm,none': 0.77, 'alias': 'piqa'},
 'boolq': {'acc,none': 0.6466666666666666, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.7133333333333334, 'alias': 'winogrande'}}

In [67]:
# Repeat the benchmark with limit=1000 instead of 300; this overwrites the
# previous `eval_res`. Run it under torch.no_grad() like the limit=300 run
# above, so no autograd state is kept for the model's trainable parameters
# during evaluation.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|████████████████| 10000/10000 [05:06<00:00, 32.66it/s]
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [68]:
eval_res["results"]

{'hellaswag': {'acc,none': 0.422,
  'acc_norm,none': 0.539,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.736, 'acc_norm,none': 0.755, 'alias': 'piqa'},
 'boolq': {'acc,none': 0.637, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.678, 'alias': 'winogrande'}}

## Save

In [79]:
# Move the model back to the CPU before saving; the trailing ';' suppresses
# the module repr.
model.cpu();

In [80]:
# Save only the weights (state dict) of the pruned, fine-tuned model.
# NOTE(review): loading this presumably requires first rebuilding a model with
# the same pruned MLP shapes. Also confirm how the LoRA adapter weights appear
# in this state dict, since `model` was handed to SFTTrainer with a
# peft_config.
torch.save(model.state_dict(), "./tmp/model_llm_prunner_style_state_dict")