In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

In [3]:
# Expose only GPU index 1 to this process.
# NOTE(review): this cell runs before the `import torch` cell — presumably so the setting
# is in place before CUDA is initialised; keep that ordering.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
import os
import pandas as pd
import torch
from dataset_preprocessing import TokenInfo
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import itertools
import pandas as pd
from tqdm import tqdm

## Importances

In [5]:
def get_importances(data_dir="./new_importances_data"):
    """Load every pickled importance shard in ``data_dir`` and merge them into one dict.

    NOTE(review): the original body carried a commented-out ``print("this is wrong")``;
    the author may have had doubts about this loader — confirm before relying on it.

    Args:
        data_dir: Directory scanned (non-recursively) for ``*.pkl`` files. Defaults to
            the directory the notebook originally hard-coded.

    Returns:
        dict: The union of the dicts stored in the shards. Shards are merged in sorted
        filename order, so when two shards share a key the lexicographically later
        file wins.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist (propagated from ``os.listdir``).
    """
    # os.listdir returns names in arbitrary order; sorting makes the winner of a
    # duplicate key independent of the filesystem.
    shard_names = sorted(name for name in os.listdir(data_dir) if name.endswith(".pkl"))
    importances = {}
    for shard_name in tqdm(shard_names):
        # SECURITY: unpickling can execute arbitrary code — only read shards that
        # were produced by this project.
        importances.update(pd.read_pickle(os.path.join(data_dir, shard_name)))
    return importances

In [6]:
# imps = get_importances()

In [7]:
def get_avg_imporances(importances):
    """Average the per-position importance tensors over all entries of ``importances``.

    Args:
        importances: Mapping whose values are equally long sequences of tensors
            (one tensor per layer, judging by the local naming). Tensors at the same
            position are assumed to share a shape — this is not validated.

    Returns:
        list: One tensor per position, holding the arithmetic mean of that position's
        tensors across all entries. The input tensors are not modified.

    Raises:
        ValueError: If ``importances`` is empty, since there is nothing to average.
    """
    if not importances:
        raise ValueError("importances is empty; nothing to average")

    n_entries = len(importances)
    # Peek at the first entry only to learn the per-position shapes/dtypes; no need to
    # materialise every value in a list.
    first_entry = next(iter(importances.values()))
    avg_imps = [torch.zeros_like(layer_imps) for layer_imps in first_entry]

    for entry_imps in tqdm(importances.values()):
        for layer_idx, layer_imps in enumerate(entry_imps):
            # Divide before accumulating, as the original did, so the running sum
            # stays in the same numeric range as the inputs.
            avg_imps[layer_idx] += layer_imps / n_entries
    # TODO think harder about averaging method
    return avg_imps

In [8]:
# avg_importances = get_avg_imporances(imps)

In [9]:
# pd.to_pickle(avg_importances, "./avg_importances.pkl")

In [10]:
!ls ../

baseline_dataset.pkl  cs229-project  pcs224n  tmp-cs229-project
cs229		      LLM-Pruner     tmp


In [11]:
# Load the cached averages instead of recomputing them (the cells that build and save
# this pickle are commented out above). Unpickling can execute arbitrary code, so only
# load a file this project produced.
avg_importances = pd.read_pickle("./avg_importances.pkl")

In [12]:
# Sanity check: number of averaged entries (24 in the recorded run).
len(avg_importances)

24

## Model

In [13]:
# Hugging Face Hub repository id of the base model.
model_id = "microsoft/phi-1_5"
# Commit hash passed as `revision` when the model is loaded below, so the run does not
# follow the repository's moving head.
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st (year not recorded)

In [14]:
# Pin the tokenizer to the same commit as the model. The model load passes
# `revision=model_revision`; without it here the tokenizer (and, because of
# `trust_remote_code=True`, any repository code it pulls in) would follow the
# repository's latest commit instead of the pinned one.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)

In [15]:
# Token -> id mapping of the tokenizer. The recorded run reports 50295 entries, which is
# fewer than the 51200 rows of the embedding shown in the model repr further down.
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [16]:
# tokenizer.decode(token_info.get_prefixes(top_tokens[1000][0], 9, 10)[0])

In [17]:
# Load the base causal LM from the Hub, pinned to `model_revision`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    # Allows custom model code shipped in the Hub repository to be used.
    trust_remote_code=True,
    # The two options below are left disabled (original note: "be careful with this?"),
    # so the library defaults for dtype and attention implementation apply.
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
)

## Prune Model

In [18]:
from prunners import prune_mlps_individually
from importances import get_mlps

In [19]:
# Collect the model's MLP modules via the project helper (24 of them in the recorded run).
mlps = get_mlps(model)

In [20]:
# Both counts must match: the next step pairs the two sequences element by element.
len(mlps), len(avg_importances)

(24, 24)

In [21]:
# Pair each MLP module with its averaged importance tensor. zip() silently truncates to
# the shorter input, so fail loudly on a length mismatch instead of pruning a subset.
assert len(mlps) == len(avg_importances), (
    f"expected one importance entry per MLP, got {len(mlps)} MLPs and "
    f"{len(avg_importances)} importance entries"
)
# Bind the mapping to its own name. Rebinding `avg_importances` (list -> dict) made this
# step non-idempotent: a second run would zip the modules against the dict's keys and
# silently map every module to itself.
importances_by_mlp = dict(zip(mlps, avg_importances))

# 0.5 is presumably the pruning ratio (the saved artifacts are tagged "0.5ratio") —
# confirm against prunners.prune_mlps_individually.
# NOTE: the helper is handed the live modules, so presumably it changes the model in
# place; reload the model before running this cell a second time.
prune_mlps_individually(importances_by_mlp, 0.5)

In [23]:
# Inspect the architecture after the pruning call.
# NOTE(review): the recorded repr still reports fc1 out_features=4096 / fc2
# in_features=4096 — confirm whether prune_mlps_individually masks weights rather than
# resizing the layers, or whether the Linear metadata is simply not updated.
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2048, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2048,), e

## Replace model modules

In [24]:
from experts import Experts, EmbeddingTokenIdxTracker, mark_only_adapters_as_trainable, prepare_as_if_peft_model, prepare_model_for_gradient_checkpointing
from importances import get_mlps
from post_training import get_lora_config, get_training_arguments



In [25]:
# Build the LoRA config and the training arguments from the project helpers.
# "./tmp" is the single argument given to get_training_arguments (presumably the output
# directory — confirm in post_training.py).
lora_config = get_lora_config()
training_arguments = get_training_arguments("./tmp")

In [26]:
# Rebinds `training_arguments` to whatever the helper returns. The helper also receives
# `model`, so it may modify the model in place — see experts.prepare_as_if_peft_model.
training_arguments = prepare_as_if_peft_model(model, training_arguments, lora_config)

In [27]:
# Wrap the model's token-embedding module. The wrapper's `idx_tracker` attribute is what
# each Experts module receives when the decoder MLPs are replaced.
embed_tokens_new = EmbeddingTokenIdxTracker(model.get_submodule("model").get_submodule("embed_tokens"))

In [28]:
def get_layers(model):
    """Return the ``layers`` submodule nested under the model's ``model`` submodule."""
    backbone = model.get_submodule("model")
    decoder_layers = backbone.get_submodule("layers")
    return decoder_layers

In [29]:
# Decoder layers of the model (the recorded repr shows a ModuleList of 24 PhiDecoderLayer).
layers = get_layers(model)

In [30]:
# Install the tracking wrapper in place of the original embedding module.
model.get_submodule("model").embed_tokens = embed_tokens_new

In [31]:
# Swap every decoder layer's MLP for an `Experts` module constructed from it.
# Not idempotent: the loop overwrites `layer.mlp` in place, so a second run would hand
# the already-wrapped module to `Experts` — reload the model before re-running.
for i, layer in enumerate(layers):
    layer.mlp = Experts(
        model,
        layer.mlp,  # the existing MLP of this layer
        lora_config,
        i,  # position of this layer within `layers`
        embed_tokens_new.idx_tracker,  # tracker exposed by the embedding wrapper
        layer.mlp.config,
    )

In [32]:
# Project helper; going by its name it leaves only the adapter parameters trainable —
# confirm in experts.py.
mark_only_adapters_as_trainable(model, lora_config)

In [33]:
# Project helper call; its return value is echoed as the cell output (the recorded output
# is the model repr).
prepare_model_for_gradient_checkpointing(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): EmbeddingTokenIdxTracker(
      (embed): Embedding(51200, 2048)
    )
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (cluster_router): ClusterRouter()
          (experts_fc1): ModuleList(
            (0-7): 8 x lora.Linear(
              (base_layer): Linear(in_features=2048, out_features=4096, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)


In [34]:
example_texts = ["hi this is an example", "hi this is an example"]

# Tokenise as a batch through the tokenizer's __call__. Passing a list of two strings to
# `tokenizer.encode` is interpreted as a (text, text_pair) pair, i.e. ONE concatenated
# sequence; it only came out as a (2, 5) tensor because of a hard-coded `.view(-1, 5)`.
# Both texts are identical, so the batch is rectangular without padding.
# The texts keep their own name so this cell can be re-run: `examples` is always the tensor.
examples = tokenizer(example_texts, return_tensors="pt")["input_ids"]

In [36]:
# Move the model to the GPU. The call returns the module, which is why the model repr is
# echoed as this cell's output.
model.cuda()

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): EmbeddingTokenIdxTracker(
      (embed): Embedding(51200, 2048)
    )
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): Experts(
          (activation_fn): NewGELUActivation()
          (cluster_router): ClusterRouter()
          (experts_fc1): ModuleList(
            (0-7): 8 x lora.Linear(
              (base_layer): Linear(in_features=2048, out_features=4096, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)


In [37]:
# _ = model(examples.cuda())

## Train Model

In [38]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer
from peft import LoraConfig
import transformers
from trl import SFTTrainer

In [39]:
# Load the dataset via the project helper; its "train" and "test" entries are used when
# the trainer is built.
dataset = get_baseline_dataset()

reading pickle


In [40]:
# Turn off checkpoint saving during training; the weights are saved manually in the
# "Save" section at the end of the notebook.
training_arguments.save_strategy="no"

In [41]:
# Setup model for training
# Setup model for training: turn the generation cache off and gradient checkpointing on.
model.config.use_cache = False
# NOTE(review): `pretraining_tp` looks like a setting carried over from another model
# family — confirm the Phi implementation reads it at all.
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Setup tokenizer for training: reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

train_data, eval_data = dataset["train"], dataset["test"]
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    # Left disabled: the LoRA adapters were already injected manually when the MLPs were
    # replaced by Experts, so presumably the trainer must not wrap the model again.
    # peft_config=lora_config,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    dataset_text_field="text",
    max_seq_length=1024, # tweak this
    # TODO: think harder about the datacollator
    # data_collator=transformers.DataCollatorForSeq2Seq(
    #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    # ),
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [42]:
# Evaluation loss before any training step (4.68 in the recorded run) — the reference
# point for the losses logged during training.
trainer.evaluate()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 4.6767449378967285,
 'eval_runtime': 177.2892,
 'eval_samples_per_second': 11.281,
 'eval_steps_per_second': 1.41}

In [43]:
# Run training; the recorded table logs training and validation loss every 100 steps.
trainer.train()

Step,Training Loss,Validation Loss
100,3.7194,3.650269
200,3.5084,3.491539
300,3.4976,3.446306
400,3.3947,3.422544
500,3.3575,3.407472
600,3.4357,3.397186
700,3.3783,3.389771
800,3.4381,3.383795
900,3.3453,3.380591
1000,3.3377,3.378289


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [5]:
# Show only the log entries that carry an evaluation loss.
# NOTE(review): this cell's recorded execution count (In [5]) is out of sequence with its
# neighbours (In [43] / In [45]), so its output came from a different execution order or
# kernel session — re-run the notebook top to bottom before trusting the table below.
trainer_state = trainer.state
pd.DataFrame(trainer_state.log_history).dropna(subset = ["eval_loss"])

Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
11,,,0.12,100,3.650269,240.8597,8.304,1.038,,,,,
22,,,0.24,200,3.491539,243.8529,8.202,1.025,,,,,
33,,,0.36,300,3.446306,242.4336,8.25,1.031,,,,,
44,,,0.48,400,3.422544,242.527,8.247,1.031,,,,,
55,,,0.6,500,3.407472,242.0004,8.264,1.033,,,,,
66,,,0.72,600,3.397186,241.7342,8.274,1.034,,,,,
77,,,0.84,700,3.389771,241.4236,8.284,1.036,,,,,
88,,,0.96,800,3.383795,241.3092,8.288,1.036,,,,,
99,,,1.08,900,3.380591,241.6424,8.277,1.035,,,,,
110,,,1.2,1000,3.378289,240.9427,8.301,1.038,,,,,


In [45]:
# Persist the whole trainer state object (which includes `log_history`) under the same
# "0.5ratio" tag used for the saved model weights.
pd.to_pickle(trainer.state, "./tmp/trainer_state_0.5ratio.pkl")

### Evaluation

In [46]:
from evaluation import evaluate_on_nlp_tasks

In [47]:
# Make sure the model is on the GPU for evaluation; the trailing `;` suppresses the repr.
model.cuda();

In [48]:
# Switch to evaluation mode (e.g. the dropout layers visible in the model repr become
# inactive).
model.eval();

In [49]:
# Evaluate on the downstream NLP tasks with gradient tracking disabled. limit=300 is
# forwarded to the project helper (presumably the number of examples per task — confirm
# in evaluation.py).
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=300)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|▉| 2996/3000 [19:48<00:0IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [50]:
# Per-task metrics of the limit=300 run.
eval_res["results"]

{'hellaswag': {'acc,none': 0.38,
  'acc_norm,none': 0.4666666666666667,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.7133333333333334,
  'acc_norm,none': 0.6966666666666667,
  'alias': 'piqa'},
 'boolq': {'acc,none': 0.6466666666666666, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.58, 'alias': 'winogrande'}}

In [51]:
# Repeat the benchmark with a larger sample (limit=1000). This overwrites the limit=300
# results held in `eval_res`.
# Run under no_grad like the limit=300 evaluation: the adapter parameters are trainable,
# so without it any forward pass that is not already guarded would track gradients.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
 30%|▎| 2998/10000 [20:20<45:IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [52]:
# Per-task metrics of the limit=1000 run.
eval_res["results"]

{'hellaswag': {'acc,none': 0.377,
  'acc_norm,none': 0.489,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.717, 'acc_norm,none': 0.7, 'alias': 'piqa'},
 'boolq': {'acc,none': 0.646, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.588, 'alias': 'winogrande'}}

## Save

In [53]:
# Move the weights to CPU before serialising, so the saved tensors are CPU tensors.
model.cpu();

In [54]:
# Save the state dict of the whole modified model. Only tensors are stored, so loading it
# later presumably requires rebuilding the same module structure (embedding wrapper and
# Experts modules) first — confirm when writing the loader.
torch.save(model.state_dict(), "./tmp/model_state_dict_0.5ratio")