In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Restrict this kernel to one GPU. This is done in its own cell, ahead of the
# cell that imports torch.
gpu_index = "2"
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_index

In [3]:
import os
import pandas as pd
import torch
from dataset_preprocessing import TokenInfo
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import itertools
import pandas as pd
from tqdm import tqdm

## Model

In [4]:
# Identifier of the base checkpoint passed to `from_pretrained` below.
model_id = "microsoft/phi-1_5"
# Commit hash passed as `revision` when loading the model, so the run is tied
# to one fixed snapshot of the repository (the year of the note is not recorded).
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st

In [5]:
# Load the tokenizer from the same pinned commit as the model weights, so the
# tokenizer files and any remote code executed via `trust_remote_code` cannot
# drift away from the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
)

In [6]:
# Sanity check: number of entries in the tokenizer vocabulary
# (the recorded output of this cell is 50295).
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [7]:
# Leftover scratch line: `token_info` and `top_tokens` are not defined in any cell above.
# tokenizer.decode(token_info.get_prefixes(top_tokens[1000][0], 9, 10)[0])

In [8]:
# Load the base model, pinned to the commit in `model_revision`.
load_kwargs = {
    "revision": model_revision,
    "trust_remote_code": True,
    # Disabled options kept for reference (original note: "be careful with this?"):
    # "torch_dtype": torch.float16,
    # "attn_implementation": "flash_attention_2",
}
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

## Train model

In [9]:
from peft import LoraConfig, PeftConfig
import transformers

In [10]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer



In [11]:
# Build the LoRA and trainer configuration from the helpers imported from
# `post_training`.
lora_config = get_lora_config()
# "./tmp" is presumably the trainer's output directory (confirm in
# `get_training_arguments`); later cells also write their artifacts under ./tmp.
training_arguments = get_training_arguments("./tmp")

In [12]:
# Override `save_steps` on the arguments object returned by the helper
# (presumably the checkpoint interval, in optimizer steps — confirm against
# `get_training_arguments`).
training_arguments.save_steps = 400

In [13]:
# Move the model to the GPU before the trainer is built; the trailing `;`
# suppresses the cell output.
model.cuda();

In [14]:
# Training-time model settings, applied before the trainer is constructed.
# Presumably the cache is disabled because it is not useful together with
# gradient checkpointing — TODO confirm.
model.config.use_cache = False
# NOTE(review): `pretraining_tp` looks like a setting carried over from other
# model configs — confirm that phi-1.5 actually reads it.
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [15]:
# Baseline dataset: its "train" and "test" splits are what the trainer below
# receives as training and evaluation data.
dataset = get_baseline_dataset()
train_data = dataset["train"]
eval_data = dataset["test"]

reading pickle


In [16]:
# Reuse the EOS token as the padding token; set before the trainer is built
# with this tokenizer.
tokenizer.pad_token = tokenizer.eos_token

In [17]:
# Supervised fine-tuning trainer: the model loaded above plus the LoRA config
# passed as `peft_config`.
trainer = SFTTrainer(
    # What is trained, and how.
    model=model,
    tokenizer=tokenizer,
    peft_config=lora_config,
    args=training_arguments,
    # Data: taken from the "text" field, without packing.
    train_dataset=train_data,
    eval_dataset=eval_data,
    dataset_text_field="text",
    packing=False,
    max_seq_length=1024,  # tweak this
    # TODO: think harder about the data collator
    # data_collator=transformers.DataCollatorForSeq2Seq(
    #     tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    # ),
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
# Evaluate before any training step, as a baseline for the validation loss
# logged during training.
trainer.evaluate()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 3.1712610721588135,
 'eval_runtime': 164.4209,
 'eval_samples_per_second': 12.164,
 'eval_steps_per_second': 1.52}

In [19]:
# Run fine-tuning; the value returned by `train()` is kept in `train_res`.
train_res = trainer.train()

Step,Training Loss,Validation Loss
100,3.0887,3.046098
200,3.0306,3.021794
300,3.0525,3.012671
400,2.972,3.007714
500,2.9532,3.003997
600,3.0325,3.00101
700,2.9856,2.999192
800,3.0495,2.997283
900,2.9764,2.996145
1000,2.9662,2.995509


In [20]:
# Persist the trainer state (it holds the log history read in the next cell)
# with pandas' pickle helper.
# NOTE: only unpickle this file from a trusted location — loading a pickle can
# execute arbitrary code.
pd.to_pickle(trainer.state, "./tmp/trainer_no_pruning.pkl")

In [21]:
# Show the last few entries logged during the run.
trainer_state = trainer.state
log_history = pd.DataFrame(trainer_state.log_history)
log_history.tail()

Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
179,3.0123,2.298851e-06,1.96,1630,,,,,,,,,
180,3.0054,1.660281e-06,1.97,1640,,,,,,,,,
181,3.0347,1.021711e-06,1.98,1650,,,,,,,,,
182,3.0401,3.831418e-07,1.99,1660,,,,,,,,,
183,,,2.0,1666,,,,,27059.3756,3.696,0.062,3.967319e+17,3.017435


### Evaluation

In [22]:
from evaluation import evaluate_on_nlp_tasks

In [23]:
# Switch the model to evaluation mode before running the benchmark tasks;
# the trailing `;` suppresses the cell output.
model.eval();

In [24]:
# Quick benchmark pass with autograd disabled. `limit=300` presumably caps the
# number of examples per task — confirm in `evaluation.evaluate_on_nlp_tasks`.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=300)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|███████████| 3000/3000 [01:39<00:00, 30.20it/s]
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [25]:
# Per-task metrics from the limit=300 run.
eval_res["results"]

{'hellaswag': {'acc,none': 0.4533333333333333,
  'acc_norm,none': 0.5666666666666667,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.75,
  'acc_norm,none': 0.7733333333333333,
  'alias': 'piqa'},
 'boolq': {'acc,none': 0.6566666666666666, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.7366666666666667, 'alias': 'winogrande'}}

In [26]:
# Larger benchmark pass (limit=1000). Run under torch.no_grad(), like the
# limit=300 run, so no autograd state is kept during evaluation.
# Note: this overwrites `eval_res` from the limit=300 run.
with torch.no_grad():
    eval_res = evaluate_on_nlp_tasks(model, tokenizer, limit=1000)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
100%|█████████| 10000/10000 [05:31<00:00, 30.17it/s]
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [27]:
# Per-task metrics from the limit=1000 run.
eval_res["results"]

{'hellaswag': {'acc,none': 0.433,
  'acc_norm,none': 0.552,
  'alias': 'hellaswag'},
 'piqa': {'acc,none': 0.762, 'acc_norm,none': 0.768, 'alias': 'piqa'},
 'boolq': {'acc,none': 0.642, 'alias': 'boolq'},
 'winogrande': {'acc,none': 0.707, 'alias': 'winogrande'}}

## Save

In [28]:
# Move the model back to the CPU before its weights are written to disk.
model.cpu();

In [29]:
# Write the model's state dict under ./tmp.
# NOTE(review): the trainer was given a LoRA config, so this presumably holds
# the adapter weights alongside the base weights — inspect the keys before
# relying on this file for reloading.
torch.save(model.state_dict(), "./tmp/model_no_pruning_state_dict")