# imports

In [1]:
%pip install datasets transformers trl peft
%pip install accelerate

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/542.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (

In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoConfig, TrainingArguments, Trainer
from datasets import load_dataset
from trl import SFTTrainer

# model config

In [3]:
config = AutoConfig.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
config.max_position_embeddings = 256
config.num_attention_heads = 8
config.num_hidden_layers = 3
config.tie_words_embeddings = True
config.hidden_size = 128 #1
config.intermediate_size = 512
config.num_attention_heads = 8 #2
config.num_key_value_heads = 8

required_length = config.hidden_size // (config.num_key_value_heads * 2)
config.rope_scaling['long_factor'] = config.rope_scaling['long_factor'][:required_length]
config.rope_scaling['short_factor'] = config.rope_scaling['short_factor'][:required_length]



config.json:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model parameters

In [4]:
def count_model_params(model):
    total_params = sum(prams.numel() for prams in model.parameters())
    trainable_parms = sum(prams.numel() for prams in model.parameters() if prams.requires_grad)
    print(total_params, trainable_parms)

initialize model

In [5]:
my_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
my_model.to('cuda')

modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 128, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-2): 3 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=128, out_features=128, bias=False)
          (qkv_proj): Linear(in_features=128, out_features=384, bias=False)
          (rotary_emb): Phi3SuScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=128, out_features=1024, bias=False)
          (down_proj): Linear(in_features=512, out_features=128, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=128, out_features=32064, b

print model config and parameters

In [6]:
print(my_model.config)
count_model_params(my_model)

Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "max_position_embeddings": 256,
  "model_type": "phi3",
  "num_attention_heads": 8,
  "num_hidden_layers": 3,
  "num_key_value_heads": 8,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "long_factor": [
      1.0299999713897705,
      1.0499999523162842,
      1.0499999523162842,
      1.0799999237060547,
      1.2299998998641968,
      1.2299998998641968,


tokenizer

In [7]:
t = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


dataset

In [8]:
d = load_dataset("roneneldan/TinyStories")
print(d)

Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/246M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})


formating dataset

In [9]:
def formating_dataset(story):
    txt = f'tell me a story{story["text"]}'
    return {"text": txt}

# training time

In [10]:
args = TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    num_train_epochs=8,
    max_steps=1000,
    fp16=True,
    evaluation_strategy="steps",
    logging_steps=10,
    save_steps=1000,
    optim="paged_adamw_32bit",
    output_dir="pretrained-model",
    push_to_hub=False,
    report_to="none",
)



prepare dataset

In [11]:
train_d = d["train"].select([i for i in range(1000)])
eval_d = d["validation"].select([i for i in range(10)])
print(train_d)
print(eval_d)

Dataset({
    features: ['text'],
    num_rows: 1000
})
Dataset({
    features: ['text'],
    num_rows: 10
})


In [12]:
trainer = SFTTrainer(
    model=my_model,
    train_dataset=train_d,
    eval_dataset=eval_d,
    tokenizer=t,
    args=args,
    dataset_text_field="text",
    max_seq_length=256,
    formatting_func=formating_dataset,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [14]:
%pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [15]:
trainer.train()



Step,Training Loss,Validation Loss
10,10.3437,10.302851
20,10.2837,10.237806
30,10.2269,10.181909
40,10.1782,10.139096
50,10.1336,10.108229
60,10.1023,10.084946
70,10.0835,10.070227
80,10.0756,10.061911
90,10.0849,10.05886
100,10.0585,10.058359




TrainOutput(global_step=100, training_loss=10.157089157104492, metrics={'train_runtime': 14.674, 'train_samples_per_second': 13.63, 'train_steps_per_second': 6.815, 'total_flos': 1370897395200.0, 'train_loss': 10.157089157104492, 'epoch': 0.2})

In [16]:
trainer.save_model("my_model")