# Capstone

In [1]:
# !pip install -q  bitsandbytes einops wandb

In [2]:
# !pip install --upgrade datasets

In [3]:
from datasets import load_dataset

dataset_name = "DKYoon/SlimPajama-6B"

In [4]:
# Print the first element of the new dataset
dataset_train = load_dataset(dataset_name, cache_dir="dataset/slimpajama6B", split="train", num_proc=4 )
print(dataset_train[0])

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/5489000 [00:00<?, ? examples/s]

Setting num_proc from 4 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/9347 [00:00<?, ? examples/s]

Setting num_proc from 4 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/9346 [00:00<?, ? examples/s]

{'text': 'Want Tori to Coach You Too?\nTori\'s Health Step by Step coming soon.\nWin free copies, prizes, access to exclusive behind-the-scenes, free access to Coach Tori, and more.\nand receive a copy of Tori\'s Weekly Challenges. We\'ll also notify you of when Tori\'s Program becomes available.\nI\'ve been asked, even criticized, about adding a focus on nutrition to Desert. There\'s a reason why. I had poor nutritional examples growing up. Being confused on the issue of nutrition cost me a lot. I remember yo-yo\'ing a lot. The only time I even came close to being my desired weight was when I did high-intensity workouts daily. At one point, I was exercised about 6 hours a day. I was in multiple dance classes and a karate class, as well as another karate club that met for two hours three days a week. I also rode my bike to campus, and even added a one hour workout when I got home. I was still thirty pounds overweight. I can attest to the coined phrase "You cannot exercise away a bad di

#### Setup tokenizer

In [7]:
import torch
from transformers import  AutoTokenizer

model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Setup configuration and create model

In [8]:
from transformers import AutoModelForCausalLM, AutoConfig


config = AutoConfig.from_pretrained(
    model_name,
    vocab_size=len(tokenizer),
    #n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    trust_remote_code=True
)

model_phi2_scratch = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

model_phi2_scratch.config.use_cache = False

config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [9]:
print(model_phi2_scratch)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(50295, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2560,),

#### Training

In [23]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
optim = "paged_adamw_32bit"
save_steps = 5000
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 20000
warmup_ratio = 0.03
logging_steps = 30
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    report_to='tensorboard',
    lr_scheduler_type=lr_scheduler_type,
)

In [18]:
from datasets import  DatasetDict

raw_datasets = DatasetDict(
    {
        "train": dataset_train.shuffle().select(range(50000)),
    }
)
context_length = 256

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [25]:
from transformers import Trainer, DataCollatorForLanguageModeling, DataCollator

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model_phi2_scratch,
    train_dataset=tokenized_datasets["train"],
    #dataset_text_field="text",
    #max_seq_length=max_seq_length,
    data_collator=data_collator,
    tokenizer=tokenizer,
    args=training_arguments,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [26]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [None]:
trainer.train()

Step,Training Loss
30,8.0368
60,7.6415
90,7.5325
120,7.3519
150,7.205
180,7.0903
210,7.0811
240,6.9756
270,6.9795
300,6.8057
