In [None]:
!pip install -q  torch peft bitsandbytes transformers trl accelerate sentencepiece cryptography wandb

In [None]:
import os
import tempfile
from getpass import getpass

import torch
import wandb
from cryptography.fernet import Fernet
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, login, notebook_login
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from trl import SFTTrainer

In [None]:
# ---------------------------------------------------------------------------
# SETUP: dataset, run identity and model names
# ---------------------------------------------------------------------------

DATA_NAME = "Tom10117/simme"              # dataset id passed to load_dataset
PROJECT_NAME = "messages"                 # also used as the W&B project name
RUN_NAME = "a100"                         # also used as the W&B run name
MAX_SEQ_LENGTH = 200                      # forwarded to SFTTrainer(max_seq_length=...)
BASE_MODEL_NAME = "meta-llama/Llama-3.1-8B"
# Name of the fine-tuned model: "<user>/<project>-<run>".
REFINED_MODEL_NAME = "Tom10117/" + "-".join((PROJECT_NAME, RUN_NAME))

# ---------------------------------------------------------------------------
# HYPER-PARAMETERS
# ---------------------------------------------------------------------------

# LoRA adapter settings (consumed by LoraConfig).
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.1
TARGET_MODULES = [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Optimisation settings (consumed by TrainingArguments).
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
LR_SCHEDULER_TYPE = "cosine"
WEIGHT_DECAY = 0.001

# ---------------------------------------------------------------------------
# OTHER TRAINING CONFIG: intervals, measured in optimizer steps
# ---------------------------------------------------------------------------

STEPS = 100         # logging interval (logging_steps)
SAVE_STEPS = 500    # checkpoint interval (save_steps)
EVAL_STEPS = 1000   # evaluation interval (eval_steps)

In [None]:
# Authenticate with the Hugging Face Hub without embedding a secret in the notebook.
#
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise it is requested interactively with getpass so it is never echoed
# or stored in the notebook file.
#
# SECURITY: an access token used to be hard-coded in this cell. Treat that
# token as leaked and revoke it at https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass("Enter Hugging Face token")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Log in to Weights & Biases with the key the user types in.
#
# The original cell prompted for a key but then ignored it and passed a
# hard-coded API key to wandb.login instead. The prompted value is now the one
# actually used, so no credential lives in the notebook.
#
# SECURITY: the key that used to be hard-coded here should be treated as leaked
# and rotated in the Weights & Biases account settings.
wandb_key = getpass("Enter Weights & Biases Key")
wandb.login(key=wandb_key, relogin=True)

Enter Weights & Biases Key··········


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Configure Weights & Biases through environment variables in a single update.
os.environ.update(
    {
        # project under which this run will be logged
        "WANDB_PROJECT": PROJECT_NAME,
        # upload trained model checkpoints to wandb
        "WANDB_LOG_MODEL": "checkpoint",
        # disable watch so that logging is faster
        "WANDB_WATCH": "false",
    }
)

In [None]:
# Fetch the "train" and "test" splits of DATA_NAME (in that order) and bundle
# them into one DatasetDict keyed by split name.
train_dataset, test_dataset = (
    load_dataset(DATA_NAME, split=split_name) for split_name in ("train", "test")
)
data = DatasetDict(train=train_dataset, test=test_dataset)

# Show the overall structure, then the record at index 100 of each split as a
# quick sanity check of the "text" field.
print(data)
for split_name in ("train", "test"):
    print(data[split_name][100])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/375 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.66M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/300k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/22459 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1183 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 22459
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1183
    })
})
{'text': '<<SYS>>Write a realistic text message chat. Avoid repetition.<</SYS>>\n[INST]Write a chat between Tiến Dũng Nguyễn and Chu Diễm Quỳnh[/INST]\n### Chu Diễm Quỳnh: cái plan đấy ### Chu Diễm Quỳnh: tức là ### Tiến Dũng Nguyễn: bây h mình theo plan khác hả trời ### Tiến Dũng Nguyễn: what does it mean ### Tiến Dũng Nguyễn: but it starts from Aug 1 2023 ### Chu Diễm Quỳnh: nan ### Chu Diễm Quỳnh: Cuộc gọi video đã kết thúc. ### Tiến Dũng Nguyễn: Cuộc gọi video đã kết thúc. ### Tiến Dũng Nguyễn: Cuộc gọi video đã kết thúc. ### Chu Diễm Quỳnh: okay ### Tiến Dũng Nguyễn: i really want to...'}
{'text': '<<SYS>>Write a realistic text message chat. Avoid repetition.<</SYS>>\n[INST]Write a chat between Tiến Dũng Nguyễn and Phạm Khôi Nguyên[/INST]\n### Phạm Khôi Nguyên: Anh ma tay to ### Phạm Khôi Nguyên: Phim của anh

In [None]:
# Local aliases for the model identifiers defined in the setup cell.
base_model_name, refined_model = BASE_MODEL_NAME, REFINED_MODEL_NAME

# Tokenizer: pad on the right-hand side and reuse the EOS token for padding.
llama_tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)
llama_tokenizer.padding_side = "right"
llama_tokenizer.pad_token = llama_tokenizer.eos_token

# 4-bit quantization settings: NF4 quantization type, double quantization
# enabled, bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Base model, loaded with the quantization settings above and an automatic
# device map.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=quant_config,
)

# Adjust the loaded model's config before training.
model_config = base_model.config
model_config.use_cache = False
model_config.pretraining_tp = 1

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
# LoRA adapter configuration, built from the hyper-parameters in the setup cell.
peft_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,  # modules that receive adapters
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

In [None]:
# Training arguments, grouped by concern. Values come from the setup cell
# wherever a constant exists for them.
train_params = TrainingArguments(
    # --- run identity / reporting ---
    output_dir=REFINED_MODEL_NAME,
    run_name=RUN_NAME,
    report_to="wandb",
    logging_steps=STEPS,
    # --- schedule length ---
    num_train_epochs=1,
    max_steps=-1,
    # --- batching ---
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    group_by_length=True,
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=0.03,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=0.3,
    # --- mixed precision (both flags disabled) ---
    fp16=False,
    bf16=False,
    # --- evaluation ---
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    # --- checkpointing ---
    save_steps=SAVE_STEPS,
    save_total_limit=10,  # to avoid running out of disk space!
    # --- Hugging Face Hub ---
    push_to_hub=True,
    hub_model_id=REFINED_MODEL_NAME,
    hub_strategy="end",
    hub_private_repo=True,
)

In [None]:
# Supervised fine-tuning trainer: wires together the quantized base model, the
# LoRA configuration, the tokenizer, both dataset splits and the training
# arguments.
# NOTE(review): TRL reports some of these arguments as deprecated in favour of
# SFTConfig (see this cell's output); migrating is left for a follow-up.
fine_tuning = SFTTrainer(
    model=base_model,
    args=train_params,
    peft_config=peft_parameters,
    tokenizer=llama_tokenizer,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/22459 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop configured above.
fine_tuning.train()

# Write the trained model to the local directory named by `refined_model`.
fine_tuning.model.save_pretrained(save_directory=refined_model)

In [None]:
# Publish the trained model to a private Hub repository.
#
# Calling `fine_tuning.model.push_to_hub(repo_id=REFINED_MODEL_NAME, ...)` here
# failed with `IsADirectoryError: ... 'Tom10117/messages-a100'`. A local
# directory with exactly that relative path exists (it is the trainer's
# output_dir and the save_pretrained target), so the repo id is presumably
# being resolved as a local path somewhere inside push_to_hub
# (NOTE(review): mechanism inferred from the error message - confirm).
#
# To avoid the name collision, export the model into a fresh temporary
# directory and upload that folder explicitly. This also keeps intermediate
# training checkpoints stored under output_dir out of the upload.
hub_api = HfApi()
hub_api.create_repo(repo_id=REFINED_MODEL_NAME, private=True, exist_ok=True)

with tempfile.TemporaryDirectory() as export_dir:
    fine_tuning.model.save_pretrained(export_dir)
    hub_api.upload_folder(
        repo_id=REFINED_MODEL_NAME,
        folder_path=export_dir,
        commit_message="Updating model and README",
    )

IsADirectoryError: [Errno 21] Is a directory: 'Tom10117/messages-a100'