In [1]:
import gc
import os
if not os.path.exists('./.hf_cache'):
    os.makedirs('./.hf_cache')
os.environ["HF_HOME"] = './.hf_cache'

import torch
import wandb
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTConfig, SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
torch.cuda.empty_cache()

In [3]:
os.environ["WANDB_NOTEBOOK_NAME"] = './train-llama.ipynb'
os.environ["WANDB_PROJECT"] = 'llama-dft'
wandb.login(key='a61e442e6922af9f064f40ede9cd909e47a9a2a6')

[34m[1mwandb[0m: Currently logged in as: [33mcx9[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /global/home/users/chenxin0210/.netrc


True

In [4]:
torch.cuda.device_count()

1

In [5]:
# if torch.cuda.get_device_capability()[0] >= 8:
#     !pip install -qqq flash-attn
#     attn_implementation = "flash_attention_2"
#     torch_dtype = torch.bfloat16
# else:
attn_implementation = "eager"
torch_dtype = torch.float16


In [6]:
base_model = "meta-llama/Meta-Llama-3-8B"
# new_model = "Qlora-Llama-3-8B"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# LoRA config
peft_config = LoraConfig(
    r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)


In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
    use_cache=False,
    torch_dtype=torch_dtype
)
# model.gradient_checkpointing_enable()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.77s/it]


In [8]:
# # Basic encoding/decoding
# text = "Hello, world!"
# encoded = tokenizer.encode(text)
# decoded = tokenizer.decode(encoded)
# print(f"Original: {text}")
# print(f"Encoded: {encoded}")
# print(f"Decoded: {decoded}")

# # Tokenization
# tokens = tokenizer.tokenize(text)
# print(f"Tokens: {tokens}")

In [9]:

model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)

In [10]:
dataset = load_dataset("MaterialsAI/robocr_poscar")

In [11]:
def formatting_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"Instruction: {example['instruction'][i]}\nInput: {example['input'][i]}\nOutput: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

In [12]:
import re

def round_poscar_numbers(text, decimal_places=4):
    def round_number(match):
        return f"{float(match.group()):.{decimal_places}f}"

    # Pattern to match floating point numbers
    pattern = r'\d+\.\d+'

    # Round numbers in the text
    rounded_text = re.sub(pattern, round_number, text)

    
    # Remove leading/trailing whitespace from each line
    rounded_text = '\n'.join(' '.join(line.split()) for line in rounded_text.splitlines())

    return rounded_text

def format_chat_template(row):
    rounded_out = round_poscar_numbers(row["output"], decimal_places=4)
    row_json = [{"role": "system", "content": row["instruction"]}, {"role": "user", "content": row["input"]},
               {"role": "assistant", "content": rounded_out}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

dataset['train']['text'][13]

Map (num_proc=4): 100%|██████████| 1233/1233 [00:00<00:00, 1373.33 examples/s]
Map (num_proc=4): 100%|██████████| 153/153 [00:00<00:00, 252.85 examples/s]
Map (num_proc=4): 100%|██████████| 137/137 [00:00<00:00, 283.34 examples/s]


'<|im_start|>system\nGenerate the POSCAR file for the given crystal structure.<|im_end|>\n<|im_start|>user\nLiBi(PO₄)₂ crystallizes in the triclinic P1 space group. There are three inequivalent Li¹⁺ sites. In the first Li¹⁺ site, Li¹⁺ is bonded in a distorted hexagonal planar geometry to six O²⁻ atoms. There are a spread of Li-O bond distances ranging from 2.47-2.58 Å. In the second Li¹⁺ site, Li¹⁺ is bonded in a 6-coordinate geometry to six O²⁻ atoms. There are three shorter (1.97 Å) and three longer (2.44 Å) Li-O bond lengths. In the third Li¹⁺ site, Li¹⁺ is bonded in a 6-coordinate geometry to six O²⁻ atoms. There are a spread of Li-O bond distances ranging from 1.97-2.46 Å. There are three inequivalent Bi⁵⁺ sites. In the first Bi⁵⁺ site, Bi⁵⁺ is bonded to six O²⁻ atoms to form BiO₆ octahedra that share corners with six PO₄ tetrahedra. There are three shorter (2.17 Å) and three longer (2.18 Å) Bi-O bond lengths. In the second Bi⁵⁺ site, Bi⁵⁺ is bonded in a 6-coordinate geometry to s

In [13]:
from tqdm import tqdm

lens = []
for i in tqdm(range(len(dataset['train']['text']))):
    encoded = tokenizer.encode(dataset['train']['text'][i], add_special_tokens=True, max_length=None, truncation=True)
    lens.append(len(encoded))

for i in tqdm(range(len(dataset['validation']['text']))):
    encoded = tokenizer.encode(dataset['validation']['text'][i], add_special_tokens=True, max_length=None, truncation=True)
    lens.append(len(encoded))

for i in tqdm(range(len(dataset['test']['text']))):
    encoded = tokenizer.encode(dataset['test']['text'][i], add_special_tokens=True, max_length=None, truncation=True)
    lens.append(len(encoded))


  0%|          | 0/1233 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 1233/1233 [00:10<00:00, 113.61it/s]
100%|██████████| 137/137 [00:00<00:00, 334.62it/s]
100%|██████████| 153/153 [00:00<00:00, 323.53it/s]


In [14]:
import numpy as np

In [15]:
np.max(lens), np.min(lens), np.quantile(lens, 0.25), np.quantile(lens, 0.50), np.quantile(lens, 0.75), np.quantile(lens, 0.85)

(12200, 181, 969.5, 1402.0, 2153.0, 2747.1000000000004)

In [19]:
import inspect
str(inspect.signature(SFTConfig))



In [16]:
# sft_config = SFTConfig(
#     learning_rate=8e-6,
#     lr_scheduler_type="linear",
#     gradient_checkpointing=True,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     gradient_accumulation_steps=4,
#     optim="paged_adamw_8bit",
#     num_train_epochs=1,
#     eval_strategy="steps",
#     eval_steps=0.2,
#     logging_steps=1,
#     warmup_steps=10,
#     report_to="wandb",
#     output_dir="tmp"
# )

training_arguments = TrainingArguments(
    output_dir='output/new_model',
    torch_empty_cache_steps=2,
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.1,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    save_strategy='steps',
    save_steps=0.1,
    save_total_limit=2,
    load_best_model_at_end=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    group_by_length=True,
    report_to="wandb",
    run_name='finetune-1',
    fp16=True,  # Enable mixed precision
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant':False}
)

#    gradient_accumulation_steps=12,

trainer = SFTTrainer(
    model=model,
    max_seq_length=2500,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    tokenizer=tokenizer,
    dataset_text_field='text'
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'torch_empty_cache_steps'

In [23]:
transformers.__version__

'4.42.4'

In [22]:
training_arguments = TrainingArguments(
    torch_empty_cache_steps=2
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'torch_empty_cache_steps'

In [17]:
trainer.train()



Step,Training Loss,Validation Loss


KeyboardInterrupt: 