In [2]:
import os

from huggingface_hub import login

# Never embed the access token in the notebook: it ends up in version control
# and in shared copies. Read it from the HF_TOKEN environment variable instead.
# When the variable is unset or empty, pass None so that `login` asks for the
# token interactively rather than receiving an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\admin\.cache\huggingface\token
Login successful


In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))
from datasets import load_dataset
from datasets import Dataset
from datasets import DatasetDict
from collections.abc import Iterator
from transformers import TextIteratorStreamer
from threading import Thread

from utils import (
    llama2_chat_text_convert_train, 
    llama2_chat_text_convert_test, 
    print_trainable_parameters, 
    bnb_config,
    transformer_trainer, 
    sft_trainer,
    max_length,
    soft_prompt_config,
    hard_prompt_config
)



bin D:\CondaEnvs\LLM_Tuning\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll
Your GPU supports bfloat16: accelerate training with bf16=True


Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


# load data

In [2]:
# Paths to the CSV files backing each split.
# NOTE(review): both variables point at the same file, so the "test" split
# loaded below is identical to the training split (the displayed datasets
# both report 299 rows). Any evaluation on `test_dataset` is therefore on
# training data -- confirm whether a separate held-out file was intended.
train_file = '../../data/data.csv'
test_file = '../../data/data.csv'

# Load both CSVs into a single DatasetDict keyed by split name.
dataset = load_dataset('csv', data_files={'train': train_file, 'test': test_file})

# Pull the individual splits out for the conversion steps that follow.
train_dataset = dataset['train']
test_dataset = dataset['test']

In [3]:
train_dataset

Dataset({
    features: ['input_text', 'output_text'],
    num_rows: 299
})

In [4]:
test_dataset

Dataset({
    features: ['input_text', 'output_text'],
    num_rows: 299
})

# Modify data

In [5]:
train_dataset = train_dataset.map(lambda x: llama2_chat_text_convert_train(x, input_col="input_text", output_col="output_text"), 
                                  remove_columns=['input_text', 'output_text'])

In [6]:
train_dataset[0]

{'instruction': '<s>[INST] How much did my fleet idle last month? [/INST]',
 'text': '<s>[INST] How much did my fleet idle last month? [/INST] Idling-Idling duration </s></s>',
 'label': 'Idling-Idling duration </s></s>'}

In [7]:
test_dataset = test_dataset.map(lambda x: llama2_chat_text_convert_test(x, input_col="input_text"))


In [8]:
test_dataset[0]

{'input_text': 'How much did my fleet idle last month?',
 'output_text': 'Idling-Idling duration',
 'text': '<s>[INST] How much did my fleet idle last month? [/INST]'}

# model training

In [9]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    pipeline,
    logging,
)

In [10]:
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Fine-tuned model name
new_model = "llama-2-7b-xin"

## load base model

In [11]:
# Load LLaMA tokenizer
# NOTE(review): trust_remote_code=True allows code shipped in the model repo
# to run locally -- confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load the base causal LM using the quantization settings imported from utils
# (`bnb_config`) and let `device_map='auto'` decide weight placement.
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             quantization_config=bnb_config, 
                                             device_map='auto', 
                                             trust_remote_code=True)
# Turn off the generation cache on the model config for training.
model.config.use_cache = False
# presumably selects the non-tensor-parallel code path -- TODO confirm
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
print_trainable_parameters(model, soft_prompt_config)


trainable params: 409,600 || all params: 6,738,825,216 || trainable%: 0.006078210769252277


In [13]:
print_trainable_parameters(model, hard_prompt_config)


trainable params: 28,672 || all params: 6,738,444,288 || trainable%: 0.00042549880617192095


## Make sure every training example still ends with the EOS token after truncation
## Truncation is needed to avoid CUDA out-of-memory errors

In [15]:
def contain_eos_filter_llama2(data_point, dataset_text_field="text"):
    """Tell whether a sample still ends with EOS once tokenized and truncated.

    Args:
        data_point: Mapping for one dataset row; the text is read from
            ``data_point[dataset_text_field]``.
        dataset_text_field: Key of the text column to tokenize.

    Returns:
        True when the last id of the encoding (truncated to the notebook's
        ``max_length``) equals ``tokenizer.eos_token_id``, otherwise False.
    """
    encoding = tokenizer(
        data_point[dataset_text_field],
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    final_token_id = encoding["input_ids"][-1]
    # For Llama 2 the EOS id is 2; compare against the tokenizer's own value.
    return final_token_id == tokenizer.eos_token_id

# Apply the filtering function
train_dataset = train_dataset.filter(contain_eos_filter_llama2)

In [16]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 299
})

In [17]:
# Set supervised fine-tuning parameters
# The trainer is built by the `transformer_trainer` helper from utils using
# the hard-prompt configuration; results are written under "llama2_results".
# NOTE(review): the meaning of `label_text_field=True` is defined inside
# utils.transformer_trainer, which is not visible here -- TODO confirm.
trainer = transformer_trainer(model, hard_prompt_config, tokenizer, train_dataset, dataset_text_field="text", label_text_field=True, output_dir="llama2_results")

# Train model
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so
# the recorded run did not finish training.
trainer.train()



Map:   0%|          | 0/299 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss



KeyboardInterrupt



# make predictions

In [None]:
def run(prompt,
        temperature=0.1,
        top_p=0.95,
        top_k=50,
        eos_token_id=None):
    """Stream a sampled completion for ``prompt`` from ``trainer.model``.

    Generation runs on a background thread while this generator consumes the
    streamer, so partial output is available as soon as it is decoded.

    Args:
        prompt: Prompt text. It is tokenized without adding special tokens,
            so any markers such as ``<s>`` must already be part of the string.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        top_k: Top-k sampling cutoff passed to ``generate``.
        eos_token_id: Token id (or list of ids) that ends generation;
            forwarded unchanged to ``generate``.

    Yields:
        The whole text generated so far, with leading whitespace stripped,
        once per chunk delivered by the streamer.
    """
    model = trainer.model
    # Put the inputs on the device the model lives on instead of assuming
    # CUDA; keep 'cuda' as the fallback if the model exposes no `.device`.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to(device)

    streamer = TextIteratorStreamer(tokenizer,
                                    timeout=10.,
                                    skip_prompt=True,
                                    skip_special_tokens=True)
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=96,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        eos_token_id=eos_token_id,
    )
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    # Grow a single string instead of re-joining every chunk on each step.
    generated = ""
    for piece in streamer:
        generated += piece
        yield generated.lstrip()

    # The streamer is exhausted only after generation has finished, so this
    # returns promptly and avoids leaving an un-joined thread behind.
    worker.join()


In [None]:
def genearte_answer(prompt, eos_token_id=None):
    """Print a streamed completion incrementally and return the final text.

    Args:
        prompt: Prompt string forwarded to ``run``.
        eos_token_id: Stop token id(s) forwarded to ``run``.

    Returns:
        The last cumulative text yielded by ``run``, or an empty string when
        nothing was yielded.
    """
    shown = ""
    for snapshot in run(prompt, eos_token_id=eos_token_id):
        # Each snapshot is the full text so far; emit only the unseen tail.
        delta = snapshot[len(shown):]
        print(delta, end='')
        shown = snapshot
    return shown


In [None]:
end_token_ids = []
# for stuff, stuff_id in tokenizer.vocab.items():
#     if stuff.endswith(";"):
#         end_token_ids.append(stuff_id)

In [None]:
index = 80
print(test_dataset["text"][index])

In [None]:
print(test_dataset["output_text"][index])

In [None]:
response = genearte_answer(test_dataset["text"][index], eos_token_id=[tokenizer.eos_token_id]+end_token_ids)

# save adapter

In [None]:
trainer.model.save_pretrained(new_model)

# Merge weights and save

In [None]:
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
import torch
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Fine-tuned model name
# NOTE(review): an earlier cell sets new_model = "llama-2-7b-xin" and the
# adapter is saved under that name via trainer.model.save_pretrained(new_model).
# Re-assigning it here means the following PeftModel.from_pretrained call reads
# from "llama-2-7b-spider" instead of the adapter saved by this notebook --
# confirm which name is intended.
new_model = "llama-2-7b-spider"

In [None]:
# Reload model in FP16 and merge it with LoRA weights
# NOTE(review): the trainer in this notebook was built with
# `hard_prompt_config`, not a LoRA config; if that is a prompt-learning
# adapter, `merge_and_unload()` may not be applicable -- TODO confirm.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Attach the adapter stored under `new_model` to the freshly loaded base model,
# then fold it into the base weights and drop the PEFT wrapper.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
# NOTE(review): no cell visible here actually saves this tokenizer; only the
# merged model is written out afterwards.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
model.save_pretrained('llama2-7b-merged-spider')

# load saved model

In [None]:
import sys
import os

sys.path.append(os.path.abspath('..'))

from utils import (
    progress_generation, 
    load_merged,
    load_base_and_adapter,
    bnb_config
)
from transformers import AutoTokenizer



In [2]:
model = load_merged("llama2-7b-merged-spider", bnb_config)

# model = load_base_and_adapter("meta-llama/Llama-2-7b-chat-hf", "llama-2-7b-spider", quantization_config=bnb_config)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# make predictions using the loaded model

In [7]:
# Collect the id of every vocabulary entry whose text ends with ";" so that
# generation can also stop on those tokens (they are appended to the EOS id
# list passed as `eos_token_id` further down).
end_token_ids = [
    token_id
    for token_text, token_id in tokenizer.vocab.items()
    if token_text.endswith(";")
]

In [11]:
# Pick one test example and show the prompt that will be sent to the model.
# NOTE(review): `test_dataset` is not created anywhere in this "load saved
# model" section, whose cells re-import sys/os/utils and restart the execution
# counter. The prompt printed below is a text-to-SQL sample, unlike the fleet
# example shown earlier, so this cell appears to rely on state from another
# session or data file -- confirm and load the dataset explicitly here.
index = 90
print(test_dataset["text"][index])

<s>[INST] Convert text to SQL:
[Schema (values)]: continents : ContId, Continent | countries : CountryId, CountryName, Continent;
[Column names (type)]: continents : ContId (number)| continents : Continent (text)| countries : CountryId (number)| countries : CountryName (text)| countries : Continent (number);
[Q]: For each continent, list its id, name, and how many countries it has? [/INST]


In [13]:
response = progress_generation(test_dataset["text"][index],
                    model,
                    tokenizer,
                    temperature=0.01,
                    top_p=0.95,
                    top_k=50,
                    max_new_tokens=96,
                    eos_token_id=[tokenizer.eos_token_id]+end_token_ids,
                    show=True)

SELECT T1.contid , T1.continent , count(*) FROM continents AS T1 JOIN countries AS T2 ON T1.contid = T2.continent GROUP BY T1.contid;