# Finetuning Lamma 3.0

In [1]:
!pip install accelerate peft bitsandbytes transformers trl==0.11 wandb torch python-dotenv pyyaml



In [2]:
!pip install -U datasets



In [3]:
!pip install accelerate==0.27.2



In [4]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    EarlyStoppingCallback, 
    IntervalStrategy
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from datasets import load_dataset
from dotenv import load_dotenv
import os
import yaml
import torch
import os
from huggingface_hub import login
import os
import wandb

load_dotenv("config.env")

with open('config.yaml') as file:
    config= yaml.safe_load(file)
    #print(config['hugginfaces']['model_name'])

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
print("PyTorch version:", torch.__version__)
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
else:
    print("CUDA is not available.")

login(token='HUGGINGFACE_TOKEN')
os.environ["WANDB_PROJECT"] = "finetuning"
os.environ["WANDB_NAME"]= "finetuning-100"
os.environ["WANDB_API_KEY"] = "WANDB_TOKEN"
wandb.login()
wandb.init(dir=config['cache_dir'])

2025-03-03 23:36:53.557295: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-03 23:36:53.572905: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-03 23:36:53.577839: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-03 23:36:53.588908: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[34m[1mwandb[0m: Using wandb-cor

PyTorch version: 2.3.1.post300
CUDA version: 12.0


[34m[1mwandb[0m: Currently logged in as: [33mlpodo[0m ([33mdeepvizlab[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
def build_dataset(dataset):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """

    def add_eos_token(sample):
        sample['text'] = '<s>' + sample['text']+ '</s>'
        return sample

    dataset = dataset.map(add_eos_token)
    return dataset

# Training

In [6]:
#load the dataset from hugging face
dataset = load_dataset("LucaPodo/newton-dataset-v1", revision="0.0.2",download_mode="force_redownload", cache_dir="./tmp" )

(…)-00000-of-00001-653f5eb8e3f7e385.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

(…)-00000-of-00001-51e5da44a405016d.parquet:   0%|          | 0.00/349k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12570 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2455 [00:00<?, ? examples/s]

In [7]:
train = build_dataset(dataset['train'])
test = build_dataset(dataset['test'])

Map:   0%|          | 0/12570 [00:00<?, ? examples/s]

Map:   0%|          | 0/2455 [00:00<?, ? examples/s]

In [8]:
print(train[0])

{'text': "<s>Medium @ Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n### Instruction:\nBar chart x axis product name y axis how many product name , rank by the Y-axis in desc .\n### Input:\n[('product_id', 'numeric'), ('product_type_code', 'categorical'), ('product_name', 'categorical'), ('product_price', 'categorical')]\n### Response:\nmark bar data products encoding x product_name y aggregate count product_name transform group x sort y desc\n</s>"}


In [9]:
def extract_hardness(sample):
    sample['hardness'] = sample['text'].split("@")[0].strip()
    text = sample['text'].split('@')[1]
    sample['text'] = '<s>' + text
    return sample

def clean_training_sample(sample):
    # Extract the hardness level
    sample['hardness'] = sample['text'].split("@")[0].strip()
    # Extract the main text after '@'
    text = sample['text'].split('@')[1]
    # Keep the instruction and response parts
    instruction_part = text.split("### Instruction:")[1].split("### Input:")[0].strip()
    input_part = text.split("### Input:")[1].split("### Response:")[0].strip()
    thought_process = "1. **Understand the data**: Identify numerical and categorical columns from the input.\n 2. **Determine the chart type**: Choose an appropriate mark (`bar`, `line`, `point`, `arc`) based on the instruction and data type.\n 3. **Define encoding**: Assign x-axis, y-axis, aggregation function (if applicable), and any color mapping.\n 4. **Identify transformations**: Determine whether filtering, binning, grouping, sorting, or top-k selection is required.\n 5. **Flatten into a Vega-Zero specification**: Convert the reasoning into the keyword-based format required by Vega-Zero.\n"
    response_part = text.split("### Response:")[1].strip()
    # Rebuild the cleaned text
    intro = "Generate a Vega-Zero unit specification based on the given dataset and visualization requirements. Identify the appropriate chart type, relevant data columns, encoding choices, and necessary transformations. Vega-Zero keeps most of the keywords of the Vega-Lite about the mapping between visual encoding channels and (transformed) data variables. It ﬂattens a JSON object into a sequence of keywords by removing structure-aware symbols such as brackets, colons, and quotation marks. Formally, a unit speciﬁcation in Vega-Zero is a four tuple (similar to Vega-Lite but with each tuple being a sequence) as: unit = (mark, data, encoding, transform) Naturally, as a simpliﬁcation of Vega-Lite: mark denotes the chart type, including bar, line, point (for scatter chart), arc (for pie chart); data speciﬁes the source data; encoding contains x/y-axis, aggregate function, and color based on which column; transform deﬁnes some data transformation functions: ﬁlter, bin, group, sort, and top-k."
    sample['text'] = f"<s>### Instruction:\n{instruction_part}\n### Input:\n{input_part}\n###Thought Process:\n{thought_process}\n### Response:\n{response_part}</s>"
    return sample

train = train.map(clean_training_sample, batched=False)
test = test.map(clean_training_sample, batched=False)

print(train[0])

train = train.filter(lambda ds: ds['hardness'] == "<s>Easy" or ds['hardness'] == "<s>Medium")
test = test.filter(lambda ds: ds['hardness'] == "<s>Extra Hard" or ds['hardness'] == "<s>Extra Hard")

print(train[0])

train = train.select(range(400)) 

Map:   0%|          | 0/12570 [00:00<?, ? examples/s]

Map:   0%|          | 0/2455 [00:00<?, ? examples/s]

{'text': "<s>### Instruction:\nBar chart x axis product name y axis how many product name , rank by the Y-axis in desc .\n### Input:\n[('product_id', 'numeric'), ('product_type_code', 'categorical'), ('product_name', 'categorical'), ('product_price', 'categorical')]\n###Thought Process:\n1. **Understand the data**: Identify numerical and categorical columns from the input.\n 2. **Determine the chart type**: Choose an appropriate mark (`bar`, `line`, `point`, `arc`) based on the instruction and data type.\n 3. **Define encoding**: Assign x-axis, y-axis, aggregation function (if applicable), and any color mapping.\n 4. **Identify transformations**: Determine whether filtering, binning, grouping, sorting, or top-k selection is required.\n 5. **Flatten into a Vega-Zero specification**: Convert the reasoning into the keyword-based format required by Vega-Zero.\n\n### Response:\nmark bar data products encoding x product_name y aggregate count product_name transform group x sort y desc\n</s><

Filter:   0%|          | 0/12570 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2455 [00:00<?, ? examples/s]

{'text': "<s>### Instruction:\nBar chart x axis product name y axis how many product name , rank by the Y-axis in desc .\n### Input:\n[('product_id', 'numeric'), ('product_type_code', 'categorical'), ('product_name', 'categorical'), ('product_price', 'categorical')]\n###Thought Process:\n1. **Understand the data**: Identify numerical and categorical columns from the input.\n 2. **Determine the chart type**: Choose an appropriate mark (`bar`, `line`, `point`, `arc`) based on the instruction and data type.\n 3. **Define encoding**: Assign x-axis, y-axis, aggregation function (if applicable), and any color mapping.\n 4. **Identify transformations**: Determine whether filtering, binning, grouping, sorting, or top-k selection is required.\n 5. **Flatten into a Vega-Zero specification**: Convert the reasoning into the keyword-based format required by Vega-Zero.\n\n### Response:\nmark bar data products encoding x product_name y aggregate count product_name transform group x sort y desc\n</s><

In [10]:
print(train[0])

{'text': "<s>### Instruction:\nBar chart x axis product name y axis how many product name , rank by the Y-axis in desc .\n### Input:\n[('product_id', 'numeric'), ('product_type_code', 'categorical'), ('product_name', 'categorical'), ('product_price', 'categorical')]\n###Thought Process:\n1. **Understand the data**: Identify numerical and categorical columns from the input.\n 2. **Determine the chart type**: Choose an appropriate mark (`bar`, `line`, `point`, `arc`) based on the instruction and data type.\n 3. **Define encoding**: Assign x-axis, y-axis, aggregation function (if applicable), and any color mapping.\n 4. **Identify transformations**: Determine whether filtering, binning, grouping, sorting, or top-k selection is required.\n 5. **Flatten into a Vega-Zero specification**: Convert the reasoning into the keyword-based format required by Vega-Zero.\n\n### Response:\nmark bar data products encoding x product_name y aggregate count product_name transform group x sort y desc\n</s><

In [8]:
print(train[0]['text'])

<s>### Instruction:
Bar chart x axis product name y axis how many product name , rank by the Y-axis in desc .
### Input:
[('product_id', 'numeric'), ('product_type_code', 'categorical'), ('product_name', 'categorical'), ('product_price', 'categorical')]
### Response:
mark bar data products encoding x product_name y aggregate count product_name transform group x sort y desc
</s></s>


In [10]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, config['bitsandbytes']['bnb_4bit_compute_dtype'])

bnb_config = BitsAndBytesConfig(
    load_in_4bit=config['bitsandbytes']['use_4bit'],
    bnb_4bit_quant_type=config['bitsandbytes']['bnb_4bit_quant_type'],
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=config['bitsandbytes']['use_nested_quant'],
)


# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and config['bitsandbytes']['use_4bit']:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    config['fine_tuning']['model']['ref_model_name'],
    quantization_config=bnb_config,
    device_map = {"": 0}
)
model.config.use_cache = config['fine_tuning']['model']['use_cache']
model.config.pretraining_tp = config['fine_tuning']['model']['pretraining_tp']

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
tokenizer = AutoTokenizer.from_pretrained(config['fine_tuning']['model']['ref_model_name'],
                                          trust_remote_code=True,
                                          add_bos_token=False)#, add_eos_token=True, use_fast=True)
tokenizer.add_special_tokens(config['fine_tuning']['tokenizer']['padding_token'])
tokenizer.padding_side = config['fine_tuning']['tokenizer']['padding_side']

model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

assert model.config.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"

print("BOS Token:", tokenizer.bos_token_id)
print("EOS Token:", tokenizer.eos_token_id)
print("PAD Token:", tokenizer.pad_token_id)
print(tokenizer.all_special_tokens)
sample_sentence = train['text'][0]

# Tokenize without special tokens
tokenized_output_no_special = tokenizer(sample_sentence, add_special_tokens=True)
print("Without special tokens:")
print("Tokenized Text:", [tokenizer.decode([x]) for x in tokenized_output_no_special["input_ids"]])
print("Token IDs:", tokenized_output_no_special["input_ids"])
print("Token IDs:", tokenized_output_no_special["attention_mask"])

BOS Token: 128000
EOS Token: 128001
PAD Token: 128256
['<|begin_of_text|>', '<|end_of_text|>', '[PAD]']
Without special tokens:
Tokenized Text: ['<|begin_of_text|>', '<s', '>', '###', ' Instruction', ':\n', 'Bar', ' chart', ' x', ' axis', ' product', ' name', ' y', ' axis', ' how', ' many', ' product', ' name', ',', ' rank', ' by', ' the', ' Y', '-axis', ' in', ' desc', '.\n', '###', ' Input', ':\n', '[', "('", 'product', '_id', "',", " '", 'numeric', "'),", " ('", 'product', '_type', '_code', "',", " '", 'c', 'ategorical', "'),", " ('", 'product', '_name', "',", " '", 'c', 'ategorical', "'),", " ('", 'product', '_price', "',", " '", 'c', 'ategorical', "')]\n", '###', ' Response', ':\n', 'mark', ' bar', ' data', ' products', ' encoding', ' x', ' product', '_name', ' y', ' aggregate', ' count', ' product', '_name', ' transform', ' group', ' x', ' sort', ' y', ' desc', '\n', '</', 's', '></', 's', '>']
Token IDs: [128000, 45147, 29, 14711, 30151, 512, 3511, 9676, 865, 8183, 2027, 836, 37

In [12]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=config['lora']['lora_alpha'],
    lora_dropout=config['lora']['lora_dropout'],
    r=config['lora']['lora_r'],
    bias=config['lora']['bias'],
    task_type=config['lora']['task_type'],
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=config['fine_tuning']['model']['output_dir'],
    num_train_epochs= config['fine_tuning']['hyperparameters']['num_train_epochs'],
    per_device_train_batch_size= config['fine_tuning']['hyperparameters']['per_device_train_batch_size'] ,
    gradient_accumulation_steps= config['fine_tuning']['hyperparameters']['gradient_accumulation_steps'] ,
    optim=config['fine_tuning']['hyperparameters']['optim'],
    save_steps=config['fine_tuning']['hyperparameters']['save_steps'],
    logging_steps=config['fine_tuning']['hyperparameters']['logging_steps'],
    learning_rate=config['fine_tuning']['hyperparameters']['learning_rate'],
    weight_decay=config['fine_tuning']['hyperparameters']['weight_decay'],
    fp16=config['fine_tuning']['hyperparameters']['fp16'],
    bf16=config['fine_tuning']['hyperparameters']['bf16'],
    max_grad_norm=config['fine_tuning']['hyperparameters']['max_grad_norm'],
    max_steps=config['fine_tuning']['hyperparameters']['max_steps'],
    warmup_ratio=config['fine_tuning']['hyperparameters']['warmup_ratio'],
    group_by_length=config['fine_tuning']['hyperparameters']['group_by_length'],
    lr_scheduler_type=config['fine_tuning']['hyperparameters']['lr_scheduler_type'],
    report_to=config['fine_tuning']['hyperparameters']['report_to'],
    load_best_model_at_end = True,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 50,
    metric_for_best_model='eval_loss'
)

print('1')

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    eval_dataset = test,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=200,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=config['fine_tuning']['sft']['packing'],
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
print('2')
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(config['fine_tuning']['model']['output_dir'])
trainer.tokenizer.save_pretrained(config['fine_tuning']['tokenizer']['output_dir'])

1



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]



2




Step,Training Loss,Validation Loss
50,1.0902,1.107151
100,0.9992,0.99917




('./models/larger-tokenizer/tokenizer_config.json',
 './models/larger-tokenizer/special_tokens_map.json',
 './models/larger-tokenizer/tokenizer.json')

In [13]:
#dataset_location= config['hugginfaces']['dataset']['name']
#test = load_dataset(config['hugginfaces']['dataset']['name'],
                    #split="test", revision="4.0.4", cache_dir = "./tmp")
pipe = pipeline(task="text-generation", model=trainer.model, tokenizer=trainer.tokenizer, max_length=400)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', '

# Push to the hub

In [2]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    config['fine_tuning']['model']['ref_model_name'],
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained(config['fine_tuning']['tokenizer']['output_dir'], trust_remote_code=True)

tokenizer.padding_side = config['fine_tuning']['tokenizer']['padding_side']
base_model.resize_token_embeddings(len(tokenizer))
base_model.config.pad_token_id = tokenizer.pad_token_id

model = PeftModel.from_pretrained(base_model, config['fine_tuning']['model']['output_dir'])
model = model.merge_and_unload()

In [4]:
# quando si modifica il modello si deve successivamente rifare il resize


print("BOS Token:", tokenizer.bos_token_id)
print("EOS Token:", tokenizer.eos_token_id)
print("PAD Token:", tokenizer.pad_token_id)
print(tokenizer.all_special_tokens)
assert model.config.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"

BOS Token: 128000
EOS Token: 128001
PAD Token: 128256
['<|begin_of_text|>', '<|end_of_text|>', '[PAD]']


In [5]:
model.save_pretrained("./models/200-easy-full", save_embedding_layers=True)
tokenizer.save_pretrained("./models/200-easy-full-tokenizer")

('./models/200-easy-full-tokenizer/tokenizer_config.json',
 './models/200-easy-full-tokenizer/special_tokens_map.json',
 './models/200-easy-full-tokenizer/tokenizer.json')

In [2]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    "./models/200-easy-full",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained("./models/200-easy-full-tokenizer", trust_remote_code=True)

tokenizer.padding_side = config['fine_tuning']['tokenizer']['padding_side']
base_model.resize_token_embeddings(len(tokenizer))
base_model.config.pad_token_id = tokenizer.pad_token_id

In [4]:
# login(token=os.getenv('hf_write_token'))
# model.push_to_hub("DeepvizLab/newton-7b-full")
# tokenizer.push_to_hub("DeepvizLab/newton-7b-full")

In [5]:
dataset_location= "LucaPodo/newton-dataset-v1"
test = load_dataset(dataset_location, split="test", revision="4.0.4", cache_dir = "./tmp")
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=400, eos_token_id=tokenizer.eos_token_id)

i = 100
prompt = test[i]['text'].split("### Response:")[0] + "### Response:\nmark"
prompt = "### Instruction:" + prompt.split('### Instruction:')[1]
print(prompt + '\n')
groundtruth = test[i]['text'].split("### Response:")[1].strip()
print(groundtruth + '\n')

Using the latest cached version of the dataset since LucaPodo/newton-dataset-v1 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at tmp/LucaPodo___newton-dataset-v1/default/0.0.0/4bbe690926a2757f8f60bdff61f63b9558fda168 (last modified on Thu Feb 20 23:26:11 2025).


### Instruction:
Find the number of trains starting from each origin Plot them as bar chart , and order in asc by the y-axis .
### Input:
[('id', 'numeric'), ('train_number', 'numeric'), ('name', 'categorical'), ('origin', 'categorical'), ('destination', 'categorical'), ('time', 'categorical'), ('interval', 'categorical')]
### Response:
mark

mark bar data train encoding x origin y aggregate count origin transform group x sort y asc



In [6]:
result = pipe(prompt)
print(result[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


### Instruction:
Find the number of trains starting from each origin Plot them as bar chart , and order in asc by the y-axis .
### Input:
[('id', 'numeric'), ('train_number', 'numeric'), ('name', 'categorical'), ('origin', 'categorical'), ('destination', 'categorical'), ('time', 'categorical'), ('interval', 'categorical')]
### Response:
mark bar data trains encoding x origin y aggregate count origin transform group x sort y asc
### Min Score:
0.95
