In [None]:
#https://huggingface.co/blog/dvgodoy/fine-tuning-llm-hugging-face
import torch # for tensor computation
import pandas as pd # for data manipulation
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
import transformers # for model training
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

In [None]:
# ----------------------------------------- RUN ON WINDOWS ------------------------------------
# CUDA + bitsandbytes loading path. It needs an NVIDIA GPU and the `bitsandbytes`
# package installed. The guard lets "Restart & Run All" skip this cell on machines
# without CUDA instead of failing while the quantized model is being built.
if torch.cuda.is_available():
    # Quantization config - stores the weights in 4 bits so the model uses less memory.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        # dtype the computations run in once the 4-bit weights have been dequantized
        bnb_4bit_compute_dtype=torch.float16,
    )

    # loading in the model and tokenizer
    model_name = "deepseek-ai/deepseek-llm-7b-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="cuda:0", quantization_config=bnb_config
    )

PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
# ----------------------------------------- RUN ON MAC ------------------------------------
# Non-CUDA loading path (Apple Silicon / CPU). Guarded so that on a CUDA machine a
# top-to-bottom run does not replace the model loaded by the CUDA cell with an
# unquantized copy.
if not torch.cuda.is_available():
    # Mac-friendly dtype: bfloat16 when the MPS backend is available, float32 otherwise
    torch_dtype = torch.bfloat16 if torch.backends.mps.is_available() else torch.float32

    # Model and tokenizer
    model_name = "deepseek-ai/deepseek-llm-7b-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Let accelerate place the weights automatically; whatever does not fit in
    # memory is offloaded to the "offload" folder on disk.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch_dtype,
        device_map="auto",
        offload_folder="offload",  # folder used to store offloaded weights
    )

Loading checkpoint shards: 100%|██████████| 2/2 [00:29<00:00, 14.85s/it]
Some parameters are on the meta device because they were offloaded to the disk.


In [None]:
#Running on GPU
# A model loaded with `device_map=...` has already been placed by accelerate (and may
# be partly offloaded to cpu/disk); calling `.to()` on it raises a RuntimeError.
# Only move the model manually when CUDA exists and no device map was used.
if torch.cuda.is_available() and getattr(model, "hf_device_map", None) is None:
    model = model.to("cuda:0")

#checking how much memory our model needs (bytes converted to megabytes)
print("RAM needed:", model.get_memory_footprint()/1e6, "MB")

You shouldn't move a model that is dispatched using accelerate hooks.


RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [None]:
# Builds the text prompt that is fed to the model.
# The "notation" argument is slated for removal and can be ignored for now.
def generate_prompt(speechtotext, notation=None, eos_token="</s>"):
  """Return the instruction prompt for one speech-to-text string.

  The prompt consists of three parts joined by a single space: the instruction
  line, the spoken text followed by a newline, and a "Chess Notation:" field.
  When `notation` is truthy the field holds the notation followed by a space and
  `eos_token`; otherwise the field is left empty.
  """
  if notation:
    answer = notation + ' ' + eos_token
  else:
    answer = ''
  header = "Convert this string into SAN chess notation:\n"
  return f"{header} {speechtotext}\n Chess Notation: {answer} "

In [None]:
# Load the train and test splits from the local CSV files
dataset = load_dataset(
    "csv",
    data_files={"train": "Train_Data.csv", "test": "Test_Data.csv"},
)
train_data, test_data = dataset["train"], dataset["test"]

In [None]:
# LoRA configuration: the weight update (delta W) is decomposed into smaller matrices
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,              # rank of the decomposition
    lora_alpha=8,     # alpha value
    lora_dropout=0.1,
    bias="none",
    # Names of the sub-modules that receive LoRA adapters
    # (presumably the attention query/key/value/output projections - confirm against the model).
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

In [None]:
# Adjust the tokenizer and prepare the model for LoRA training
tokenizer.add_special_tokens({"pad_token": "<PAD>"})

# Only grow the embedding matrix when the tokenizer has outgrown it. Resizing to
# len(tokenizer) unconditionally would shrink the matrix whenever the model was
# built with more embedding rows than the tokenizer has tokens.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# prepare_model_for_kbit_training targets 4-/8-bit quantized models. On a model that
# was not quantized it upcasts the half-precision weights to float32, which greatly
# increases memory use, so it is skipped for the non-quantized loading path.
if getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False):
    model = prepare_model_for_kbit_training(model)

# Wrap the base model with the LoRA adapters described by lora_config
model = get_peft_model(model, lora_config)

In [None]:
# Hyperparameters for the fine-tuning run, grouped by concern.

# --- Output, checkpointing and logging ---
output_dir = "practise"   # directory that receives the model and its checkpoints
save_steps = 10           # a checkpoint is written every `save_steps` steps
logging_steps = 10        # training metrics are logged every `logging_steps` steps

# --- Batch sizes ---
per_device_train_batch_size = 4   # training samples per device per step
gradient_accumulation_steps = 4   # steps whose gradients are accumulated before a weight update
per_device_eval_batch_size = 4    # evaluation samples per device per step
eval_accumulation_steps = 4       # evaluation steps whose results are accumulated

# --- Optimisation ---
optim = "adamw_torch"     # Adam variant with weight decay
learning_rate = 5e-4      # step size for the weight updates
max_grad_norm = 0.3       # gradient-clipping threshold

# --- Schedule ---
max_steps = 1                   # total number of training steps
warmup_ratio = 0.03             # fraction of the total steps used for learning-rate warmup
lr_scheduler_type = "constant"  # "constant" keeps the learning rate fixed

# --- Evaluation ---
evaluation_strategy = "steps"   # evaluate every few steps

"""
Fine-tuning in general works by taking a pre-trained model and adapting it to a specific task or dataset.
Pre-trained models, such as large language models, are trained on massive datasets to learn general features
and patterns in the data. Fine-tuning leverages this pre-trained knowledge and adjusts the model weights
to perform well on a smaller, task-specific dataset.

The process of fine-tuning typically involves:
1. Loading a pre-trained model: A model that has already been trained on a large dataset is used as the starting point.
2. Preparing the dataset: The task-specific dataset is preprocessed and formatted to match the model's input requirements.
3. Configuring training parameters: Hyperparameters such as learning rate, batch size, and optimizer are set to control the training process.
4. Training the model: The model is trained on the task-specific dataset, updating its weights to minimize the loss function.
5. Evaluating the model: The model's performance is evaluated on a validation or test dataset to ensure it generalizes well.
6. Saving the fine-tuned model: The final model is saved for inference or further fine-tuning.

Fine-tuning is a powerful technique because it allows leveraging the knowledge of large pre-trained models
while adapting them to specific tasks with relatively small datasets. This reduces the computational cost
and time required compared to training a model from scratch.
"""

# Bundle the hyperparameters defined above into the TrainingArguments object
# that is handed to the trainer.
training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` keyword is deprecated and rejected by recent
    # transformers releases.
    eval_strategy=evaluation_strategy,
    save_steps=save_steps,
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    ddp_find_unused_parameters=False,
    eval_accumulation_steps=eval_accumulation_steps,
    per_device_eval_batch_size=per_device_eval_batch_size,
)



In [None]:
# Function to format training prompts
def formatting_func(prompt):
  """Turn dataset rows into prompt text via `generate_prompt`.

  Args:
    prompt: a dataset example or batch exposing a "speechtotext" field. The field
      is either a single string (one example) or a sequence of strings (a batch).

  Returns:
    One prompt string for a single example, or a list of prompt strings (one per
    row) for a batch.
  """
  speech = prompt["speechtotext"]

  # Single example: a bare string must not be iterated, or every character would
  # become its own prompt.
  if isinstance(speech, str):
    return generate_prompt(speech)

  # Batch: pass each text itself to generate_prompt. Wrapping the column in zip()
  # would hand over 1-tuples and leak "('...',)" into the prompt text.
  # NOTE(review): no target notation is passed here, so the formatted text contains
  # no answer for the model to learn from - confirm this is intended.
  return [generate_prompt(text) for text in speech]

In [None]:
# Set up the SFTTrainer with the model, datasets, and training arguments
trainer = SFTTrainer(
    # `model` has already been wrapped with the LoRA adapters by get_peft_model,
    # so no `peft_config` is passed; supplying both would configure LoRA twice.
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    formatting_func=formatting_func,
    # `processing_class` is the current name of the deprecated `tokenizer` argument
    processing_class=tokenizer,
    args=training_args,
)

  trainer = SFTTrainer(
Applying formatting function to train dataset: 100%|██████████| 448/448 [00:00<00:00, 8055.49 examples/s]
Converting train dataset to ChatML: 100%|██████████| 448/448 [00:00<00:00, 8951.60 examples/s]
Applying chat template to train dataset: 100%|██████████| 448/448 [00:00<00:00, 11173.30 examples/s]
Tokenizing train dataset: 100%|██████████| 448/448 [00:00<00:00, 2786.91 examples/s]
Truncating train dataset: 100%|██████████| 448/448 [00:00<00:00, 4928.97 examples/s]
Applying formatting function to eval dataset: 100%|██████████| 100/100 [00:00<00:00, 3585.37 examples/s]
Converting eval dataset to ChatML: 100%|██████████| 100/100 [00:00<00:00, 2302.27 examples/s]
Applying chat template to eval dataset: 100%|██████████| 100/100 [00:00<00:00, 2026.15 examples/s]
Tokenizing eval dataset: 100%|██████████| 100/100 [00:00<00:00, 1699.17 examples/s]
Truncating eval dataset: 100%|██████████| 100/100 [00:00<00:00, 2048.15 examples/s]
No label_names provided for model clas

In [None]:
# Only needed to reload the saved adapter in a fresh session, e.g.
#   PeftModel.from_pretrained(<freshly loaded base model>, peft_model_id)
from peft import PeftModel

# Directory the final LoRA adapter is written to. A step checkpoint such as
# "checkpoint-10" only exists when max_steps reaches save_steps, so the explicitly
# saved "final" directory is the reliable location.
peft_model_id = f"{output_dir}/final"

trainer.train()
trainer.save_model(peft_model_id)

# Use the fine-tuned model that is already in memory for inference. `model` is
# already a PEFT model, so wrapping it again with PeftModel.from_pretrained would
# stack a second adapter wrapper on top of the trained one.
peft_model = trainer.model
peft_model.eval()  # switch off dropout for generation

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss




In [None]:
# Generate a prompt for the model
input_prompt = generate_prompt("Night to F3")

# Keep the full tokenizer output (input_ids AND attention_mask) and move it to the
# device the model is on, instead of assuming a CUDA device.
inputs = tokenizer(
    input_prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    return_attention_mask=True,
).to(peft_model.device)
input_tokens = inputs["input_ids"]

# Generate output using the model
# Autocast is enabled only when CUDA is actually available.
with torch.amp.autocast("cuda", enabled=torch.cuda.is_available()):
  generation_output = peft_model.generate(
      input_ids=input_tokens,
      # Pass the mask explicitly so generate() does not have to guess it
      attention_mask=inputs["attention_mask"],
      max_new_tokens=100,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
      # Use the tokenizer's pad token rather than letting generate() fall back to EOS
      pad_token_id=tokenizer.pad_token_id,
    )

op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Convert this string into SAN chess notation:
 Night to F3
 Chess Notation:  1.f4 Nc6 2.e5 Ne7 3.Nf3 d6 4.Bb5 c5 5.d4 Nd8 6.0-0 Bg4 7.h3 Bh5 8.Qa4+ Qd7 9.Bg5 O-O 10.Rac1 e6 11.Be3 f5 12.exf5 gxf5 1
