In [1]:
# Install/upgrade the fine-tuning stack (Transformers, Datasets, Evaluate, TRL, Unsloth)
# plus python-dotenv for reading the Hub token from a .env file.
# NOTE(review): nothing is version-pinned and -U always pulls the newest release, so a
# fresh run may get different library versions than the recorded one (Unsloth 2024.12.2,
# Transformers 4.46.3, Torch 2.5.1 per the output further down) — consider pinning.
%pip install -U transformers datasets evaluate trl --quiet
%pip install -U unsloth --quiet
%pip install -U python-dotenv --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch

from datasets import Dataset
from huggingface_hub import hf_hub_download

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import FastLanguageModel

import pandas as pd
import numpy as np
import random

from dotenv import load_dotenv
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch (CPU, the current CUDA device and all CUDA devices), then puts
    cuDNN into deterministic mode with benchmarking turned off.
    """
    # Order: stdlib -> NumPy -> PyTorch CPU -> current GPU -> every GPU (multi-GPU).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Benchmark mode is disabled and deterministic mode enabled for reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def load_tsv_dataset(file_path):
    """
    Download a tab-separated file from the ``RichardNooooh/AmazonFeedback``
    dataset repository on the Hugging Face Hub and return it as a ``Dataset``.

    ``file_path`` is the file's path inside that repository. The ``ASIN``
    column is dropped before the frame is converted.
    """
    local_copy = hf_hub_download(
        repo_id="RichardNooooh/AmazonFeedback",
        filename=file_path,
        repo_type="dataset",
    )
    frame = pd.read_csv(local_copy, sep="\t")
    # ASIN is only an identifier, so it is not carried into the dataset.
    return Dataset.from_pandas(frame.drop(columns=["ASIN"]))

# Load variables from ./src/.env into the process environment (python-dotenv), then
# read the Hugging Face token. Indexing os.environ raises KeyError right here if
# HF_TOKEN is missing, instead of failing later when the token is used for the upload.
# NOTE(review): the .env path is relative to the kernel's working directory — confirm
# it resolves from wherever this notebook is run.
load_dotenv("./src/.env")
HF_TOKEN = os.environ["HF_TOKEN"]

In [4]:
# Fix all RNG seeds first, then fetch the training TSV from the Hub dataset repository.
set_seed(42)
dataset = load_tsv_dataset("train/augmented_keywords.tsv")

augmented_keywords.tsv:   0%|          | 0.00/24.1M [00:00<?, ?B/s]

In [5]:
# Base checkpoint: Unsloth's Qwen2.5-1.5B-Instruct, loaded without 4-bit quantization
# and with a 2048-token maximum sequence length.
model_name = "unsloth/Qwen2.5-1.5B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,
    load_in_4bit=False
)

# LoRA adapters are attached because, per the original author's note, Unsloth could
# not do full-parameter optimization when this was written.
# The adapters target every attention projection (q/k/v/o) and every MLP projection
# (gate/up/down).
# NOTE(review): with r = 64, lora_alpha = 16 and use_rslora = False, the conventional
# LoRA scaling factor alpha / r would be 0.25 — confirm that this is intentional.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = False, # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2024.12.2: Fast Qwen2 patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.951 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Unsloth 2024.12.2 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [6]:
# System message placed at the start of every training conversation. The original
# author included it based on this Colab notebook:
# https://colab.research.google.com/drive/1Kose-ucXO1IBaZq5BvbwWieuubP7hxvQ?usp=sharing#scrollTo=QmUBVEnvCDJv
SYSTEM_PROMPT = " ".join([
    "You are a helpful assistant for a business.",
    "You are given a set of Amazon reviews for a given item, one for each rating out of 5,",
    "and tasked with providing actionable feedback to help improve this item.",
    "Please format your response into concise sentences, one for each actionable feedback.",
    "Place each feedback on a bulletpoint.",
])
def preprocess_function(examples):
    """Render a batch of rows as chat-template strings.

    ``examples`` is a batch mapping with parallel ``"text"`` and ``"labels"``
    lists. Each pair becomes a system / user / assistant conversation (the
    ``"text"`` entry as the user turn, the ``"labels"`` entry as the assistant
    turn) and is passed through the tokenizer's chat template without
    tokenizing. Returns ``{"text": [...]}`` holding the rendered strings.
    """
    def build_conversation(user_turn, assistant_turn):
        # Fixed three-message layout shared by every example.
        return [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ]

    user_turns, assistant_turns = examples["text"], examples["labels"]
    rendered = [
        tokenizer.apply_chat_template(
            build_conversation(user_turn, assistant_turn), tokenize=False
        )
        for user_turn, assistant_turn in zip(user_turns, assistant_turns)
    ]
    return {"text": rendered}

# Render every row through the chat template in batches, then hold out 20% of the
# rows for evaluation (fixed split seed).
# NOTE(review): `dataset` is rebound twice here, so this cell is not idempotent —
# re-running it without first re-running the loading cell would operate on the
# already formatted and split data.
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/7713 [00:00<?, ? examples/s]

In [7]:
# Hyperparameters for the fine-tuning run. Evaluation and checkpointing both happen
# once per epoch; metrics are logged to TensorBoard under ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` spelling. It is
    # accepted by the Transformers release this notebook was recorded with (4.46.3)
    # and by later releases, which matters because the install cell is unpinned.
    eval_strategy="epoch",
    learning_rate=3e-5,
    # 4 samples per device x 4 accumulation steps = 16 samples per optimizer step
    # on a single GPU.
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    bf16=True,
    weight_decay=0.01,
    # Checkpoint at the end of each epoch and keep only the most recent one.
    # No `save_steps` here: it is only consulted when save_strategy="steps".
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=100,
    report_to="tensorboard",
    remove_unused_columns=True,
    warmup_steps=20,
    max_grad_norm=1.0,
)


# Supervised fine-tuning on the pre-rendered chat strings stored in the "text" column,
# using the 80/20 train/test split for training and per-epoch evaluation.
# NOTE(review): passing tokenizer / dataset_num_proc / max_seq_length /
# dataset_text_field directly to SFTTrainer worked for the recorded run; newer TRL
# releases are understood to expect these on SFTConfig (and `processing_class` in
# place of `tokenizer`) — confirm against the installed TRL version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_num_proc = 2,
    max_seq_length = 2048,
    dataset_text_field = "text",
)



Map (num_proc=2):   0%|          | 0/6170 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1543 [00:00<?, ? examples/s]

In [8]:
# Run the fine-tuning loop. In the recorded run (one NVIDIA L4 GPU) this took 3,850
# steps over 10 epochs and about 14,955 s (roughly 4.2 hours), so expect a long cell.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,170 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 3,850
 "-____-"     Number of trainable parameters = 73,859,072


Epoch,Training Loss,Validation Loss
0,1.895,1.863197
1,1.8403,1.833073
2,1.8152,1.818762
3,1.7995,1.81152
4,1.7831,1.806772
5,1.7701,1.804672
6,1.7702,1.803055
7,1.7576,1.802084
8,1.7545,1.801829
9,1.7452,1.801762


TrainOutput(global_step=3850, training_loss=1.8030097545276988, metrics={'train_runtime': 14954.5872, 'train_samples_per_second': 4.126, 'train_steps_per_second': 0.257, 'total_flos': 4.9222859297273856e+17, 'train_loss': 1.8030097545276988, 'epoch': 9.986390149060272})

In [9]:
# Upload the fine-tuned model and its tokenizer to a private Hugging Face Hub
# repository, authenticating with the token read from the environment.
# Per the recorded output, the model file uploaded is adapter_model.safetensors
# (~295 MB) — presumably the LoRA adapter only, not merged full weights.
output_repo = "RichardNooooh/Qwen2.5-1.5B-AF-Augmented-Keywords"

model.push_to_hub(output_repo, private=True, token=HF_TOKEN)
tokenizer.push_to_hub(output_repo, private=True, token=HF_TOKEN)

README.md:   0%|          | 0.00/584 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/295M [00:00<?, ?B/s]

Saved model to https://huggingface.co/RichardNooooh/Qwen2.5-1.5B-AF-Augmented-Keywords


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]