In [1]:
%pip install -U transformers datasets accelerate peft bitsandbytes trl
%pip install -U python-dotenv
%pip install unsloth

Collecting torch>=1.10.0 (from accelerate)
  Using cached torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Using cached torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl (906.4 MB)
Installing collected packages: torch
Successfully installed torch-2.5.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch

from datasets import Dataset
from unsloth import FastLanguageModel

from trl import SFTTrainer

from transformers import (
    TrainingArguments,
    Trainer
)
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
def set_seed(seed):
    """Set seed for reproducibility"""
    # Set seed for Python's built-in random module
    random.seed(seed)

    # Set seed for numpy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # disable to ensure reproducibility

def load_tsv_dataset(file_path):
    """
    Load the TSV file containing reviews and responses.
    """
    df = pd.read_csv(file_path, sep="\t")[:256]
    df = df.drop(columns=["ASIN"]) # ignore ID
    return Dataset.from_pandas(df)


In [4]:
set_seed(42)
dataset = load_tsv_dataset("./data/final/baseline.tsv")

In [5]:
model_name = "unsloth/Qwen2.5-1.5B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    # Can select any from the below:
    # "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B"
    # "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
    # And also all Instruct versions and Math. Coding verisons!
    model_name = model_name,
    # max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = False,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.11.11: Fast Qwen2 patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 2070 SUPER. Max memory: 8.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [7]:
# Apparently needed according to https://colab.research.google.com/drive/1Kose-ucXO1IBaZq5BvbwWieuubP7hxvQ?usp=sharing#scrollTo=QmUBVEnvCDJv
EOS_TOKEN = tokenizer.eos_token
SYSTEM_PROMPT = (
    "You are a helpful assistant for a business. "
    + "You are given a set of Amazon reviews for a given item, one for each rating out of 5, "
    + "and tasked with providing actionable feedback to help improve this item. "
    + "Please format your response into concise sentences, one for each actionable feedback. "
    + "Place each feedback on a bulletpoint."
)
def preprocess_function(example):
    # inputs = tokenizer(example["reviews"], padding="max_length", truncation=True)
    # labels = tokenizer(example["feedback"], padding="max_length", truncation=True)
    # inputs["labels"] = labels["input_ids"]

    conversation = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": example["reviews"]
        }
    ]

    example["text"] = tokenizer.apply_chat_template(conversation, tokenize=False) + EOS_TOKEN
    example["labels"] = example["feedback"]
    return example
test = preprocess_function({"reviews": "Review 1: what the fuck", "feedback": "- fix shit\n-fix this shit too"})
print(test)
# dataset = dataset.map(preprocess_function, batched=True)


'<|im_start|>system\nYou are a useful assistant.<|im_end|>\n<|im_start|>user\nWrite me a poem.<|im_end|>\n'

In [21]:
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    bf16=True,
    weight_decay=0.01,
    save_strategy="epoch",
    save_steps=1000,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=100,
    report_to="tensorboard",
    remove_unused_columns=True,
    warmup_steps=10
)


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)



In [25]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 GiB. GPU 0 has a total capacity of 8.00 GiB of which 3.50 GiB is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 3.26 GiB is allocated by PyTorch, and 188.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)