In [60]:
import torch
torch.cuda.is_available()

True

## Initialize the LLM

In [61]:
from unsloth import FastLanguageModel
# 本质上还是对transformers的包装，但是做了一些优化
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# 打开4bit量化加载
# 所谓量化，是让模型以更低的空间存储这些参数，但是实际计算的时候要反量化尽量用原本的数去计算

### Changing the model here is forbidden !

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-2-7b-bnb-4bit",    ### Do not change the model for any other models or quantization versions
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.9.7: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 4. Max memory: 23.559 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu129. CUDA: 8.6. CUDA Toolkit: 12.9. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Add LoRA adapters

In [62]:
################# TODO : Tweak the LoRA adapter hyperparameters here.  #####################

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, ### TODO : Choose any number > 0 ! Common values are 4, 8, 16, 32, 64, 128. Higher ranks allow more expressive power but also increase parameter count.
    lora_alpha = 16,  ### TODO : Choose any number > 0 ! Suggested 4, 8, 16, 32, 64, 128

# 这里在用unsloth的方法给大模型挂上LoRA适配器


################# TODO  ####################################################################
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [63]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",  ### Use llama-3.1 template for better performance here
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

# 这里直接套用了模板，全套都是hugging face的东西
# 改之前的样子：
# messages = [
#   {"role":"user", "content":"Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Write a short story in third person narration about a protagonist who has to make an important career decision."},
#   {"role":"assistant", "content":"John was at a crossroads in his life. He had just graduated college ..."}
# ]
# 改之后的样子：
# <|start_header_id|>user<|end_header_id|>

# Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Write a short story in third person narration about a protagonist who has to make an important career decision.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

# John was at a crossroads in his life. He had just graduated college and was now facing the big decision of what career to pursue. After much deliberation, he decided that he wanted to be an accountant and help the financially disadvantaged. He had always been good with numbers and enjoyed seeing the tangible results of his work. 
# John enrolled in accounting courses and initially found it quite challenging. He had to learn multiple systems and regulations quickly, but he worked hard and eventually excelled in his studies. After a few years,<|eot_id|>

###

# Dataset Preperation

In [64]:
from datasets import load_dataset, Dataset, load_from_disk

# Load the dataset from Hugging Face
dataset = load_from_disk("./fastchat_alpaca_52k")
print(dataset)
print(dataset[0]["conversations"])
# {
#     "id" : ...,
#     "conversations": [
#         {
#             'content': '...',
#             'role':'user'
#         },
#         {
#             'content': '...',
#             'role':'assistent'
#         }
#     ]
# }

Dataset({
    features: ['id', 'conversations', 'score'],
    num_rows: 52002
})
[{'content': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Give three tips for staying healthy.', 'role': 'user'}, {'content': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'role': 'assistant'}]


In [65]:
# ---------------------------
# Add a "text" field to each example
# ---------------------------
# This function extracts the first assistant message from the conversation
def add_text_field(example):
    # Extract the first message where role == 'assistant'
    assistant_texts = [msg["content"] for msg in example["conversations"] if msg["role"] == "assistant"] # example['conversation']是两个字典的列表
    text = assistant_texts[0] if assistant_texts else ""
    return {"text": text}

# Map the function over the dataset to add the "text" column.
dataset = dataset.map(add_text_field) # 这里的map是新增列

# Print the dataset structure to confirm the new feature.
print(dataset)


Dataset({
    features: ['id', 'conversations', 'score', 'text'],
    num_rows: 52002
})


In [66]:
import json
print(json.dumps(dataset[0],indent = 2,ensure_ascii = False))

{
  "id": "identity_0",
  "conversations": [
    {
      "content": "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Give three tips for staying healthy.",
      "role": "user"
    },
    {
      "content": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.",
      "role": "assistant"
    }
  ],
  "score": 4.5,
  "text": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."
}


In [67]:
# ---------------------------
#################### TODO : Define a helper function for computing conversation length ###############

# The default "conversation length" here refers to the length of the input (human) and output (gpt), you can modify it at your will

def compute_conversation_length(example):
    # Compute total word count across all messages in the 'conversations' field
    return sum(len(message["content"].split()) for message in example["conversations"]) # 这里是把问答的词汇量都加起来


#################### TODO ############################################################################

# ---------------------------
# Simple Sorting Method  (Default)
# ---------------------------
# Sort the dataset from shortest to longest conversation (by word count)
sorted_dataset_simple_list = sorted(dataset, key=compute_conversation_length, reverse=True)

# Convert back to a Dataset object
sorted_dataset_simple = Dataset.from_list(sorted_dataset_simple_list)

print("\nTop examples sorted by simple conversation length:")
for entry in sorted_dataset_simple.select(range(5)):
    print(f"ID: {entry['id']}, Conversation Length: {compute_conversation_length(entry)}")
# ---------------------------



############## Advanced Sorting Method (TODO : Modify the sorting key ##################
# ---------------------------
# Default : Sorting based on Combining conversation length with the 'score' field using a weighted sum.
# Here, we multiply the score by 10 and add it to the conversation length.
def advanced_sort_key(example):
    conversation_len = compute_conversation_length(example)
    score = example["score"]
    return 2e-3 * conversation_len + score * 1

####################################### TODO ###########################################

sorted_dataset_advanced_list = sorted(dataset, key=advanced_sort_key, reverse=True)
# Convert back to a Dataset object
sorted_dataset_advanced = Dataset.from_list(sorted_dataset_advanced_list)

print("\nTop examples sorted by advanced key (combination of conversation length and score):")
for entry in sorted_dataset_advanced.select(range(5)):
    print(f"ID: {entry['id']}, Advanced Key Value: {advanced_sort_key(entry)}")


Top examples sorted by simple conversation length:
ID: identity_45289, Conversation Length: 759
ID: identity_6285, Conversation Length: 670
ID: identity_15102, Conversation Length: 622
ID: identity_18853, Conversation Length: 567
ID: identity_15908, Conversation Length: 558

Top examples sorted by advanced key (combination of conversation length and score):
ID: identity_45289, Advanced Key Value: 6.018
ID: identity_6285, Advanced Key Value: 5.84
ID: identity_18295, Advanced Key Value: 5.6080000000000005
ID: identity_35239, Advanced Key Value: 5.566
ID: identity_39031, Advanced Key Value: 5.566


In [68]:
################# TODO : select the simple or advanced dataset for training ##############

dataset_used = "sorted_dataset_simple" #sorted_dataset_advanced

################# TODO ###################################################################

if dataset_used == "sorted_dataset_simple":
    train_dataset = sorted_dataset_simple.select(range(0,100))    ### You can also select from the middle, e.g. sorted_dataset_simple.select(range(50,150))
else:
    train_dataset = sorted_dataset_advanced.select(range(0,100))

from unsloth.chat_templates import standardize_sharegpt
train_dataset = standardize_sharegpt(train_dataset)
train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)

Unsloth: Standardizing formats (num_proc=64): 100%|██████████| 100/100 [00:18<00:00,  5.38 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 5855.43 examples/s]


In [81]:
import json
print(train_dataset)
print(json.dumps(train_dataset[99], indent = 2, ensure_ascii=False))

Dataset({
    features: ['id', 'conversations', 'score', 'text'],
    num_rows: 100
})
{
  "id": "identity_51486",
  "conversations": [
    {
      "content": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: Create a workable schedule with the given tasks. ### Input: Tasks:\n- Finish math exam preparation\n- Research for upcoming project\n- Clean the room\n- Take evening jog",
      "role": "user"
    },
    {
      "content": "Monday:\n- 8 am to 10 am: Finish math exam preparation\n- 10 am to 12 pm: Research for upcoming project\n- 12 pm to 2 pm: Lunch Break\n- 2 pm to 4 pm: Clean the room\n- 4 pm to 5 pm: Break\n- 5 pm to 7 pm: Take evening jog\n\nTuesday:\n- 8 am to 10 am: Finish math exam preparation\n- 10 am to 12 pm: Research for upcoming project\n- 12 pm to 2 pm: Lunch Break\n- 2 pm to 4 pm: Clean the room\n- 4 pm to 5 pm: Break\n- 5 pm to 7 pm: Take eve

## Visualization

In [70]:
dataset[5]["conversations"]

[{'content': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: Identify the odd one out. ### Input: Twitter, Instagram, Telegram',
  'role': 'user'},
 {'content': 'Telegram', 'role': 'assistant'}]

In [71]:
dataset[5]["text"]

'Telegram'

## Training

In [94]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported


################# TODO : Tweak the training hyperparameters here.  #####################


training_config = {
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "warmup_steps": 10,
    "num_train_epochs": 2,
    "learning_rate": 2e-4,
    "optim": "adamw_8bit",
    "weight_decay": 0.01,
    "lr_scheduler_type": "linear",
    "seed": 3407,   ### Do not modify the seed for reproducibility
}


################# TODO #################################################################

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = training_config["per_device_train_batch_size"],
        gradient_accumulation_steps = training_config["gradient_accumulation_steps"],
        warmup_steps = training_config["warmup_steps"],
        num_train_epochs = training_config["num_train_epochs"], # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = training_config["learning_rate"],
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = training_config["optim"],
        weight_decay = training_config["weight_decay"],
        lr_scheduler_type = training_config["lr_scheduler_type"],
        seed = training_config["seed"],
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=68): 100%|██████████| 100/100 [00:04<00:00, 23.26 examples/s]


In [92]:
sample = trainer.train_dataset[55]["text"]
print(json.dumps(sample, indent = 2))

"<s><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nBelow is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Write a 3-page article discussing the impact of COVID-19 on the global economy<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe coronavirus (COVID-19) pandemic has had an unprecedented impact on the global economy. The spread of the virus has caused drastic disruptions to labor markets, supply chains, and consumer spending, as well as investors\u2019 confidence in the markets. In this article, I will discuss the impact of COVID-19 on the global economy and its potential long-term impacts. \n\nFirst, businesses have been hit hard by the pandemic. Many have had to close their doors due to lockdowns, and even those who remain open are struggling to cope with decreased demand and disrupte

In [89]:
len(trainer.train_dataset)

100

In [93]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=64): 100%|██████████| 100/100 [00:01<00:00, 62.74 examples/s] 


ZeroDivisionError: Unsloth: All labels in your dataset are -100. Training losses will be all 0.
For example, are you sure you used `train_on_responses_only` correctly?
Or did you mask our tokens incorrectly? Maybe this is intended?
Maybe you're using a Llama chat template on a non Llama model for example?

In [95]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 2 | Total steps = 8
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 39,976,960 of 6,778,392,576 (0.59% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.5499
2,1.4776
3,1.5019
4,1.4506
5,1.5785
6,1.4361
7,1.4121
8,1.3856


#### TODO : Curriculum Training  (Optional)
start training the LLM with “easier” examples (e.g., shorter, clearer conversations) and progressively introduce more complex ones.

The total data amount used to train should still not exceed 100 data.

In [None]:
############## TODO : Curriculum Training  ######################

### E.g.
### Step 1. Train on sorted_dataset_simple
### Step 2. Train on sorted_dataset_advanced

## Inference

In [96]:
def parse_true_output(text):
    """
    Extracts the true assistant output from the decoded model output.

    It looks for the assistant header token:
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    and extracts everything after it until the first occurrence of "<|eot_id|>".
    If the assistant header is not found, it falls back to the last occurrence
    of "<|end_header_id|>\n\n". If "<|eot_id|>" is not found, the extraction
    continues until the end of the string.
    """
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    start_index = text.find(assistant_header)
    if start_index != -1:
        start_index += len(assistant_header)
    else:
        # Fallback: use the last occurrence of the generic header ending
        generic_header = "<|end_header_id|>\n\n"
        start_index = text.rfind(generic_header)
        if start_index != -1:
            start_index += len(generic_header)
        else:
            start_index = 0

    end_index = text.find("<|eot_id|>", start_index)
    if end_index == -1:
        end_index = len(text)
    return text[start_index:end_index].strip()

In [None]:
from unsloth.chat_templates import get_chat_template
import json
from datetime import datetime

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Load the test set JSON file (without GPT responses)
with open("/content/ML_Spring2025_HW5/test_set_evol_instruct_150.json", "r") as infile:
    test_data = json.load(infile)

# Dictionary to store inference results
inference_results = {}

# Loop over each data entry in the test set
for index,entry in enumerate(test_data):
    entry_id = entry.get("id", "unknown_id")

    # Build the messages list from the human conversation entries
    # (Test set is expected to have only "human" messages)
    messages = []
    for conv in entry.get("conversations", []):
        if conv.get("from") == "human":
            messages.append({"role": "user", "content": conv.get("value", "")})
        else:
            messages.append({"role": "assistant", "content": conv.get("value", "")})

    # Create inputs using the chat template (required for generation)
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")


################# TODO : Tweak Decoding Parameters here.  #####################


    # Generate model outputs
    outputs = model.generate(
        input_ids=inputs,
        do_sample=True,
        max_new_tokens=100,
        use_cache=True,
        temperature=1.5,
        top_p = 0.9,
        top_k = 30,
    )


################# TODO  ##########################################################

    # Decode the generated tokens
    decoded_outputs = tokenizer.batch_decode(outputs)

    # Parse each output to extract the true assistant response
    parsed_outputs = [parse_true_output(output) for output in decoded_outputs]

    # Store the result for the current entry
    inference_results[entry_id] = {
        "input": messages,
        "output": parsed_outputs
    }

    print(f"Inference completed for entry {entry_id}")


#Write the inference results to the prediction JSON file
with open(f"pred.json", "w") as outfile:
    json.dump(inference_results, outfile, indent=4)
with open(f"training_config.json", "w") as outfile:
    json.dump(training_config, outfile, indent=4)

from google.colab import files
files.download('/content/pred.json')

print("Inference completed for all entries in the test set.")

FileNotFoundError: [Errno 2] No such file or directory: '/content/ML_Spring2025_HW5/test_set_evol_instruct_150.json'