In [3]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth



In [4]:
from unsloth import FastLanguageModel
import torch

model_name = "unsloth/Qwen3-0.6B-unsloth-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.12: Fast Qwen3 patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.6.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Data prep

In [6]:
from google.colab import drive
drive.mount('/content/drive')

trainpath = "/content/drive/My Drive/qwen_mtqe_project/data/toy_train.tsv"
devpath = "/content/drive/My Drive/qwen_mtqe_project/data/toy_dev.tsv"
testpath = "/content/drive/My Drive/qwen_mtqe_project/data/toy_test.tsv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

qwen3_template = (
    "{% for message in messages %}"
        "{% if message['role'] == 'system' %}"
            "<|im_start|>system\n{{ message['content'] }}<|im_end|>\n"
        "{% elif message['role'] == 'user' %}"
            "<|im_start|>user\n{{ message['content'] }}<|im_end|>\n"
        "{% elif message['role'] == 'assistant' %}"
            "<|im_start|>assistant\n{{ message['content'] }}{{ eos_token }}"
        "{% endif %}"
    "{% endfor %}"
)
tokenizer.chat_template = qwen3_template

In [8]:
def tokenize(example):
  system_message = """You are a multilingual translation evaluation expert. Your task is to predict the quality score for translation pairs.
The quality score is a number between -2.0 and 2.0 and could fall into the following categories:
Scoring Criteria:
-2.0 to -1.0: Poor or very poor translation with meaning deviation or severe errors.
-1.0 to 0.0: Flawed translation but understandable.
0.0 to 1.0: Good translation in general.
1.0 to 2.0: Excellent or flawless translation.
The relative position of the scores in this range indicates subtle differences in translation quality."""
  user_message = f"""### Source sentence:
{example['src']}

### MT sentence:
{example['mt']}"""
  assistant_message = f""" {example['zmean']:.4f}"""
  prompt_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
  completion_messages = [
        {"role": "assistant", "content": assistant_message},
    ]
  # We add `add_generation_prompt=True` to the prompt to ensure it ends with the
  # correct tokens to signal the start of an assistant's reply (e.g., `<|im_start|>assistant\n`)
  prompt_ids = tokenizer.apply_chat_template(
      prompt_messages,
      tokenize=True,
      add_generation_prompt=True,
      add_special_tokens=False,
  )

  # Tokenize the completion part, which includes the `eos_token` via the template
  completion_ids = tokenizer.apply_chat_template(
      completion_messages,
      tokenize=True,
      add_special_tokens=False,
  )

  input_ids = prompt_ids + completion_ids

  labels = [-100] * len(prompt_ids) + completion_ids

  attention_mask = [1] * len(input_ids)

  return {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_mask),
    }


In [9]:
import pandas as pd
from datasets import Dataset

traindf = pd.read_csv(trainpath, sep="\t")
devdf = pd.read_csv(devpath, sep="\t")
testdf = pd.read_csv(testpath, sep="\t")

train_dataset = Dataset.from_pandas(traindf)
dev_dataset = Dataset.from_pandas(devdf)
test_dataset = Dataset.from_pandas(testdf)

tokenized_train = train_dataset.map(
    tokenize,
    batched=False,
    remove_columns=train_dataset.column_names,
)

tokenized_dev = dev_dataset.map(
    tokenize,
    batched=False,
    remove_columns=dev_dataset.column_names,
)

tokenized_test = test_dataset.map(
    tokenize,
    batched=False,
    remove_columns=test_dataset.column_names,
)
print(tokenized_train[16])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

{'input_ids': [151644, 8948, 198, 2610, 525, 264, 2745, 49823, 14468, 16460, 6203, 13, 4615, 3383, 374, 311, 7023, 279, 4271, 5456, 369, 14468, 13530, 624, 785, 4271, 5456, 374, 264, 1372, 1948, 481, 17, 13, 15, 323, 220, 17, 13, 15, 323, 1410, 4399, 1119, 279, 2701, 11059, 510, 3326, 5503, 14243, 510, 12, 17, 13, 15, 311, 481, 16, 13, 15, 25, 44673, 476, 1602, 7852, 14468, 448, 7290, 37564, 476, 15386, 5975, 624, 12, 16, 13, 15, 311, 220, 15, 13, 15, 25, 2988, 672, 291, 14468, 714, 48739, 624, 15, 13, 15, 311, 220, 16, 13, 15, 25, 7684, 14468, 304, 4586, 624, 16, 13, 15, 311, 220, 17, 13, 15, 25, 36766, 476, 68516, 14468, 624, 785, 8674, 2309, 315, 279, 12205, 304, 419, 2088, 14807, 26447, 11799, 304, 14468, 4271, 13, 151645, 198, 151644, 872, 198, 14374, 8748, 11652, 510, 1109, 6951, 287, 68834, 47524, 369, 806, 5786, 450, 6767, 11, 1340, 16426, 1413, 1435, 448, 806, 1828, 24004, 47524, 11, 323, 10901, 1435, 827, 904, 3073, 323, 3143, 4545, 382, 14374, 19087, 11652, 510, 101928, 1137

In [10]:
#import data collator for training
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [11]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,   #first run cell model = FastLanguageModel.get_peft_model(...)
    tokenizer = tokenizer,
    train_dataset = tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    args = SFTConfig(
        output_dir="/content/drive/My Drive/qwen_mtqe_project/outputs",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=2,
        optim="paged_adamw_32bit",
        num_train_epochs=5,   #bigger epoch makes it perform better
        eval_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=1000,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=1,
        logging_steps=50,
        warmup_steps=100,
        logging_strategy="steps",
        learning_rate=2e-4,
        fp16=False,
        bf16=False,
        completion_only_loss=True,  #important line!
        group_by_length=True,
        report_to="none",
    )
)

In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 5 | Total steps = 2,500
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 20,185,088 of 600,000,000 (3.36% trained)


Step,Training Loss,Validation Loss
500,0.9462,0.954509
1000,0.9535,0.945868
1500,0.9242,0.949046
2000,0.8571,0.977278


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


KeyboardInterrupt: 

In [13]:
#make the trained model predict first 500 results
import numpy as np
from tqdm import tqdm
import re
from scipy.stats import pearsonr
MODEL_PATH = "/content/drive/My Drive/qwen_mtqe_project/outputs/checkpoint-1000"
testdf = pd.read_csv(testpath, sep="\t")
system_message = """You are a multilingual translation evaluation expert. Your task is to predict the quality score for translation pairs.
The quality score is a number between -2.0 and 2.0 and could fall into the following categories:
Scoring Criteria:
-2.0 to -1.0: Poor or very poor translation with meaning deviation or severe errors.
-1.0 to 0.0: Flawed translation but understandable.
0.0 to 1.0: Good translation in general.
1.0 to 2.0: Excellent or flawless translation.
The relative position of the scores in this range indicates subtle differences in translation quality."""
def extract_floats_with_4_decimal_places(text):
    pattern = r'[-+]?\d*\.\d{4}'
    matches = re.findall(pattern, text)
    return float(matches[0]) if matches else None

predicted_scores = []
ground_truth_scores = []
for index, row in tqdm(testdf.iterrows(), total=len(testdf)):

    # a. 获取“标准答案”
    true_zmean = row['zmean']

    # b. 使用你的逻辑准备“考题” (prompt_messages)
    user_message = f"""### Source sentence:
{row['src']}

### MT sentence:
{row['mt']}"""

    prompt_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    inputs_ids = tokenizer.apply_chat_template(
        prompt_messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        inputs_ids,
        max_new_tokens=10,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    decoded_text = tokenizer.decode(outputs[0, inputs_ids.shape[1]:], skip_special_tokens=True)

    # f. 从答案中提取分数
    predicted_score = extract_floats_with_4_decimal_places(decoded_text)

    # g. 保存预测分数和真实分数，用于后续“批改”
    predicted_scores.append(predicted_score)
    ground_truth_scores.append(true_zmean)

valid_indices = [i for i, score in enumerate(predicted_scores) if score is not None]
cleaned_predictions = [predicted_scores[i] for i in valid_indices]
cleaned_ground_truth = [ground_truth_scores[i] for i in valid_indices]

print(f"\nSuccessfully predicted {len(cleaned_predictions)}/{len(predicted_scores)} samples.")

# 打印一些样本以供直观比较
print("\n--- Sample Predictions vs. Ground Truth ---")
for i in range(min(10, len(cleaned_predictions))):
    print(f"Sample {i+1}: Predicted = {cleaned_predictions[i]:.4f}, Ground Truth = {cleaned_ground_truth[i]:.4f}")

# 计算皮尔逊相关系数
if len(cleaned_predictions) > 1:
    pearson_corr, p_value = pearsonr(cleaned_ground_truth, cleaned_predictions)
    print("\n--- Correlation Analysis ---")
    print(f"Pearson r: {pearson_corr:.4f}")
    print(f"P-value: {p_value:.4e}")
else:
    print("\nNot enough valid predictions to calculate correlation.")

  0%|          | 0/125 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Qwen3ForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.
  1%|          | 1/125 [00:02<04:57,  2.40s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 2/125 [00:03<03:26,  1.68s/it]The following generation flags are not valid a


Successfully predicted 124/125 samples.

--- Sample Predictions vs. Ground Truth ---
Sample 1: Predicted = -0.2621, Ground Truth = -0.4176
Sample 2: Predicted = 0.7821, Ground Truth = 0.4805
Sample 3: Predicted = 0.7091, Ground Truth = 0.4843
Sample 4: Predicted = -0.7091, Ground Truth = 0.6895
Sample 5: Predicted = 0.2576, Ground Truth = 0.5463
Sample 6: Predicted = -0.7812, Ground Truth = -1.4696
Sample 7: Predicted = 0.7912, Ground Truth = 0.7307
Sample 8: Predicted = -0.2571, Ground Truth = 0.2772
Sample 9: Predicted = 0.2434, Ground Truth = 0.0748
Sample 10: Predicted = 0.0160, Ground Truth = 0.1668

--- Correlation Analysis ---
Pearson r: 0.3960
P-value: 5.3028e-06



