In [None]:
# Must run
!python -m spacy download en_core_web_sm

# One Way

In [None]:
# nvidia-smi

# Standard library
import os
import gc
import random
import math

# Third-party
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from tqdm import tqdm
from huggingface_hub import login
import matplotlib.pyplot as plt
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel,
)


# Local package imports
from rc_experiment.data_loading import raw_2_llm_data, torch_data_loader
from rc_experiment.model_loading import quanti_lora_md
from rc_experiment.training import casual_llm_train, plot_losses
from rc_experiment.eval import rc_eval

# Authenticate with the Hugging Face Hub. The token is read from the HF_TOKEN
# environment variable so a real credential never has to be written into the
# notebook; the original placeholder is kept as the fallback, so behavior is
# unchanged when the variable is not set.
login(token=os.environ.get("HF_TOKEN", "YOUR_TOKEN"))

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
# The hasattr() guard keeps this working on torch builds without `torch.mps`.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif hasattr(torch, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")


########################################################
# Define Experiment Configuration
########################################################
## Original RC Data files paths
# TRAINING_FILES = {
#     "train": "pipeline_test_data/all_prompts_train.jsonl",
#     "validation": "pipeline_test_data/validation_prompts.jsonl",
# }

# TEST_FILES = {
#     "p2d": "pipeline_test_data/p2d_prompts_test.jsonl",
#     "d2p": "pipeline_test_data/d2p_prompts_test.jsonl"
# }


# #  Complex RC Data files paths
# TRAINING_FILES = {
#     "train": "pipeline_test_data/train.jsonl",
#     "validation": "pipeline_test_data/final_augmented_without_added.jsonl",
# }

# TEST_FILES = {"test": "pipeline_test_data/final_augmented_without_added.jsonl"}


# # Spatial RC Data Path
# TRAINING_FILES = {
#     "train": "pipeline_test_data/a2b_prompts_train.jsonl",
#     "validation": "pipeline_test_data/validation_prompts_a2b.jsonl",
# }

# TEST_FILES = {
#     "test": "pipeline_test_data/validation_prompts_a2b.jsonl",
#     }


# one-way data path
# Name of the split that is used for training.
train_file_name = "1wtrain"

# Training and validation splits for the one-way setup: split name -> JSONL path.
TRAINING_FILES = {
    train_file_name: "pipeline_test_data/one_way_training.jsonl",
    "1wvalidation": "pipeline_test_data/validation.jsonl",
}

# Test splits evaluated after fine-tuning: split name -> JSONL path.
# (p2d / d2p presumably stand for person-to-description and
# description-to-person -- confirm against the data files.)
TEST_FILES = dict(
    [
        ("1wp2d", "pipeline_test_data/p2d_prompts_test.jsonl"),
        ("1wd2p", "pipeline_test_data/d2p_prompts_test.jsonl"),
    ]
)

# Keys used to look the loaders up in the dict returned by torch_data_loader;
# they follow the "<split name>_loader" pattern.
train_loader_name = f"{train_file_name}_loader"
val_loader_name = "1wvalidation" + "_loader"

# # two-way data path
# TRAINING_FILES = {
#     "2wtrain": "pipeline_test_data/all_prompts_train.jsonl",
#     "2wvalidation": "pipeline_test_data/validation.jsonl",
# }
# train_file_name = "2wtrain"


# TEST_FILES = {
#     "2wp2d": "pipeline_test_data/p2d_prompts_test.jsonl",
#     "2wd2p": "pipeline_test_data/d2p_prompts_test.jsonl"}
# train_loader_name = "2wtrain_loader"
# val_loader_name = "2wvalidation_loader"

# Choose a small causal model from Hugging Face (for example, LLaMA-2 7B or OPT 125M)
# TinyLlama has the same architecture as Llama 2

# Hugging Face model ids to fine-tune; the experiment loop runs once per entry.
MODELS = [
    # "Qwen/Qwen3-1.7B",
    "meta-llama/Llama-3.2-1B",
    "allenai/OLMo-2-0425-1B-Instruct",
    "TinyLlama/TinyLlama-1.1B-Chat-v0.1",
    "Qwen/Qwen3-0.6B",
]
# Filled during the run: one saving directory per trained model, in MODELS order.
BEST_MODEL_DIR = list()

# Token budgets: prompt, completion/response, and their sum.
MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 256, 20
TOTAL_MAX_LENGTH = sum((MAX_INPUT_LENGTH, MAX_TARGET_LENGTH))

# Keyword arguments describing the LoRA adapter (rank, scaling factor,
# dropout, bias handling, task type).
LORA_CONFIG_KWARGS = dict(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training schedule.
NUM_EPOCHS = 20           # upper bound on fine-tuning epochs
PATIENCE = 3              # early-stopping patience
MIN_DELTA = 0.01          # minimum change that counts as an improvement
BATCH_SIZE = 2
TRAIN_PORTION_RATE = 1    # forwarded to torch_data_loader as train_portion_rate

# Global instruction prompt handed to the preprocessing step.
INSTRUCTION_PROMPT = (
    "You are a knowledgeable assistant skilled at factual recall. "
    "When given a person's name, you can return the description of that person. "
    "When given a description, you can return the name of the person that fit the description."
)

# Loop all the model names to conduct experiments
# Run the whole pipeline (tokenize -> load LoRA model -> fine-tune) once per
# model id. `k` is the position of the model in MODELS and therefore also the
# index of its entry in BEST_MODEL_DIR.
for k, model_name in enumerate(MODELS):
    print(f"\n*** Experiment start for {model_name} ***\n")

    ########################################################
    # Data Loading & Pre-processing & Tokenization
    ########################################################
    # Tokenize the training/validation files with the instruction prompt.
    # (Per the original note, a None or empty instruction means no instruction
    # is applied -- that logic lives in raw_2_llm_data.)
    # Note: this call also rebinds the module-level `device`.
    tokenized_datasets, tokenizer, device = raw_2_llm_data(
        TRAINING_FILES,
        model_name,
        MAX_INPUT_LENGTH,
        MAX_TARGET_LENGTH,
        instruction=INSTRUCTION_PROMPT,
    )

    ########################################################
    # Set Up PyTorch Data Loaders
    ########################################################
    # Obtain the DataLoader dictionary and pick out the two loaders by name.
    loader_dict = torch_data_loader(
        tokenized_datasets,
        train_file_name,
        batch_size=BATCH_SIZE,
        train_portion_rate=TRAIN_PORTION_RATE,
    )
    train_loader = loader_dict[train_loader_name]
    val_loader = loader_dict[val_loader_name]

    ########################################################
    # Load LoRA Model
    ########################################################
    # Build the quantized LoRA model and move it to the selected device.
    model = quanti_lora_md(LORA_CONFIG_KWARGS, model_name).to(device)

    ########################################################
    # Training (Finetuning) + Save the best model
    ########################################################
    # AdamW over the trainable parameters only (the LoRA adapters).
    learning_rate = 5e-5
    optimizer = torch.optim.AdamW(
        [param for param in model.parameters() if param.requires_grad],
        lr=learning_rate,
    )

    # Fine-tune; the returned saving directory is recorded per model.
    saving_dir, train_losses, val_losses, val_accuracies = casual_llm_train(
        model_name, model, tokenizer, optimizer, train_loader, val_loader, device,
        MAX_TARGET_LENGTH, NUM_EPOCHS, PATIENCE, MIN_DELTA,
    )
    BEST_MODEL_DIR.append(saving_dir)

    # NOTE(review): val_losses is never used and val_accuracies is passed in
    # the second slot of plot_losses -- confirm this matches its signature.
    plot_losses(train_losses, val_accuracies, model_name=model_name)

    ########################################################
    # Load in tuned model (Optional)
    ########################################################
    # Uncomment to evaluate the saved adapter instead of the in-memory model.
    # BEST_MODEL_DIR[k] is the directory recorded for the current model.
    #
    # base_model = AutoModelForCausalLM.from_pretrained(model_name)
    # tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
    # if tokenizer.pad_token_id is None:
    #     tokenizer.pad_token_id = tokenizer.eos_token_id
    # lora_weights_path = BEST_MODEL_DIR[k]
    # model = PeftModel.from_pretrained(base_model, lora_weights_path)
    # model.to(device)
    
    ########################################################
    # Evaluation on all test sets
    ########################################################
    # Results for this model go under experiment_rslt/<model_name>; model ids
    # contain "/", so this creates nested directories. The folder does not
    # depend on the test split, so it is created once, before the (slow)
    # evaluation, which surfaces filesystem problems early.
    folder_path = f"experiment_rslt/{model_name}"
    os.makedirs(folder_path, exist_ok=True)

    for test_name, path in TEST_FILES.items():
        # Process one test split at a time.
        test_path = {test_name: path}

        test_datasets, _, _ = raw_2_llm_data(
            test_path,
            model_name,
            MAX_INPUT_LENGTH,
            MAX_TARGET_LENGTH,
            instruction=INSTRUCTION_PROMPT,
        )
        # Obtain the DataLoader dictionary. "NO_TEST_SET" fills the
        # train-split-name slot (presumably a sentinel meaning "no training
        # split here" -- confirm in torch_data_loader). The batch size follows
        # the shared BATCH_SIZE constant instead of a duplicated literal.
        test_loader_dict = torch_data_loader(test_datasets, "NO_TEST_SET", batch_size=BATCH_SIZE)

        print(test_loader_dict)
        # Loaders are keyed as "<split name>_loader".
        test_loader = test_loader_dict[f"{test_name}_loader"]

        pred_rslt_df = rc_eval(test_loader, model, tokenizer, device, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH, INSTRUCTION_PROMPT)

        # Save the predictions as one CSV per test split.
        save_path = os.path.join(folder_path, f"{test_name}_results.csv")
        pred_rslt_df.to_csv(save_path, index=False)

        print(f"DataFrame successfully saved to {save_path}")

    ########################################################
    # Clear the current model
    ########################################################

    def clear():
        """Release the current model state and free cached accelerator memory.

        Removes the module-level ``model``, ``tokenizer`` and ``optimizer``
        names, runs the garbage collector, then empties the CUDA or MPS cache
        when such a backend is available.

        The optimizer is dropped as well because it keeps references to the
        trainable parameters and its own state, which would otherwise stay
        alive after the model name is removed. Missing names are ignored, so
        calling this more than once is safe.
        """
        module_globals = globals()
        for name in ("model", "tokenizer", "optimizer"):
            module_globals.pop(name, None)
        gc.collect()

        # Conditionally clear accelerator caches.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        # Same hasattr() guard as the device selection, for torch builds
        # that do not ship `torch.mps`.
        elif hasattr(torch, "mps") and torch.backends.mps.is_available():
            torch.mps.empty_cache()

    clear()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Users/yifanyu/miniconda3/envs/hf/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/yifanyu/miniconda3/envs/hf/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/yifanyu/miniconda3/envs/hf/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/yifanyu/miniconda3/envs/hf/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, 


*** Experiment start for meta-llama/Llama-3.2-1B ***

DatasetDict({
    1wtrain: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2
    })
})


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    1wtrain: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 13,357,056 || all params: 1,249,171,456 || trainable%: 1.0693

Epoch 1/20


Training Epoch 1: 100%|██████████| 900/900 [11:26<00:00,  1.31batch/s]
Evaluating:   0%|          | 0/1 [00:00<?, ?batch/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  test_elements = torch.tensor(test_elements)
Evaluating: 100%|██████████| 1/1 [00:07<00:00,  7.81s/batch]


Epoch 01/20 | Train Loss: 0.6189 | Val Error Rate: 50.00%

Epoch 2/20


Training Epoch 2: 100%|██████████| 900/900 [11:01<00:00,  1.36batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.15s/batch]


Epoch 02/20 | Train Loss: 0.0229 | Val Error Rate: 0.79%

Epoch 3/20


Training Epoch 3: 100%|██████████| 900/900 [10:59<00:00,  1.36batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.11s/batch]


Epoch 03/20 | Train Loss: 0.0142 | Val Error Rate: 16.67%

Epoch 4/20


Training Epoch 4: 100%|██████████| 900/900 [10:38<00:00,  1.41batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.11s/batch]


Epoch 04/20 | Train Loss: 0.0158 | Val Error Rate: 0.79%

Epoch 5/20


Training Epoch 5: 100%|██████████| 900/900 [10:28<00:00,  1.43batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.09s/batch]


Epoch 05/20 | Train Loss: 0.0102 | Val Error Rate: 0.79%
Early stopping triggered (no improvement in error‑rate).
DatasetDict({
    1wp2d: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})
DatasetDict({
    1wp2d: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wp2d_loader': <torch.utils.data.dataloader.DataLoader object at 0x3557de260>}


Evaluating: 100%|██████████| 150/150 [01:58<00:00,  1.26batch/s]


Test Coverage Accuracy: 86.65% (259.9427844793338/300)
DataFrame successfully saved to experiment_rslt/meta-llama/Llama-3.2-1B/1wp2d_results.csv


Generating 1wd2p split: 0 examples [00:00, ? examples/s]

DatasetDict({
    1wd2p: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    1wd2p: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wd2p_loader': <torch.utils.data.dataloader.DataLoader object at 0x35c50f4c0>}


Evaluating: 100%|██████████| 150/150 [01:59<00:00,  1.26batch/s]


Test Coverage Accuracy: 100.00% (300.0/300)
DataFrame successfully saved to experiment_rslt/meta-llama/Llama-3.2-1B/1wd2p_results.csv

*** Experiment start for allenai/OLMo-2-0425-1B-Instruct ***

DatasetDict({
    1wtrain: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2
    })
})


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    1wtrain: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})
trainable params: 13,697,024 || all params: 1,498,613,760 || trainable%: 0.9140

Epoch 1/20


Training Epoch 1: 100%|██████████| 900/900 [10:54<00:00,  1.38batch/s]
Evaluating:   0%|          | 0/1 [00:00<?, ?batch/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.48s/batch]


Epoch 01/20 | Train Loss: 0.6655 | Val Error Rate: 0.79%

Epoch 2/20


Training Epoch 2: 100%|██████████| 900/900 [10:54<00:00,  1.37batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.14s/batch]


Epoch 02/20 | Train Loss: 0.0241 | Val Error Rate: 100.00%

Epoch 3/20


Training Epoch 3: 100%|██████████| 900/900 [10:54<00:00,  1.38batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.10s/batch]


Epoch 03/20 | Train Loss: 0.0122 | Val Error Rate: 0.79%

Epoch 4/20


Training Epoch 4: 100%|██████████| 900/900 [10:54<00:00,  1.37batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.14s/batch]


Epoch 04/20 | Train Loss: 0.0095 | Val Error Rate: 16.67%
Early stopping triggered (no improvement in error‑rate).
DatasetDict({
    1wp2d: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})
DatasetDict({
    1wp2d: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wp2d_loader': <torch.utils.data.dataloader.DataLoader object at 0x35c5fc820>}


Evaluating: 100%|██████████| 150/150 [01:57<00:00,  1.27batch/s]


Test Coverage Accuracy: 85.62% (256.8631904622398/300)
DataFrame successfully saved to experiment_rslt/allenai/OLMo-2-0425-1B-Instruct/1wp2d_results.csv
DatasetDict({
    1wd2p: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    1wd2p: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wd2p_loader': <torch.utils.data.dataloader.DataLoader object at 0x174682e00>}


Evaluating: 100%|██████████| 150/150 [01:58<00:00,  1.26batch/s]


Test Coverage Accuracy: 99.67% (299.0/300)
DataFrame successfully saved to experiment_rslt/allenai/OLMo-2-0425-1B-Instruct/1wd2p_results.csv

*** Experiment start for TinyLlama/TinyLlama-1.1B-Chat-v0.1 ***

DatasetDict({
    1wtrain: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2
    })
})


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    1wtrain: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})
trainable params: 13,160,464 || all params: 1,113,212,944 || trainable%: 1.1822

Epoch 1/20


Training Epoch 1: 100%|██████████| 900/900 [09:43<00:00,  1.54batch/s]
Evaluating:   0%|          | 0/1 [00:00<?, ?batch/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.53s/batch]


Epoch 01/20 | Train Loss: 0.8911 | Val Error Rate: 50.00%

Epoch 2/20


Training Epoch 2: 100%|██████████| 900/900 [09:43<00:00,  1.54batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.12s/batch]


Epoch 02/20 | Train Loss: 0.0258 | Val Error Rate: 0.00%

Epoch 3/20


Training Epoch 3: 100%|██████████| 900/900 [09:43<00:00,  1.54batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.13s/batch]


Epoch 03/20 | Train Loss: 0.0136 | Val Error Rate: 50.00%

Epoch 4/20


Training Epoch 4: 100%|██████████| 900/900 [09:49<00:00,  1.53batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.14s/batch]


Epoch 04/20 | Train Loss: 0.0105 | Val Error Rate: 0.00%

Epoch 5/20


Training Epoch 5: 100%|██████████| 900/900 [10:01<00:00,  1.50batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.14s/batch]


Epoch 05/20 | Train Loss: 0.0047 | Val Error Rate: 0.00%
Early stopping triggered (no improvement in error‑rate).
DatasetDict({
    1wp2d: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})
DatasetDict({
    1wp2d: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wp2d_loader': <torch.utils.data.dataloader.DataLoader object at 0x32cba2800>}


Evaluating: 100%|██████████| 150/150 [02:11<00:00,  1.14batch/s]


Test Coverage Accuracy: 86.58% (259.73388888730136/300)
DataFrame successfully saved to experiment_rslt/TinyLlama/TinyLlama-1.1B-Chat-v0.1/1wp2d_results.csv
DatasetDict({
    1wd2p: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    1wd2p: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wd2p_loader': <torch.utils.data.dataloader.DataLoader object at 0x174682e00>}


Evaluating: 100%|██████████| 150/150 [02:13<00:00,  1.12batch/s]


Test Coverage Accuracy: 99.67% (299.0/300)
DataFrame successfully saved to experiment_rslt/TinyLlama/TinyLlama-1.1B-Chat-v0.1/1wd2p_results.csv

*** Experiment start for Qwen/Qwen3-0.6B ***

DatasetDict({
    1wtrain: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2
    })
})


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    1wtrain: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1800
    })
    1wvalidation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})
trainable params: 12,539,904 || all params: 608,589,824 || trainable%: 2.0605

Epoch 1/20


Training Epoch 1: 100%|██████████| 900/900 [08:28<00:00,  1.77batch/s]
Evaluating:   0%|          | 0/1 [00:00<?, ?batch/s]You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating: 100%|██████████| 1/1 [00:02<00:00,  2.40s/batch]


Epoch 01/20 | Train Loss: 0.7603 | Val Error Rate: 50.00%

Epoch 2/20


Training Epoch 2: 100%|██████████| 900/900 [08:28<00:00,  1.77batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.54s/batch]


Epoch 02/20 | Train Loss: 0.0337 | Val Error Rate: 50.00%

Epoch 3/20


Training Epoch 3: 100%|██████████| 900/900 [08:27<00:00,  1.77batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.54s/batch]


Epoch 03/20 | Train Loss: 0.0174 | Val Error Rate: 50.00%

Epoch 4/20


Training Epoch 4: 100%|██████████| 900/900 [08:28<00:00,  1.77batch/s]
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.56s/batch]


Epoch 04/20 | Train Loss: 0.0197 | Val Error Rate: 50.00%
Early stopping triggered (no improvement in error‑rate).
DatasetDict({
    1wp2d: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    1wp2d: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wp2d_loader': <torch.utils.data.dataloader.DataLoader object at 0x32cac2890>}


Evaluating: 100%|██████████| 150/150 [03:13<00:00,  1.29s/batch]


Test Coverage Accuracy: 77.82% (233.46664988595785/300)
DataFrame successfully saved to experiment_rslt/Qwen/Qwen3-0.6B/1wp2d_results.csv
DatasetDict({
    1wd2p: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
})


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    1wd2p: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
{'1wd2p_loader': <torch.utils.data.dataloader.DataLoader object at 0x3ce7d3010>}


Evaluating: 100%|██████████| 150/150 [03:13<00:00,  1.29s/batch]


Test Coverage Accuracy: 100.00% (300.0/300)
DataFrame successfully saved to experiment_rslt/Qwen/Qwen3-0.6B/1wd2p_results.csv
