In [None]:
# !python -m pip install --upgrade pip
# !pip install -U bitsandbytes pandas transformers peft datasets hf_transfer trl evaluate sacrebleu
# !pip install flash-attn --no-build-isolation

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: tabulate, portalocker, colorama, sacrebleu, evaluate
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5/5[0m [evaluate]
[1A[2KSuccessfully installed colorama-0.4.6 evaluate-0.4.6 portalocker-3.2.0 

In [1]:
import os
import json
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, Dataset

from prompts import COT_SYSTEM_PROMPT
from make_prompts import generate_prompts
from qlora import load_model_and_tokenizer, load_trained_model_and_tokenizer

with open("config.json", "r") as f:
    cfg = json.load(f)

#### Load Dataset and Model

In [4]:
train_data = load_dataset(cfg["dataset"], split="train")
dev_data = load_dataset(cfg["dataset"], split="test")

In [3]:
MODEL_NAME = cfg["model_name"]
model, tokenizer = load_model_and_tokenizer(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
train_data = generate_prompts(train_data, tokenizer, is_eval=False)
dev_data = generate_prompts(dev_data, tokenizer, is_eval=False)

train_ds = Dataset.from_dict({"text": train_data})
dev_ds = Dataset.from_dict({"text": dev_data})

#### SFT

In [6]:
sft_config = SFTConfig(
    output_dir="lora_checkpoints",
    num_train_epochs=cfg["sft"]["num_epochs"],
    per_device_train_batch_size=cfg["sft"]["batch_size"],
    save_strategy="epoch",
    optim=cfg["sft"]["optim"],
    learning_rate=cfg["sft"]["lr"],
    bf16=True,
    completion_only_loss=True,
    dataset_text_field="text",
    packing=True,
    do_eval=True
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    args=sft_config
)

Adding EOS to train dataset:   0%|          | 0/2641 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2641 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/2641 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/228 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/228 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/228 [00:00<?, ? examples/s]

In [7]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.1925
20,0.9125
30,0.7498
40,0.6262
50,0.5149
60,0.4263
70,0.3943
80,0.3506
90,0.3201
100,0.3266


TrainOutput(global_step=155, training_loss=0.48704542421525526, metrics={'train_runtime': 498.1352, 'train_samples_per_second': 4.979, 'train_steps_per_second': 0.311, 'total_flos': 3.5069269557190656e+16, 'train_loss': 0.48704542421525526, 'entropy': 0.3120738983154297, 'num_tokens': 2097961.0, 'mean_token_accuracy': 0.9090827941894531, 'epoch': 1.0})

In [10]:
# save SFT model
adapter_path = os.path.join("lora_checkpoints", cfg["ckpt_name"])
model.save_pretrained(adapter_path)

In [11]:
sft_model, sft_tokenizer = load_trained_model_and_tokenizer(MODEL_NAME, adapter_path)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
test_data = load_dataset(cfg["dataset"], split="test")

In [14]:
test_df = test_data.to_pandas()
test_idx = 100

query = test_df['query'][test_idx]
system_msg, question = query.split("### Question:", 1)

messages = [
    {"role": "system", "content": system_msg + COT_SYSTEM_PROMPT},
    {"role": "user", "content": question}
]

prompt = tokenizer.apply_chat_template(messages, 
                                       tokenize=False, 
                                       add_generation_prompt=True)

test_pipeline = pipeline("text-generation", 
                         model=sft_model, 
                         tokenizer=tokenizer, 
                         max_new_tokens=cfg["generation"]["max_new_tokens"])

# 생성
outputs = test_pipeline(
    prompt,
    do_sample=True,
    temperature=cfg["generation"]["temperature"],
    top_p=cfg["generation"]["top_p"],
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])

Device set to use cuda:0


To solve this problem, we need to determine the minimum number of queries required to make the array `nums` a Zero Array by decrementing values in specified ranges with specific amounts. We'll approach this problem using a prefix sum and sorted list to efficiently manage and query our array of decrements.

Here's a detailed plan and the corresponding Python implementation:

1. **Prefix Sum Array**: We start by calculating the prefix sum array to quickly access the total value at any range `[l, r]`.
2. **Queries and Sorted List**: For each query, we use a sorted list to keep track of the values in the range `[l, r]`. We use the `heapq.nsmallest` function to efficiently find the smallest `val` needed to zero out the queried range.
3. **Calculate Minimum K**: We incrementally calculate the minimum `k` by checking how many queries can be completed with the current `k`.

```python
from typing import List

def minZeroArray(nums: List[int], queries: List[List[int]]) -> int:
    n, m = len(num

In [15]:
print("question:", test_df.iloc[test_idx]["query"])
print("response:", test_df.iloc[test_idx]["response"])

question: You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests.

### Question:
You are given an integer array nums of length n and a 2D array queries where queries[i] = [li, ri, vali].
Each queries[i] represents the following action on nums:

Decrement the value at each index in the range [li, ri] in nums by at most vali.
The amount by which each value is decremented can be chosen independently for each index.

A Zero Array is an array with all its elements equal to 0.
Return the minimum possible non-negative value of k, such that after processing the first k queries in sequence, nums becomes a Zero Array. If no such k exists, return -1.
 
Example 1:

Input: nums = [2,0,2], queries = [[0,2,1],[0,2,1],[1,1,3]]
Output: 2
Explanation:

For i = 0 (l = 0, r = 2, val = 1):

Decrement values at indices [0, 1, 2] by [1, 0, 1] respectively.
The array will become [1