In [44]:
import subprocess
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import re
from trl import DPOTrainer, DPOConfig, ModelConfig,get_quantization_config,get_kbit_device_map

# Import the proxy settings exported by /etc/network_turbo.
# The script is sourced inside a child bash process, so the variables it sets
# are read back from that shell's `env` output and copied into os.environ.
proxy_cmd = 'bash -c "source /etc/network_turbo && env | grep proxy"'
result = subprocess.run(proxy_cmd, shell=True, capture_output=True, text=True)
output = result.stdout
os.environ.update(
    entry.split('=', 1) for entry in output.splitlines() if '=' in entry
)

def strip_prompt(prompt: str, text: str) -> str:
    """
    Drop a leading copy of `prompt` from `text`.

    Whitespace around `prompt` and whitespace at the start of `text` are
    ignored for the comparison. The result never starts with whitespace.
    When `text` does not begin with `prompt`, it is returned unchanged apart
    from that leading-whitespace trim.
    """
    needle = prompt.strip()
    remainder = text.lstrip()
    # Plain prefix comparison: no regex, so punctuation needs no escaping.
    if remainder.startswith(needle):
        remainder = remainder[len(needle):]
    return remainder.lstrip()

In [2]:
# UltraFeedback train split; only the 'instruction' column (the prompt text)
# is carried forward.
ds_ultrafeed_train = load_dataset(
    "openbmb/UltraFeedback",
    split="train",
    cache_dir="/root/autodl-tmp/dataset",
)
ds_prompt_train = ds_ultrafeed_train.select_columns(["instruction"])


In [8]:
# --- Generation model and tokenizer ---------------------------------------
device = 'cuda'
model_name = 'Qwen/Qwen2.5-0.5B-Instruct'   # 0.5B checkpoint, used for testing for now
cache_path = "/root/autodl-tmp/model_cache"
model_torch_dtype = torch.float16

# ModelConfig supplies the revision / trust_remote_code values used below.
model_args = ModelConfig(model_name)
model_kwargs = {
    "revision": model_args.model_revision,
    "torch_dtype": model_torch_dtype,
    "trust_remote_code": model_args.trust_remote_code,
}

# Load the weights, then put the module in eval mode.
lm_model_instance = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=cache_path,
    **model_kwargs,
)
lm_model_instance = lm_model_instance.eval()

# The tokenizer is configured to pad on the left.
lm_model_tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=cache_path,
    trust_remote_code=model_args.trust_remote_code,
    use_fast=True,
    padding_side='left',
)

# Fall back to the EOS token when the tokenizer defines no pad token.
if not lm_model_tokenizer.pad_token:
    lm_model_tokenizer.pad_token = lm_model_tokenizer.eos_token

print (model_torch_dtype)

torch.float16


In [9]:
# Shuffle once with a fixed seed, then cut the prompts into three disjoint,
# contiguous slices: [0, FIRST), [FIRST, FIRST + SECOND) and the remainder.
seed = 42
FIRST = 20_000
SECOND = 20_000
ds_prompt_reshuffle = ds_prompt_train.shuffle(seed=seed)

split_1_end = FIRST
split_2_end = FIRST + SECOND
total_rows = len(ds_prompt_reshuffle)

ds_prompt_split_1 = ds_prompt_reshuffle.select(range(0, split_1_end))
ds_prompt_split_2 = ds_prompt_reshuffle.select(range(split_1_end, split_2_end))
ds_prompt_split_3 = ds_prompt_reshuffle.select(range(split_2_end, total_rows))




In [None]:
def collate_fn(batch):
    """
    Tokenize a batch of dataset rows for generation.

    Reads the "instruction" field of every row and passes the texts through
    `lm_model_tokenizer` with padding enabled and truncation at 512 tokens.

    Returns a tuple `(input_ids, attention_mask, prompts)`: the first two are
    the PyTorch tensors produced by the tokenizer, and `prompts` is the list
    of raw instruction strings in batch order.
    """
    prompts = [example["instruction"] for example in batch]
    encoded = lm_model_tokenizer(
        prompts,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded.input_ids, encoded.attention_mask, prompts

# Batches of tokenized prompts for the first split; order is preserved
# (shuffle=False) so results line up with ds_prompt_split_1.
loader = DataLoader(
    ds_prompt_split_1,
    batch_size = 256,
    shuffle = False,
    collate_fn = collate_fn,
    num_workers = 4
)

# Sample two completions per prompt and collect them as
# {"instruction": <prompt>, "response": [<completion 1>, <completion 2>]}.
lm_model_instance.to(device)
prompt_completion_list = []
for input_ids, attention_mask, prompts in tqdm(loader):
    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
    outputs = lm_model_instance.generate(
        input_ids = input_ids,
        attention_mask = attention_mask,
        max_new_tokens = 128,
        do_sample = True,
        temperature = 1.0,
        top_p = 1.0,
        num_return_sequences = 2,
        pad_token_id = lm_model_tokenizer.pad_token_id,
        eos_token_id = lm_model_tokenizer.eos_token_id,
    )

    # generate() returns the (padded) prompt tokens followed by the newly
    # sampled tokens, so the first input_ids.shape[1] positions of every row
    # are prompt. Cut them off at token level instead of string-matching the
    # prompt against the decoded text: that match silently fails whenever the
    # prompt was truncated to max_length or does not round-trip through the
    # tokenizer unchanged, which leaves the prompt echoed inside the response.
    prompt_len = input_ids.shape[1]
    completion_ids = outputs[:, prompt_len:]
    decoded_output = lm_model_tokenizer.batch_decode(completion_ids, skip_special_tokens=True)

    # Rows 2*i and 2*i + 1 are the two samples drawn for prompt i.
    for i, prompt in enumerate(prompts):
        raw1, raw2 = decoded_output[2 * i : 2 * i + 2]

        # Same leading-whitespace trim the text-based stripping applied.
        resp1 = raw1.lstrip()
        resp2 = raw2.lstrip()

        prompt_completion_list.append({
            "instruction": prompt,
            "response": [resp1, resp2],
        })
ds_prompt_completion = Dataset.from_list(prompt_completion_list)

  0%|          | 3/20000 [00:07<13:22:00,  2.41s/it]


KeyboardInterrupt: 

In [46]:

# Materialise the current contents of prompt_completion_list as a Dataset.
# NOTE(review): the generation cell's output shows a KeyboardInterrupt, so this
# is presumably a partial run — confirm before using it downstream.
ds_prompt_completion = Dataset.from_list(prompt_completion_list)

In [49]:
# Inspect the value of resp1 left behind by the last iteration of the generation loop.
resp1

'.\n\nTo find the area of a trapezoid when the lengths of its bases and height are known, you can use the following formula:\n\n\\[ \\text{Area} = \\frac{(a + b) \\times h}{2} \\]\n\nwhere:\n- \\(a\\) is one of the bases,\n- \\(b\\) is the other base, and\n- \\(h\\) is the height (the perpendicular distance between the two bases).\n\nGiven the problem, we have:\n- Base 1 (\\(a\\)) = 13 cm,\n- Bases (bases 2 and 3) = 16 cm'

In [51]:
# Rebuild the Dataset from the current contents of prompt_completion_list.
ds_prompt_completion = Dataset.from_list(prompt_completion_list)

In [53]:
# Show the first record of the assembled prompt/response dataset.
ds_prompt_completion[0]

{'instruction': "Please answer correctly the following question related to the paragraph below.   Which city, Seattle or Jacksonville, will contribute more to global warming?  Seattle produces a large amount of their electricity by burning fossil fuels due to the large presence of natural coal in the state. Jacksonville, on the other hand, has decreased their usage of fossil fuels used to produce electricity to low levels due to recent policy changes enacted by their state's government.  Hint: In the U.S., the majority of electricity is produced by burning coal or other fossil fuels. This causes air pollution, acid rain, and global warming. Fossil fuels are also limited and may eventually run out. Like fossil fuels, radioactive elements are limited. In fact, they are relatively rare, so they could run out sooner rather than later. On the other hand, nuclear fission does not release air pollution or cause the other environmental problems associated with burning fossil fuels. This is the