In [1]:
import subprocess
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
import logging
import numpy as np
from trl import DPOTrainer, DPOConfig, ModelConfig,get_quantization_config,get_kbit_device_map

# Load environment variables from /etc/network_turbo
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

# Set the model path of qwen sft AND sentiment model
LM_MODEL = "august66/qwen2-sft-final"
SENTIMENT_MODEL = "siebert/sentiment-roberta-large-english"
N_PREFIX_TOKENS = 5


#load dataset
dataset_test = load_dataset("stanfordnlp/imdb", split="test")
dataset_train = load_dataset("stanfordnlp/imdb", split="train")
def prompt_completion_preprocess(example):
    words = example['text'].split()
    prompt = ' '.join(words[:N_PREFIX_TOKENS])
    completion = ' '.join(words[N_PREFIX_TOKENS:])
    return {'prompt': prompt, 'completion': completion}
dataset_test = dataset_test.map(prompt_completion_preprocess, remove_columns=['text', 'label'])
dataset_train = dataset_train.map(prompt_completion_preprocess, remove_columns=['text', 'label'])






Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
qwen_sft_model = AutoModelForCausalLM.from_pretrained(LM_MODEL)
qwen_sft_tokenizer = AutoTokenizer.from_pretrained(LM_MODEL)
qwen_sft_tokenizer.padding_side = "left"
qwen_sft_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
pipe_qwen_sft = pipeline(
    'text-generation',
    model = qwen_sft_model,
    tokenizer = qwen_sft_tokenizer,
    device_map = 'auto'
)
prompts_train = dataset_train['prompt']
generated_completions_train = pipe_qwen_sft(
    prompts_train,
    max_new_tokens = 128,
    do_sample = True,
    truncation = True,
    padding = True,
    top_p = 0.95,
    temperature = 1,
    num_return_sequences = 2,
    batch_size = 128,
)


Device set to use cuda:0


In [3]:
generated_completions_train_flat = Dataset.from_list(list(np.array(generated_completions_train).ravel()))

pipe_sentiment = pipeline(
    'sentiment-analysis',
    model = SENTIMENT_MODEL,
)

train_sentiment_results = pipe_sentiment(
    generated_completions_train_flat['generated_text'],
    batch_size = 128,
)

Device set to use cuda:0


In [4]:
N = len(dataset_test)
prompt_completion_list_train = []
for i in range(N):

    prompt = dataset_train[i]['prompt']
    completion_1 = generated_completions_train_flat[2*i]['generated_text']
    score_1 = train_sentiment_results[2*i]['score'] if train_sentiment_results[2*i]['score'] == 'POSITIVE' else 1-train_sentiment_results[2*i]['score']
    completion_2 = generated_completions_train_flat[2*i + 1]['generated_text']
    score_2 = train_sentiment_results[2*i + 1]['score'] if train_sentiment_results[2*i + 1]['score'] == 'POSITIVE' else 1-train_sentiment_results[2*i + 1]['score']
    reward_1 = score_1 * 1000
    reward_2 = score_2 * 1000
    preference_prob = F.softmax(torch.tensor(reward_1-reward_2))
    bernoulli_indicator = torch.bernoulli(preference_prob).item()
    if bernoulli_indicator == 1:
        chosen, rejected = completion_1, completion_2
        reward_chosen, reward_rejected = reward_1, reward_2
    else:
        chosen, rejected = completion_2, completion_1
        reward_chosen, reward_rejected = reward_2, reward_1
    prompt_completion_list_train.append({
        'prompt': prompt,
        'chosen': " ".join(chosen.split()[N_PREFIX_TOKENS:]),
        'rejected': " ".join(rejected.split()[N_PREFIX_TOKENS:]),
        'reward_chosen': reward_chosen,
        'reward_rejected': reward_rejected
    })
prompt_completion_dataset_train = Dataset.from_list(prompt_completion_list_train)
dpo_dataset_train = prompt_completion_dataset_train.select_columns(['prompt', 'chosen', 'rejected'])
    


  preference_prob = F.softmax(torch.tensor(reward_1-reward_2))


In [9]:
#why random sample?
#what is gradient checking, gradient acc, learning rate 
model_args = ModelConfig(LM_MODEL)
torch_dtype = (
    model_args.torch_dtype if model_args.torch_dtype in ['auto', None] else torch.float16
)


model_kwargs = dict(
    revision = model_args.model_revision,
    torch_dtype = torch_dtype,
    attn_implementation = model_args.attn_implementation,
    trust_remote_code = model_args.trust_remote_code,
)

model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs
) 
ref_model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **{**model_kwargs, 'device_map':'cpu'}
)

tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    padding_side = "left",
    trust_remote_code = model_args.trust_remote_code,
)

training_args = DPOConfig(

        gradient_checkpointing=True,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=4,
        learning_rate=5.0e-7,
        logging_steps=50,
        num_train_epochs=1,
        push_to_hub=False,  
        output_dir = "/root/autodl-tmp/.autodl/DPO_tldr",
        report_to = 'none'
    )

trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    train_dataset=dpo_dataset_train,
    processing_class = tokenizer
)

trainer.train()


Extracting prompt in train dataset:   0%|          | 0/25000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/25000 [00:00<?, ? examples/s]

Step,Training Loss
50,0.6932
100,0.6935
150,0.6942


TrainOutput(global_step=196, training_loss=0.6917778521167989, metrics={'train_runtime': 1545.2785, 'train_samples_per_second': 16.178, 'train_steps_per_second': 0.127, 'total_flos': 0.0, 'train_loss': 0.6917778521167989, 'epoch': 1.0})

In [None]:
dpo_model = trainer.model
dpo_tokenizer = trainer.processing_class
repo_id = 'august66/qwen2-sft-dpo'
model.push_to_hub(
    repo_id=repo_id,
    tokenizer=dpo_tokenizer,
)

model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/august66/qwen2-sft-dpo/commit/4263478b54061cb65a94ef1e3148e44e92c0996d', commit_message='Upload Qwen2ForCausalLM', commit_description='', oid='4263478b54061cb65a94ef1e3148e44e92c0996d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/august66/qwen2-sft-dpo', endpoint='https://huggingface.co', repo_type='model', repo_id='august66/qwen2-sft-dpo'), pr_revision=None, pr_num=None)

In [None]:
prompts_test = dataset_test['prompt']
dpo_pipe = pipeline(
    'text-generation',
    model = dpo_model,
    tokenizer = dpo_tokenizer,
)
dpo_completions_test = dpo_pipe(
    prompts_test,
    max_new_tokens = 128,
    do_sample = True,
    truncation = True,
    padding = True,
    top_p = 0.95,
    temperature = 1,
    num_return_sequences = 1,
    batch_size = 128,
)


Device set to use cuda:0
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


In [13]:
dpo_completion_test_flat= Dataset.from_list(list(np.array(dpo_completions_test).ravel()))
pipe = pipeline(
    'sentiment-analysis',
    model = SENTIMENT_MODEL,
)

dpo_sentiment_analysis_test = pipe(
    dpo_completion_test_flat['generated_text'],
    batch_size = 128,
)

Device set to use cuda:0


In [14]:
total_score = 0
for i in range(len(dpo_sentiment_analysis_test)):
    score = dpo_sentiment_analysis_test[i]['score']
    if dpo_sentiment_analysis_test[i]['label'] == 'NEGATIVE':
        score = 1 - score
    total_score += score
average_score = total_score / len(dpo_sentiment_analysis_test)

In [1]:
average_score

NameError: name 'average_score' is not defined