In [None]:
from datasets import Dataset, load_dataset
import datasets
import pandas as pd
from datasets import Dataset
import random
import numpy as np
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer
import torch
from transformers import BitsAndBytesConfig
from torch import nn
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
import torch.nn.functional as F

In [None]:
batch_size = 1
model_name = "humarin/chatgpt_paraphraser_on_T5_base"
args = Seq2SeqTrainingArguments(
    output_dir = "model_with_data",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,
    predict_with_generate=True,
    logging_strategy="steps",
    logging_steps = 14869,
    fp16=True,
)

In [None]:
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
ref_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import re, string, pickle
#re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return re_tok.sub(r' \1 ', s).split()

class Judge():
    def __init__(self):
        with open('judge_model/tfidf_vectorizer.pkl', 'rb') as file:
            self.vec = pickle.load(file)
        with open('judge_model/logistic_regression_model.pkl', 'rb') as file:
            self.model = pickle.load(file)
        with open('judge_model/r_values.pkl', 'rb') as file:
            self.r = pickle.load(file)
            
    def judge(self, words):
        tokens = self.vec.transform(words)
        preds = self.model.predict_proba(tokens.multiply(self.r))[:,1]
        
        return 1 if preds>0.5 else 0

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
dataset = load_dataset('csv', data_files='dirty.csv', split='train')

In [None]:
def preprocess_function(examples):
    text = examples['comment_text']
    examples['input_ids'] = tokenizer("paraphrase: " + text, max_length=128, truncation=True)['input_ids']
    # example['labels'] = tokenizer(text_target=neutral1, max_length=max_target_length, truncation=True)['input_ids']
    # new_examples['labels'].append(labels['input_ids'])
    return examples

dataset = dataset.map(preprocess_function, remove_columns=['id','comment_text','toxic','severe_toxic','obscene','threat','insult','identity_hate'])

In [None]:
print(dataset[0])

In [None]:
class MyTrainer(Seq2SeqTrainer):

    def __init__(self,
                model,
                ref_model,
                args,
                judge=None,
                train_dataset=None,
                eval_dataset=None,
                data_collator=None,
                tokenizer=None):
        super().__init__(model,
                args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer)
        self.judge = judge
        self.total_true = 0
        self.ref_model = ref_model.to(model.device)
    
    def toxic_score(self, model, sent, w_answer, w_label):
        logits = model(input_ids=sent, decoder_input_ids=w_answer).logits
        per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=w_label.unsqueeze(2)).squeeze(2)

        return per_token_logps.sum(-1)
        
    def clean_score(self, model, sent, c_answer, c_label):
        logits = model(input_ids=sent, decoder_input_ids=c_answer).logits
        per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=c_label.unsqueeze(2)).squeeze(2)
        return per_token_logps.sum(-1)
    
    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.

        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)
        
        new_inputs = model.generate(inputs.input_ids, max_new_tokens=20)
        words = self.tokenizer.batch_decode(new_inputs, skip_special_tokens=True)
        result = self.judge.judge(words)
        labels = new_inputs.clone()
        if random.random() >=0.9:
            result = result^1
        if result == 1:
            # this works for only toxic words
            policy_rejected_logps = self.toxic_score(model, inputs.input_ids, new_inputs, labels) # original sentence, wrong answer, wrong labels
            reference_rejected_logps = self.toxic_score(self.ref_model, inputs.input_ids, new_inputs, labels) 
            
            pi_logratios = 0 - policy_rejected_logps
            ref_logratios = 0 - reference_rejected_logps
            logits = pi_logratios - ref_logratios

            loss = (
                - F.logsigmoid(0.1 * logits) * (1 - 0.1)
                - F.logsigmoid(-0.1 * logits) * 0.1
            ).mean()
        elif result == 0:
            # this works for clean words
            self.total_true += 1
            policy_chosen_logps = self.clean_score(model, inputs.input_ids, new_inputs, labels) # original sentence, correct answer, correct labels
            policy_rejected_logps = self.toxic_score(model, inputs.input_ids, inputs.input_ids.clone(), inputs.input_ids.clone()) #original sentence, wrong answer, wrong labels
            reference_chosen_logps = self.clean_score(self.ref_model, inputs.input_ids, new_inputs, labels) # original sentence, correct answer, correct labels
            reference_rejected_logps = self.toxic_score(self.ref_model, inputs.input_ids, inputs.input_ids.clone(), inputs.input_ids.clone()) #original sentence, wrong answer, wrong labels    
            
            pi_logratios = policy_chosen_logps - policy_rejected_logps
            ref_logratios = reference_chosen_logps - reference_rejected_logps

            logits = pi_logratios - ref_logratios
            
            loss = (
                - F.logsigmoid(0.1 * logits) * (1 - 0.1)
                - F.logsigmoid(-0.1 * logits) * 0.1
            ).mean()
            
        self.accelerator.backward(loss)

        return loss.detach() / self.args.gradient_accumulation_steps
    
    def log(self, logs: Dict[str, float]) -> None:
        """
        Log `logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (`Dict[str, float]`):
                The values to log.
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)
        print('number of clean words: ', self.total_true)
        output = {**logs, **{"step": self.state.global_step}}
        self.state.log_history.append(output)
        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)

In [None]:
judge = Judge()


In [None]:
trainer = MyTrainer(
    model,
    ref_model,
    args,
    judge = judge,
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

# 8330 16613 25134 33497 41799 50192 58576 67011 75420 83766 92114 100498 108912 117328 125750
# 8330 8230  8521  8363  8302  8393  8384  8435  8409  8346  8348  8384   8416   8416   8422

In [None]:
words = "you are a stupid fuck your mother's cunt stinks"
token = tokenizer("paraphrase: " + words, return_tensors="pt")

new_inputs = model.generate(token.input_ids.to(model.device))
print(tokenizer.batch_decode(new_inputs, skip_special_tokens=True))

In [None]:
trainer.save_model("model_with_dpo_90")

In [6]:
test = AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cuda')

In [17]:
print(dataset)

Dataset({
    features: ['input_ids'],
    num_rows: 14869
})


In [None]:
test = AutoModelForSeq2SeqLM.from_pretrained('humarin/chatgpt_paraphraser_on_T5_base').to('cuda')

count = 0
for i in range(len(dataset)):
    new_inputs = test.generate(torch.tensor([dataset[i]['input_ids']]).to(test.device))
    words = tokenizer.batch_decode(new_inputs, skip_special_tokens=True)
    result = judge.judge(words)
    if result==0:
        count += 1
        
print(count)

10965


In [11]:
my_model = AutoModelForSeq2SeqLM.from_pretrained('model_with_dpo_second').to('cuda')

count = 0
for i in range(len(dataset)):
    new_inputs = my_model.generate(torch.tensor([dataset[i]['input_ids']], max_new_tokens=20).to(my_model.device))
    words = tokenizer.batch_decode(new_inputs, skip_special_tokens=True)
    result = judge.judge(words)
    if result==0:
        count += 1
        
print(count)

TypeError: tensor() got an unexpected keyword argument 'max_new_tokens'

In [12]:
import random

print(random.random())

0.5124784349816838


In [15]:
a = 0
print(a^1)

1
