In [1]:
from datasets import load_dataset
from transformers import pipeline
import torch
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2023-09-08 22:29:09.331202: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
class AnthropicDataset(torch.utils.data.Dataset):
    def __init__(self, split, item_to_ret='prompt'):
        self.hf_dataset = load_dataset('Anthropic/hh-rlhf', split=split)
        self.prompts, self.chosen, self.reject = [], [], []
        for d in (self.hf_dataset):
            try:
                prompt, chosen, reject = self.process_response(d)
                self.prompts.append(prompt)
                self.chosen.append(chosen)
                self.reject.append(reject)
            except:
                continue
        self.set_dataset_type(item_to_ret)

    def set_dataset_type(self, item_to_ret):
        if item_to_ret == 'prompt':
            self.dataset = self.prompts
        elif item_to_ret == 'chosen':
            self.dataset = self.chosen
        elif item_to_ret == 'rejected':
            self.dataset = self.reject
        else:
            self.dataset = list(zip(self.prompts, self.chosen, self.reject))

    def process_response(self, x):
        chosen = x['chosen']
        reject = x['rejected']
        ind = chosen.rfind('\n\nAssistant:')
        prompt = chosen[:ind].strip()
        assert reject[:len(prompt)] == prompt
        chosen = chosen[ind + len('\n\nAssistant:'):].strip()
        reject = reject[ind + len('\n\nAssistant:'):].strip()
        return prompt, chosen, reject


    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        return self.dataset[i]

In [3]:
class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self, split, item_to_ret='prompt'):
        self.hf_dataset = load_dataset('openai/summarize_from_feedback', 'comparisons', split=split)
        self.prompts, self.chosen, self.reject = [], [], []
        for d in (self.hf_dataset):
            try:
                prompt, chosen, reject = self.process_response(d)
                self.prompts.append(prompt)
                self.chosen.append(chosen)
                self.reject.append(reject)
            except:
                continue
        self.set_dataset_type(item_to_ret)

    def set_dataset_type(self, item_to_ret):
        if item_to_ret == 'prompt':
            self.dataset = self.prompts
        elif item_to_ret == 'chosen':
            self.dataset = self.chosen
        elif item_to_ret == 'rejected':
            self.dataset = self.reject
        else:
            self.dataset = list(zip(self.prompts, self.chosen, self.reject))


    def process_response(self, x):
        prompt = x['info']['post'].strip()
        first = x['summaries'][0]['text'].strip()
        second = x['summaries'][1]['text'].strip()
        choice = x['choice']
        if choice == 0:
            return prompt, first, second
        elif choice == 1:
            return prompt, second, first
        else:
            assert False
        
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        return self.dataset[i]

In [4]:
class SentenceDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.og_dataset = dataset
        self.sentences = []
        self.sentence_to_prompt = []
        for i,x in enumerate(self.og_dataset):
            sents = self.break_sentences(x)
            self.sentences.extend(sents)
            self.sentence_to_prompt.extend([i] * len(sents))
        assert len(self.sentence_to_prompt) == len(self.sentences)

    def break_sentences(self, x):
        return sent_tokenize(x)

    def __len__(self):
        return len(self.sentences)
        
    def __getitem__(self, i):
        return self.sentences[i]

In [5]:
def get_deberta_test_datasets():
    dataset = {}
    for t in ['prompt', 'chosen', 'rejected']:
        sub_dataset = {}
        sub_dataset['openai/summarize_from_feedback'] = SummaryDataset('validation', item_to_ret=t)
        # sub_dataset = load_dataset('openai/webgpt_comparisons')
        # sub_dataset load_dataset('Dahoas/synthetic-instruct-gptj-pairwise')
        #sub_dataset['Anthropic/hh-rlhf'] = load_dataset('Anthropic/hh-rlhf', split='test', item_to_ret=t)
        for k,v in sub_dataset.items():
            sub_dataset[k] = SentenceDataset(v)
        dataset[t] = sub_dataset
    return dataset

In [8]:
datasets = get_deberta_test_datasets()

Found cached dataset summarize_from_feedback (/home/ubuntu/.cache/huggingface/datasets/openai___summarize_from_feedback/comparisons/0.0.0/483f970ceb55b926b0a087ef4f678ab1b089bc8174a107a452c6152e88af7ff0)
Found cached dataset summarize_from_feedback (/home/ubuntu/.cache/huggingface/datasets/openai___summarize_from_feedback/comparisons/0.0.0/483f970ceb55b926b0a087ef4f678ab1b089bc8174a107a452c6152e88af7ff0)
Found cached dataset summarize_from_feedback (/home/ubuntu/.cache/huggingface/datasets/openai___summarize_from_feedback/comparisons/0.0.0/483f970ceb55b926b0a087ef4f678ab1b089bc8174a107a452c6152e88af7ff0)


In [9]:
print(len(datasets))
for k,vs in datasets.items():
    print(k)
    for k_,v in vs.items():
        print(k_, len(v))

3
prompt
openai/summarize_from_feedback 1141862
chosen
openai/summarize_from_feedback 172862
rejected
openai/summarize_from_feedback 162461


In [14]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "Helsinki-NLP/opus-mt-en-zh"
#tokenizer_translate = AutoTokenizer.from_pretrained(model_name)
#model_translate = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe_translate = pipeline("translation", model=model_name, device_map="auto")

TypeError: ('Keyword argument not understood:', 'device_map')

In [11]:
batch_size = 256
all_result_types = {}
for text_type,sub_dataset in datasets.items():
    results_sub = {}
    for model_name, dataset in sub_dataset.items():
        all_results = []
        for out in tqdm(pipe_translate(dataset, batch_size=batch_size), total=len(dataset)):
            #assert len(out) == 1
            all_results.append(out[0]['translation_text'])
        results_sub[model_name] = all_results
    all_result_types[text_type] = results_sub

  0%|          | 0/1141862 [00:01<?, ?it/s]


AssertionError: 