In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../src')
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

# Use DEITA Pipeline on UltraFBK dataset


1. reformat `ultrafbk` to be in the `sharegpt` format (quite minimal work)
2. pipe into DEITA pipeline

In [2]:
from datasets import load_dataset

ultrafbk_dataset = load_dataset(
    "when2rl/UltraFeedback_binarized_cleaned_annotated",
    split='train_prefs'
)

In [3]:
import hashlib


def reformat_to_sharegpt(data_dict):
    old_conversations = data_dict['messages']
    conversations = []
    for i, turn in enumerate(old_conversations):
        role = turn['role']
        content = turn['content']
        # sanity checks
        if i % 2 == 0 and role != "user":
            raise ValueError(f"Expected user role, got {role}")
        if i % 2 == 1 and role != "assistant":
            raise ValueError(f"Expected assistant role, got {role}")
        # append
        conversations.append({
            'from': "human" if role == "user" else "gpt",
            'value': content
        })
    data_dict['conversations'] = conversations

    ### add full id
    text_chosen = data_dict['chosen']
    text_rejected = data_dict['rejected']
    full_encoded = f"{text_chosen} {text_rejected}"
    full_encoded_id = hashlib.sha256(full_encoded.encode("utf-8")).hexdigest()
    data_dict['full_id'] = full_encoded_id
    
    return data_dict

In [4]:
ultrafbk_dataset_reformatted = ultrafbk_dataset.map(
    reformat_to_sharegpt,
    num_proc=8,
    keep_in_memory=True,
    desc="Reformatting to ShareGPT format"
)
ultrafbk_dataset_reformatted = ultrafbk_dataset_reformatted.select_columns(
    ['conversations', 'full_id', 'prompt_id']
)

Reformatting to ShareGPT format (num_proc=8):   0%|          | 0/61135 [00:00<?, ? examples/s]

In [5]:
ultrafbk_dataset_reformatted

Dataset({
    features: ['conversations', 'full_id', 'prompt_id'],
    num_rows: 61135
})

In [6]:
tmp_df = ultrafbk_dataset_reformatted.select(range(500)).to_pandas()
tmp_df.to_json("../data/deita/tmp.json", orient="records", lines=True)

In [7]:
from deita.pipeline import Pipeline

In [8]:
data_path = "../data/deita/tmp.json"
complexity_scored_data_path = "../data/deita/tmp_complexity.json"
embedding_path = "../data/deita/tmp_embeddings.pkl"
output_path = "../data/deita/tmp_deita.json"

In [None]:
pipeline = Pipeline(
    "score_pipeline",
    data_path = data_path,   # json file with sharegpt format
    scorer = "mistral",   # [mistral, llama]
    scorer_name_or_path = "hkust-nlp/deita-complexity-scorer",  # scorer name or path e.g. hkust-nlp/deita-complexity-scorer
    is_vllm = False,  # launch with vllm [True, False]
    score_type = ["complexity", "quality"], # [complexity, quality]
    output_path = complexity_scored_data_path
)  # output path (json format)

pipeline.run()

In [9]:
"""
parser.add_argument("--data_path", type=str, default=None)
parser.add_argument("--output_path", type=str, default=None)
parser.add_argument("--max_length", type=int, default=2048)
parser.add_argument("--batch_size_per_device", type=int, default=4)
parser.add_argument("--conv_template", type=str, default="vicuna_v1.1")
parser.add_argument("--use_flash_attention", type=bool, default=False)
parser.add_argument("--only_answer", type=bool, default=False)
parser.add_argument("--random_shuffle", type=bool, default=False)
parser.add_argument("--model_name_or_path", type=str, default="mistralai/Mistral-7B-v0.1")
"""

embed_pipeline = Pipeline(
    "embed_pipeline", 
    data_path = data_path,   # json file with sharegpt format
    output_path = embedding_path,  # output path (pickle format)
    model_name_or_path = "mistralai/Mistral-7B-v0.1",  # model name or path e.g. mistralai/Mistral-7B-v0.1
    max_length = 2048,
    use_flash_attention = True,
    batch_size_per_device = 1,
    conv_template = "vicuna_v1.1",
    only_answer = False,
    random_shuffle = False,
    bfloat16 = True
)

embed_pipeline.run()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizing and reformatting instruction data (num_proc=32):   0%|          | 0/500 [00:00<?, ? examples/s]

input_ids torch.Size([16, 1038])
input_ids torch.Size([16, 515])
input_idsinput_ids torch.Size([16, 525])
 torch.Size([16, 1249])
input_ids torch.Size([16, 1657])
input_ids input_idstorch.Size([16, 1445])
 torch.Size([16, 705])
input_ids input_idstorch.Size([16, 1857])
 torch.Size([16, 894])
input_ids torch.Size([16, 695])
input_ids torch.Size([16, 813])
input_ids torch.Size([16, 587])
input_ids torch.Size([16, 1066])
input_ids torch.Size([16, 786])
input_ids
 input_idstorch.Size([16, 1811])
 torch.Size([16, 923])input_ids torch.Size([16, 928])
input_idsinput_ids torch.Size([16, 1427])
 torch.Size([16, 993])
input_ids torch.Size([16, 1430])
input_ids
 torch.Size([15, 1657])input_ids torch.Size([15, 1246])
input_idsinput_ids torch.Size([15, 810]) 
torch.Size([15, 2048])
input_ids torch.Size([15, 1681])
input_ids torch.Size([15, 1249])
input_ids input_idstorch.Size([15, 2048])
 torch.Size([15, 1099])
input_ids torch.Size([15, 1195])
input_ids torch.Size([15, 1832])
input_ids torch.Size([

100%|██████████| 500/500 [01:03<00:00,  7.92it/s]


In [10]:
filter_pipeline = Pipeline(
    "filter_pipeline", 
    data_path = data_path,  # json file with sharegpt format
    other_data_path = embedding_path,  # embedding file path (pickle format)
    threshold = 0.9,  # filter threshold default: 0.9 
    data_size = 100,  # size of selected data
    chunk_size = 100000,  # used for more efficient GPU computing  default: 100000
    sort_key = "complexity_scores,quality_scores",  # default: "complexity_scores,quality_scores"
    output_path = output_path,  # json format output path
    distance_metric = "cosine",  # default: cosine
    embedding_field = "embedding",  # default: embedding
    is_compression = False,  # default: False
    device = 0  # GPU IDX, default: 0
)

filter_pipeline.run()

KeyError: 'complexity_scores'