In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
%pip install -q -U docling

Note: you may need to restart the kernel to use updated packages.


In [1]:
from datasets import load_dataset, Dataset, concatenate_datasets
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from instructlab.sdg.utils.parse_and_convert import (
    create_auxiliary_dataset,
    generate_knowledge_qa_dataset,
    build_raft_dataset
)
import re
import random
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_DIR = "/new_data/knowledge_rh/quality/base_datasets"
def add_icl_examples(ds, qna, seed_example_file_name = "quality_seed_example.jsonl"):
    """
    Cross every record of ``ds`` with every in-context example in ``qna`` and
    write the result as a seed file under ``BASE_DIR``.

    Args:
        ds: iterable of dict records; each record is copied, never mutated.
        qna: iterable of dicts providing ``'context'`` and
            ``'question_and_answers'``; the first three entries of the latter
            are read, each through its ``'question'`` and ``'answer'`` keys.
        seed_example_file_name: file name (inside ``BASE_DIR``) to write to.

    The output has ``len(qna) * len(ds)`` rows, grouped by ``qna`` entry.
    The assembled dataset is printed; nothing is returned.
    """
    rows = []
    for qa in qna:
        for record in ds:
            row = record.copy()
            # Resolved inside the inner loop so that `qa` is only inspected
            # when there is at least one record to attach it to.
            context = qa['context']
            pairs = qa['question_and_answers']
            row.update({
                'icl_document': context,
                'icl_query_1': pairs[0]['question'],
                'icl_response_1': pairs[0]['answer'],
                'icl_query_2': pairs[1]['question'],
                'icl_response_2': pairs[1]['answer'],
                'icl_query_3': pairs[2]['question'],
                'icl_response_3': pairs[2]['answer'],
            })
            rows.append(row)
    seed_ds = Dataset.from_list(rows)
    print(seed_ds)
    seed_ds.to_json(f"{BASE_DIR}/{seed_example_file_name}", orient='records')


def _conv_pretrain(rec, tokenizer):
    if tokenizer is not None:
        rec['unmask'] = True
        return rec
    rec["messages"] = [
        {
            "role": "pretraining",
            "content": f"<|user|>\n{rec['messages'][0]['content']}\n<|assistant|>\n{rec['messages'][1]['content']}",
        }
    ]
    return rec


def chunk_document(document, max_tokens=500, tokenizer_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Chunk the document into chunks of ~``max_tokens`` tokens (500 by default).

    Args:
        document: the document text (markdown) to chunk.
        max_tokens: token budget handed to ``HybridChunker``.
        tokenizer_name: Hugging Face tokenizer id used to count tokens.

    Returns:
        A list with one entry per chunk, each being whatever
        ``chunker.serialize`` returns for that chunk.

    The text is staged in a private temporary directory (still under the file
    name ``test.md``) instead of the current working directory, so calling this
    function no longer overwrites an existing ``test.md`` or leaves one behind.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        doc_source = os.path.join(tmp_dir, "test.md")
        # Explicit encoding so the staged file does not depend on the locale.
        with open(doc_source, 'w', encoding="utf-8") as f:
            f.write(document)
        # The conversion is completed before the temporary directory is removed.
        doc = DocumentConverter().convert(source=doc_source).document

    chunker = HybridChunker(
        max_tokens=max_tokens,
        tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
        merge_peers=True,
    )
    return [chunker.serialize(chunk=chunk) for chunk in chunker.chunk(dl_doc=doc)]

def is_string_complete(s):
    """
    Return True when ``s`` ends with one of ``. ! ? " ' |``, else False.

    Used as a cheap truncation check on generated text. The string is tested
    as given: trailing whitespace is not stripped, and an empty string is
    reported as incomplete.

    The earlier "ends with an incomplete word" regex branch was removed: its
    condition could never hold and both outcomes returned False, so results
    are unchanged.
    """
    return s.endswith((".", "!", "?", '"', "'", "|"))

def filter_ds(ds):
    """
    Clean generated question/response rows and drop the unusable ones.

    Steps, in order:
      1. remove every ``[END]`` marker from ``question`` and ``response`` and
         strip surrounding whitespace;
      2. keep rows whose ``question`` passes ``is_string_complete``;
      3. keep rows whose ``response`` passes ``is_string_complete``;
      4. drop rows where ``[QUESTION]`` appears in the question or ``[ANSWER]``
         appears in the response.

    Returns the filtered dataset; the input ``ds`` object is not modified.
    """
    def _strip_end_marker(row):
        return {
            "question": row["question"].replace("[END]", "").strip(),
            "response": row["response"].replace("[END]", "").strip(),
        }

    def _question_is_complete(row):
        return is_string_complete(row["question"])

    def _response_is_complete(row):
        return is_string_complete(row["response"])

    def _has_no_leaked_tags(row):
        return not ('[QUESTION]' in row['question'] or '[ANSWER]' in row['response'])

    cleaned = ds.map(_strip_end_marker, num_proc=72)
    cleaned = cleaned.filter(_question_is_complete, num_proc=72)
    cleaned = cleaned.filter(_response_is_complete, num_proc=72)
    return cleaned.filter(_has_no_leaked_tags)

def create_training_mix(ds, create_summary=False, model_name=None, keep_context_separate=False, add_raft=False):
    """
    Assemble the training mix for a generated knowledge dataset.

    Args:
        ds: generated dataset passed to ``generate_knowledge_qa_dataset`` (and
            to ``create_auxiliary_dataset`` when ``create_summary`` is set).
        create_summary: also include the auxiliary dataset built from ``ds``.
        model_name: Hugging Face model id whose tokenizer is loaded. When given,
            ``_conv_pretrain`` keeps the chat messages and sets ``unmask=True``;
            when None, records are rewritten into the raw pretraining format.
        keep_context_separate: forwarded to ``generate_knowledge_qa_dataset``
            for the main knowledge split.
        add_raft: also include a RAFT split (built with ``p=0.4`` from a
            context-separated copy) carrying ``unmask=False``.

    Returns:
        The knowledge split alone, or the concatenation of
        [knowledge, summary (optional), RAFT (optional)] in that order.
    """
    # Bind the tokenizer unconditionally: previously the default
    # model_name=None left it undefined and the first .map() call raised
    # UnboundLocalError.
    tokenizer = AutoTokenizer.from_pretrained(model_name) if model_name is not None else None

    knowl_train = generate_knowledge_qa_dataset(ds, keep_context_separate=keep_context_separate)
    knowl_train_pretrain = knowl_train.map(_conv_pretrain, fn_kwargs={"tokenizer": tokenizer}, num_proc=10)

    knowledge_ds_raft = None
    if add_raft:
        knowl_train_c = generate_knowledge_qa_dataset(ds, keep_context_separate=True)
        knowledge_ds_raft = build_raft_dataset(knowl_train_c, p=0.4)
        knowledge_ds_raft = knowledge_ds_raft.add_column("unmask", [False] * len(knowledge_ds_raft))

    if model_name is not None:
        print(f"Using model {model_name} as chat template")
    else:
        print("No model given; converting records to the raw pretraining format")

    parts = [knowl_train_pretrain]
    if create_summary:
        summary_ds = create_auxiliary_dataset(ds)
        parts.append(summary_ds.map(_conv_pretrain, fn_kwargs={"tokenizer": tokenizer}))
    if knowledge_ds_raft is not None:
        parts.append(knowledge_ds_raft)

    # With nothing to add, hand back the knowledge split itself (no concat).
    if len(parts) == 1:
        return knowl_train_pretrain
    return concatenate_datasets(parts)

def sample_per_doc(ds, final_dataset_size):
    """
    Down-sample ``ds`` to roughly ``final_dataset_size`` rows, spread evenly
    over its documents.

    Each distinct ``document`` contributes at most
    ``final_dataset_size // n_documents + 1`` rows, or all of its rows when it
    has fewer. The result can therefore be slightly larger or smaller than
    ``final_dataset_size``.

    Args:
        ds: dataset exposing ``to_pandas()`` with the columns ``document``,
            ``domain``, ``document_outline``, ``question`` and ``response``.
        final_dataset_size: target number of rows for the whole sample.

    Returns:
        A ``Dataset`` with the five columns above, one sampled source row per
        output row, ordered by document.

    Whole rows are sampled. Previously ``domain`` and ``document_outline`` were
    drawn by separate ``random.sample`` calls and never exploded, so they no
    longer matched the sampled question/response and came back as lists.
    """
    random.seed(42)
    # Fresh 0..n-1 labels so the sampled labels can be fed straight to .loc.
    df = ds.to_pandas().reset_index(drop=True)
    keep_columns = ['document', 'domain', 'document_outline', 'question', 'response']

    # max(..., 1) keeps an empty input from dividing by zero.
    n_documents = max(len(df['document'].unique()), 1)
    per_doc_size = final_dataset_size // n_documents + 1

    sampled_rows = []
    for _, doc_rows in df.groupby('document'):
        row_labels = doc_rows.index.tolist()
        sampled_rows.extend(random.sample(row_labels, min(per_doc_size, len(row_labels))))

    df_subset = df.loc[sampled_rows, keep_columns].reset_index(drop=True)
    return Dataset.from_pandas(df_subset)

#### Entigraph

In [3]:

# --- Knowledge split: load the Entigraph generations and keep the usable rows ---
ds_1 = load_dataset('json', data_dir="/new_data/knowledge_rh/quality/entigraph_knowledge1.0_phi4_first_24_n_5/gen/", split="train")
# Keep only rows whose score is '2' and whose judgment is 'YES'.
ds_1 = ds_1.filter(lambda x: x['score'] == '2' and x['judgment'] == 'YES')
ds_1 = filter_ds(ds_1)
ds_1 = ds_1.add_column('domain', ["articles"] * ds_1.num_rows)
ds_1 = ds_1.add_column('raw_document', ds_1['document'])
ds_1 = create_training_mix(ds_1, create_summary=False, model_name="meta-llama/Meta-Llama-3-8B-Instruct")
ds_1 = ds_1.remove_columns(['metadata', 'id']).shuffle(seed=42)

# --- 25% / 50% / 100% prefixes of the shuffled knowledge split ---
ds_1_25_percent = ds_1.select(range(int(len(ds_1) * 0.25)))
ds_1_50_percent = ds_1.select(range(int(len(ds_1) * 0.5)))
ds_1_100_percent = ds_1
for _knowledge_subset in (ds_1_25_percent, ds_1_50_percent, ds_1_100_percent):
    print(_knowledge_subset)

for _knowledge_subset, _out_path in (
    (ds_1_25_percent, "/new_data/knowledge_rh/quality/training_mix/entigraph_knowledge1.0_phi4_first_24_n_5_25_percent.jsonl"),
    (ds_1_50_percent, "/new_data/knowledge_rh/quality/training_mix/entigraph_knowledge1.0_phi4_first_24_n_5_50_percent.jsonl"),
    (ds_1_100_percent, "/new_data/knowledge_rh/quality/training_mix/entigraph_knowledge1.0_phi4_first_24_n_5_100_percent.jsonl"),
):
    _knowledge_subset.to_json(_out_path, orient='records', lines=True)

# --- UltraChat rows, one sample per knowledge subset, matched in size ---
ultra_chat_ds = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    trust_remote_code=True,
    split="train_sft",
)
ultra_chat_ds = ultra_chat_ds.remove_columns(['prompt', 'prompt_id'])
ultra_chat_ds = ultra_chat_ds.add_column('unmask', [False] * ultra_chat_ds.num_rows)
ultra_chat_ds_25_percent, ultra_chat_ds_50_percent, ultra_chat_ds_100_percent = (
    ultra_chat_ds.shuffle(seed=42).select(range(len(_knowledge_subset)))
    for _knowledge_subset in (ds_1_25_percent, ds_1_50_percent, ds_1_100_percent)
)

# --- Knowledge + UltraChat mixes; `ds` ends up holding the 100% mix ---
for _knowledge_subset, _chat_subset, _out_path in (
    (ds_1_25_percent, ultra_chat_ds_25_percent, "/new_data/knowledge_rh/quality/training_mix/entigraph_knowledge1.0_phi4_first_24_n_5_25_percent_ultra_chat.jsonl"),
    (ds_1_50_percent, ultra_chat_ds_50_percent, "/new_data/knowledge_rh/quality/training_mix/entigraph_knowledge1.0_phi4_first_24_n_5_50_percent_ultra_chat.jsonl"),
    (ds_1_100_percent, ultra_chat_ds_100_percent, "/new_data/knowledge_rh/quality/training_mix/entigraph_knowledge1.0_phi4_first_24_n_5_100_percent_ultra_chat.jsonl"),
):
    ds = concatenate_datasets([_knowledge_subset, _chat_subset])
    ds.to_json(_out_path, orient='records', lines=True)

Using model meta-llama/Meta-Llama-3-8B-Instruct as chat template
Dataset({
    features: ['messages', 'unmask'],
    num_rows: 40704
})
Dataset({
    features: ['messages', 'unmask'],
    num_rows: 81409
})
Dataset({
    features: ['messages', 'unmask'],
    num_rows: 162818
})


Creating json from Arrow format: 100%|██████████| 41/41 [00:01<00:00, 29.58ba/s]
Creating json from Arrow format: 100%|██████████| 82/82 [00:02<00:00, 36.53ba/s]
Creating json from Arrow format: 100%|██████████| 163/163 [00:04<00:00, 36.55ba/s]
Creating json from Arrow format: 100%|██████████| 82/82 [00:02<00:00, 28.52ba/s]
Creating json from Arrow format: 100%|██████████| 163/163 [00:06<00:00, 24.06ba/s]
Creating json from Arrow format: 100%|██████████| 326/326 [00:11<00:00, 28.93ba/s]


1647092578