#### Visualize the existing dataset

In [1]:
import datasets    

dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')

train_dataset = dataset['train']
test_dataset = dataset['test']

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 79168 examples [00:00, 3426281.64 examples/s]
Generating dev split: 8757 examples [00:00, 2303802.30 examples/s]
Generating test split: 3610 examples [00:00, 979964.89 examples/s]


In [2]:
# Print sample examples from each dataset
print("Sample examples from the training dataset:")
for i, example in enumerate(train_dataset):
    if i >= 5:  # Print first 5 examples
        break
    print(example)

print("\nSample examples from the test dataset:")
for i, example in enumerate(test_dataset):
    if i >= 5:  # Print first 5 examples
        break
    print(example)

Sample examples from the training dataset:
{'id': 'train_0', 'question': 'total number of death row inmates in the us', 'golden_answers': ['2,718']}
{'id': 'train_1', 'question': 'big little lies season 2 how many episodes', 'golden_answers': ['seven']}
{'id': 'train_2', 'question': 'who sang waiting for a girl like you', 'golden_answers': ['Foreigner']}
{'id': 'train_3', 'question': 'where do you cross the arctic circle in norway', 'golden_answers': ['Saltfjellet']}
{'id': 'train_4', 'question': 'who is the main character in green eggs and ham', 'golden_answers': ['Sam-I-am']}

Sample examples from the test dataset:
{'id': 'test_0', 'question': 'who got the first nobel prize in physics', 'golden_answers': ['Wilhelm Conrad Röntgen']}
{'id': 'test_1', 'question': 'when is the next deadpool movie being released', 'golden_answers': ['May 18, 2018']}
{'id': 'test_2', 'question': 'which mode is used for short wave broadcast service', 'golden_answers': ['Olivia', 'MFSK']}
{'id': 'test_3', 'q

## Load our CPQ D2 dataset
https://orahub.oci.oraclecorp.com/cx-cnap/agent-dbt-mlops/-/blob/dbt/seeds/all_combinations.json?ref_type=heads has the input output pairs of the user request and the product configuration output, with labels for `correct` and `incorrect`.

We only need the `correct` pairs for training in this scenario.

In [None]:
import json
from datasets import Dataset

### Edit this path to point to the data source
with open("../datasets/all_combinations.json") as f:
    raw = json.load(f)

# Extract the actual data list
data = raw["pairs"]

# Create a Dataset object
dataset = Dataset.from_list(data)
print(len(dataset))

# Get only correct labels
correct_dataset = dataset.filter(lambda x: x["label"] == "correct")

378


Filter: 100%|██████████| 378/378 [00:00<00:00, 69139.89 examples/s]


In [14]:
print(len(correct_dataset))
print(dataset[1])
print(dataset[5]['output']['features'])

122
{'input': 'What is the total cost of a High-Performance Compute Cluster with 2 GPU units, 256GB DDR4 memory and 1TB SSD storage?', 'output': {'compute_units': ['CPU', 'GPU'], 'features': [], 'memory': '128GB DDR4', 'package': 'Standard', 'product': 'High-Performance Compute Cluster', 'storage': '2TB NVMe'}, 'label': 'incorrect'}
['Secure Boot']


#### Split into Train and Test

In [16]:
# -- Shuffle and split into train/test
import random

TEST_SPLIT_RATIO = 0.2
SEED = 42


random.seed(SEED)
indices = list(range(len(dataset)))
random.shuffle(indices)


split_index = int(len(indices) * (1 - TEST_SPLIT_RATIO))
train_indices = indices[:split_index]
test_indices = indices[split_index:]

train_dataset = dataset.select(train_indices)
test_dataset = dataset.select(test_indices)


In [17]:
print(len(train_dataset))
print(len(test_dataset))

302
76


### Reformat

#### Prompt Template

In [29]:
def make_prefix(dp, template_type):
    question = dp['input']

    # NOTE: also need to change reward_score/countdown.py
    if template_type == 'base':
        """This works for any base model"""
        prefix = f"""Answer the given question. \
You must conduct reasoning inside <think> and </think> first every time you get new information. \
After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
You can search as many times as your want. \
If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
    else:
        raise NotImplementedError
    return prefix

In [32]:
# add a row to each data item that represents a unique id
data_source = 'cpq'

def make_map_fn(split):

    def process_fn(example, idx):
        # print(example)
        example['input'] = example['input'].strip()
        # if example['input'][-1] != '?':
        #     example['input'] += '?'
        question = make_prefix(example, template_type="base")
        solution = {
            "target": example['output'],
        }

        data = {
            "data_source": data_source,
            "prompt": [{
                "role": "user",
                "content": question,
            }],
            "ability": "fact-reasoning",
            "reward_model": {
                "style": "rule",
                "ground_truth": solution
            },
            "extra_info": {
                'split': split,
                'index': idx,
            }
        }
        return data

    return process_fn

In [34]:
import os


train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)

local_dir = "../data/cpq_search"

train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

Map: 100%|██████████| 302/302 [00:00<00:00, 8649.05 examples/s]
Map: 100%|██████████| 76/76 [00:00<00:00, 6724.62 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 620.37ba/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 932.69ba/s]


82856

## Knowledge Corpus

Source: https://orahub.oci.oraclecorp.com/cx-cnap/agent-dbt-mlops/-/blob/dbt/seeds/compute_config.json

In [None]:
import json

with open("../datasets/compute_config.json", "r") as f:
    data = json.load(f)

output = []
id_counter = 0

for product in data["compute_resource_configurations"]:
    base_price = product["base_price"]
    product_name = product["product"]

    for package in product["packages"]:
        title = f"{product_name} - {package['name']}"
        lines = [
            f"Base Price: ${base_price}",
            f"Package Modifier: ${package['price_modifier']}"
        ]

        # Compute units
        compute_texts = []
        for unit in package.get("compute_units", []):
            unit_type = unit["type"]
            options = ", ".join([f"{opt['model']} (${opt['price_modifier']})" for opt in unit["options"]])
            compute_texts.append(f"- {unit_type}: {options}")
        if compute_texts:
            lines.append("Compute Units:\n" + "\n".join(compute_texts))

        # Memory
        if "memory_options" in package:
            memory = ", ".join([f"{m['size']} (${m['price_modifier']})" for m in package["memory_options"]])
            lines.append(f"Memory Options: {memory}")

        # Storage
        if "storage_options" in package:
            storage = ", ".join([f"{s['type']} (${s['price_modifier']})" for s in package["storage_options"]])
            lines.append(f"Storage Options: {storage}")

        # Features
        if "features" in package:
            features = ", ".join([f"{f['feature']} (${f['price_modifier']})" for f in package["features"]])
            lines.append(f"Features: {features}")

        text = "\n".join(lines)
        output.append({
            "id": str(id_counter),
            "contents": f"{title}\n{text}"
        })
        id_counter += 1

# Write to JSONL
with open("../data/cpq_knowledge_corpus.jsonl", "w") as f:
    for item in output:
        f.write(json.dumps(item) + "\n")
