In [1]:
from rich import print
import os
from openai import OpenAI
from datasets import Dataset, DatasetDict, load_dataset
import json
from dotenv import load_dotenv
import time
from tenacity import retry, stop_after_attempt, wait_exponential

# Load environment variables from the .env file
load_dotenv() 

  from .autonotebook import tqdm as notebook_tqdm


True

In [35]:
# Generation settings: the subject to cover, how many subtopics to request,
# and how many instructions to request for each subtopic.
topic = "Docker"
n_subtopics = 5
n_instructions = 100


# OpenAI-compatible client pointed at the NVIDIA endpoint below. The API key is
# read from the NVIDIA_API_KEY environment variable; os.environ[...] raises
# KeyError if that variable is not set.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ["NVIDIA_API_KEY"]
)

# 1. Subtopics Generation

In [36]:
# Prompt asking the model for {n_subtopics} comma-separated subtopics of {topic};
# filled in by generate_subtopics via str.format.
TOPIC_GENERATION_PROMPT_TEMPLATE = """\
I want to create a dataset of Docker commands. Based on this, generate a list of {n_subtopics} subtopics to cover what needs to be covered when working with Docker.

The topic is: {topic}

The list must be without numbers, and without any description of the subtopics. The subtopics should be separated by a comma. There must be no other text than the list.
"""

def generate_subtopics(
    client,
    topic,
    n_subtopics,
    *,
    model="meta/llama-3.1-405b-instruct",
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024,
):
    """Ask the chat model for a list of subtopics of ``topic``.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the notebook
            passes an ``OpenAI`` client).
        topic: Subject inserted into the prompt.
        n_subtopics: Number of subtopics requested in the prompt.
        model: Model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The unmodified object returned by ``client.chat.completions.create``;
        callers read the text from ``.choices[0].message.content``.
    """
    prompt = TOPIC_GENERATION_PROMPT_TEMPLATE.format(topic=topic, n_subtopics=n_subtopics)
    # The whole prompt is sent as a single user turn.
    return client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

# Request the subtopic list and show the raw text of the first choice
# (the prompt asks for a comma-separated list).
response = generate_subtopics(client, topic=topic, n_subtopics=n_subtopics)
print(response.choices[0].message.content)

# 2. Questions Generation

In [37]:
# Prompt asking the model for {n_instructions} newline-separated instructions
# about a single {sub_topic}; filled in by generate_questions via str.format.
QUESTION_PROMPT_TEMPLATE = """\
To create a dataset of Docker commands. Given a topic in Docker, generate {n_instructions} instructions that could be asked about that topic. Your response should be in a list format.

The topic is: {sub_topic}

The list must be without numbers. The questions/instructions should be separated by a newline character. There must be no other text than the list.
"""
# Split the model's comma-separated reply into clean subtopic names: surrounding
# whitespace/newlines are stripped and empty fragments (e.g. from a trailing
# comma) are dropped so they never end up in a question prompt.
subtopic_list = [
    name.strip()
    for name in response.choices[0].message.content.split(",")
    if name.strip()
]
def generate_questions(
    client,
    sub_topic,
    n_instructions,
    *,
    model="meta/llama-3.1-405b-instruct",
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024,
):
    """Ask the chat model for instructions about one subtopic.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the notebook
            passes an ``OpenAI`` client).
        sub_topic: Subtopic inserted into QUESTION_PROMPT_TEMPLATE.
        n_instructions: Number of instructions requested in the prompt.
        model: Model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
            NOTE(review): a large ``n_instructions`` may not fit in the default
            limit, which would cut the list short - confirm and raise if needed.

    Returns:
        The text content of the first choice (the prompt asks for one
        instruction per line).
    """
    prompt = QUESTION_PROMPT_TEMPLATE.format(sub_topic=sub_topic, n_instructions=n_instructions)
    # The whole prompt is sent as a single user turn.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

def instruction_generator(client, subtopic_list, n_instructions):
    """Generate one raw instruction text per subtopic.

    Calls generate_questions once for every entry of ``subtopic_list``, in
    order, and returns the results as a list of the same length.
    """
    generated_sets = []
    for current_subtopic in subtopic_list:
        generated_sets.append(generate_questions(client, current_subtopic, n_instructions))
    return generated_sets

instruction_list = instruction_generator(client, subtopic_list, n_instructions)

# Flatten the per-subtopic texts into one list with one instruction per entry.
# Lines are filtered on their stripped value so whitespace-only lines do not
# turn into empty-string instructions.
instruction_list_formatted = [
    line.strip()
    for instruction_set in instruction_list
    for line in instruction_set.split("\n")
    if line.strip()
]
len(instruction_list_formatted)

482

# 3. Response Generation

In [38]:
# Prompt asking the model to answer a single {instruction}; filled in by
# generate_response via str.format.
RESPONSE_PROMPT_TEMPLATE = """\
Given a question/instruction about Docker, generate response that could be given.

The user prompt is: {instruction}

"""
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def generate_response(
    client,
    instruction,
    *,
    model="meta/llama-3.1-405b-instruct",
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024,
):
    """Generate the model's answer to one instruction, retrying on failure.

    The tenacity decorator re-invokes this function when it raises: at most 5
    attempts, with an exponential wait bounded between 4 and 60 seconds.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the notebook
            passes an ``OpenAI`` client).
        instruction: Text inserted into RESPONSE_PROMPT_TEMPLATE.
        model: Model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The text content of the first choice. The text is also printed.

    Raises:
        Whatever the API call raises is re-raised unchanged so the retry
        decorator can react to it.
    """
    prompt = RESPONSE_PROMPT_TEMPLATE.format(instruction=instruction)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
    except Exception as e:
        # Heuristic: treat any error whose text mentions 429 as a rate limit.
        if "429" in str(e):
            print("Rate limit hit, waiting before retry...")
            time.sleep(5)  # Extra delay on top of the decorator's backoff
        # Bare raise keeps the original traceback for the retry decorator.
        raise
    else:
        content = response.choices[0].message.content
        print(content)
        return content

def response_generator(client, instruction_list):
    """Generate a response for every instruction, sequentially and in order.

    Returns a list with one generate_response result per entry of
    ``instruction_list``.
    """
    collected = []
    for current_instruction in instruction_list:
        collected.append(generate_response(client, current_instruction))
    return collected

instruction_response_list = response_generator(client, instruction_list_formatted)

# Pair each instruction with the response generated for it (same position in
# both lists). No error handling is needed here: building the dicts performs no
# API request, and rate-limit retries are handled inside generate_response.
instruction_response_pair_list = [
    {"instruction": instruction_text, "response": response_text}
    for instruction_text, response_text in zip(instruction_list_formatted, instruction_response_list)
]

In [39]:
# Persist the instruction/response pairs as JSON Lines: one JSON object per line.
with open('synthetic_data.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(pair) + '\n' for pair in instruction_response_pair_list
    )

In [40]:
# Smoke test of the reward model on one hand-written question/answer pair:
# the user turn is the question and the assistant turn is the answer to rate.
messages = [
    {
        "role": "user",
        "content": "What is the chemical formula for water?"
    },
    {
        "role": "assistant",
        "content": "The chemical formula for water is H₂O."
    },
]

# The reward model is called through the same chat-completions client.
response = client.chat.completions.create(
    model="nvidia/nemotron-4-340b-reward",
    messages=messages,
)
# Show the logprobs content of the first choice; get_scores_from_response reads
# its per-entry token/logprob pairs as the scores.
print(response.choices[0].logprobs.content)

In [41]:
def get_scores_from_response(openai_response_template):
    """Build a ``{token: logprob}`` dict from a completion's first choice.

    Reads ``choices[0].logprobs.content`` and maps every entry's ``token`` to
    its ``logprob``. If a token appears more than once, the last entry wins.
    """
    entries = openai_response_template.choices[0].logprobs.content
    return {entry.token: entry.logprob for entry in entries}

# Show the token -> logprob mapping extracted from the reward-model reply.
print(get_scores_from_response(response))

In [42]:
def get_response_and_scores(client, model, question, response_content):
    """Score one question/answer pair with the given model.

    Sends ``question`` as the user turn and ``response_content`` as the
    assistant turn to ``client.chat.completions.create`` and returns the dict
    that get_scores_from_response extracts from the reply.
    """
    conversation = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": response_content},
    ]
    completion = client.chat.completions.create(model=model, messages=conversation)
    return get_scores_from_response(completion)

In [43]:
def parse_synthetic_data(file_path):
    """Load instruction/response pairs from a JSON Lines file.

    Args:
        file_path: Path of a file holding one JSON object per line.

    Returns:
        A list of ``{"instruction": ..., "response": ...}`` dicts in file
        order; any other keys in a line are dropped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a line lacks the "instruction" or "response" key.
    """
    data = []
    # JSON text is read as UTF-8 explicitly instead of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline added by an editor) carry
            # no record and would otherwise make json.loads fail.
            if not line.strip():
                continue
            json_entry = json.loads(line)
            data.append({
                "instruction": json_entry["instruction"],
                "response": json_entry["response"]
            })
    return data

# Reload the generated pairs from disk and show how many were read.
synthetic_data = parse_synthetic_data("synthetic_data.jsonl")
len(synthetic_data)

482

In [44]:
def process_instruction_response_pairs(client, model, synthetic_data):
    """Score every instruction/response pair, sequentially and in order.

    Each element of ``synthetic_data`` must provide the "instruction" and
    "response" keys. Returns one get_response_and_scores result per element.
    """
    return [
        get_response_and_scores(client, model, pair["instruction"], pair['response'])
        for pair in synthetic_data
    ]


scores = process_instruction_response_pairs(client, "nvidia/nemotron-4-340b-reward", synthetic_data)

# Cut-offs applied to the reward-model scores below.
helpfulness_THRESHOLD = 3
verbosity_THRESHOLD = 2.5

# Keep a pair unless its helpfulness score is below the helpfulness cut-off or
# its verbosity score is above the verbosity cut-off. Scores and pairs line up
# by position because the scores were computed from this same list.
synthetic_data = [
    pair
    for pair, pair_scores in zip(synthetic_data, scores)
    if not (pair_scores["helpfulness"] < helpfulness_THRESHOLD
            or pair_scores["verbosity"] > verbosity_THRESHOLD)
]
len(synthetic_data)

339

In [45]:
# Persist the pairs that passed the score filter as JSON Lines.
with open('synthetic_data_filtered.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(pair) + '\n' for pair in synthetic_data)

In [4]:
# Reload the filtered pairs from disk, one JSON object per line.
data = []
with open('synthetic_data_filtered.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# Wrap the records in a single "train" split and upload it to the Hub.
dataset = Dataset.from_list(data)
dataset_dict = DatasetDict({"train": dataset})
dataset_dict.push_to_hub("Shafagh99/docker-prompt")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 200.06ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Shafagh99/docker-prompt/commit/0802d58693ec6d0d8a4bbdeb57410f8bbffe0d7a', commit_message='Upload dataset', commit_description='', oid='0802d58693ec6d0d8a4bbdeb57410f8bbffe0d7a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Shafagh99/docker-prompt', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Shafagh99/docker-prompt'), pr_revision=None, pr_num=None)