In [1]:
# Load the autoreload extension and reload modules before each cell runs,
# so edits to the editable-installed SDG source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install SDG
```bash
git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research
pip install -e .
pip install rich datasets
```

In [None]:
from datasets import load_dataset, Dataset
from openai import OpenAI

from instructlab.sdg.flow import Flow
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.sdg import SDG
from instructlab.registry import PromptRegistry

### Host and Connect to llama3.3 teacher model
- Host your teacher model using vLLM

    ```bash
    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m vllm.entrypoints.openai.api_server \
        --model meta-llama/Llama-3.3-70B-Instruct \
        --dtype float16 \
        --tensor-parallel-size 8 
    ```

- Use OpenAI API to connect to the model

In [None]:
# The teacher model is served by vLLM's OpenAI-compatible API server,
# so a plain OpenAI client pointed at that endpoint is all that is needed.
endpoint = "http://localhost:8000/v1"
openai_api_key = "EMPTY"  # placeholder key for the locally hosted server
openai_api_base = endpoint

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Use the first model reported by the server as the teacher model.
teacher_model = client.models.list().data[0].id
print(teacher_model)

### Register llama3.3 model prompt in prompt registry

In [None]:
# The prompt registry needs the chat template of the teacher model. The
# template is available on the model's tokenizer, so load the tokenizer and
# register its template under the model name.
# To add it permanently, put the new prompt template in
# src/instructlab/sdg/prompts.py and add the model name in
# src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5_llama3.3.yaml
from transformers import AutoTokenizer

LLAMA_3_3_MODEL_ID = "meta-llama/Llama-3.3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(LLAMA_3_3_MODEL_ID)


@PromptRegistry.register(LLAMA_3_3_MODEL_ID)
def llama_3_3_70b_chat_template():
    """Return the chat template carried by the Llama 3.3 70B Instruct tokenizer."""
    return tokenizer.chat_template

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [4]:
# Load the llama3.3 knowledge flow, bound to the teacher-model client
flow_cfg = Flow(client).get_flow_from_file("synth_knowledge1.5_llama3.3.yaml")

# Wrap the flow in a single pipeline; one worker and batch size 1 for this demo
llama_pipelines = [Pipeline(flow_cfg)]
sdg = SDG(llama_pipelines, num_workers=1, batch_size=1, save_freq=1000)

In [27]:
# Replace with the path to your seed data (JSON / JSON Lines file)
seed_data_path = "Your seed data path"

# Load the seed data and keep only the first record for this walkthrough
ds = load_dataset("json", data_files=seed_data_path, split="train").select(range(1))

In [None]:
# Run generation with the llama3.3 teacher; intermediate datasets are
# saved in the checkpoint directory
generated_data = sdg.generate(
    ds,
    checkpoint_dir="Tmp",
)

### Host and connect to Mixtral model

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m vllm.entrypoints.openai.api_server \
    --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
    --dtype float16 \
    --tensor-parallel-size 8 
```

In [None]:
# The vLLM server for this model is running on 10.7.0.15
mistral_client = OpenAI(
    api_key="EMPTY",  # placeholder key for the self-hosted server
    base_url="http://10.7.0.15:8000/v1",
)

# Use the first model reported by that server as the teacher model.
mistral_client_teacher_model = mistral_client.models.list().data[0].id
print(mistral_client_teacher_model)

In [21]:
# Load the synth_knowledge1.5 flow, bound to the mistral client
flow_cfg_mistral = Flow(mistral_client).get_flow_from_file(
    "../../src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml"
)

# Same SDG settings as the llama3.3 run: single pipeline, one worker, batch size 1
mistral_pipelines = [Pipeline(flow_cfg_mistral)]
sdg_mistral = SDG(mistral_pipelines, num_workers=1, batch_size=1, save_freq=1000)

In [None]:
# Use a checkpoint directory of its own: the llama3.3 run already saves its
# intermediate datasets in "Tmp", and sharing that directory would mix the
# two runs' checkpoints.
generated_data_mistral = sdg_mistral.generate(ds, checkpoint_dir="Tmp_mistral")

### Compare the generated data from both models

In [None]:
# Dump up to k question/response pairs from both models, side by side,
# into a markdown file for manual comparison.
k = 5  # Maximum number of examples to dump
output_file = "model_comparison.md"

# Never index past the end of the shorter result set, and honour the k limit.
num_examples = min(k, len(generated_data), len(generated_data_mistral))

# Explicit encoding: generated text may contain non-ASCII characters.
with open(output_file, "w", encoding="utf-8") as f:
    if len(generated_data) > 0:
        # Trailing newlines keep the first example header from being glued
        # onto the last line of the document.
        f.write(f"### Document \n{generated_data[0]['document']}\n\n")
    for i in range(num_examples):
        llama_row = generated_data[i]
        mistral_row = generated_data_mistral[i]
        f.write("Example #{}\n".format(i + 1))
        f.write("### Result from llama3.3\n")
        f.write(llama_row['question'] + "\n")
        f.write("*******************************\n")
        f.write(llama_row['response'] + "\n")
        f.write("=================================\n")
        f.write("### Result from mistral\n")
        f.write(mistral_row['question'] + "\n")
        f.write("*******************************\n")
        f.write(mistral_row['response'] + "\n")
        f.write("\n\n")

# Report how many examples were actually written, which may be fewer than k.
print(f"Wrote {num_examples} examples to {output_file}")

### For reference

### How to run the final generation
```bash
python scripts/generate.py --ds_path /new_data/knowledge/BMO/documents/seed_data.jsonl \
    --bs 2 --num_workers 10 \
    --save_path <your_save_path> \
    --flow /home/lab/abhi/SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml \
    --checkpoint_dir <your_checkpoint_dir> \
    --endpoint <your_endpoint>
```

* For llama3.3, change the `--flow` argument to `/home/lab/abhi/SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5_llama3.3.yaml`