In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up automatically, without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install SDG
 - `git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research`
 - `pip install -r requirements.txt`
 - `pip install -e .`
 - `pip install rich datasets tabulate`

In [5]:
# Standard
import os

# Third Party
from datasets import load_dataset
from openai import OpenAI
import click

# First Party
from instructlab.sdg.flow import Flow
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.sdg import SDG
from instructlab.sdg.utils.docprocessor import DocProcessor
from utils.data import postprocess_and_save, pretty_print_dict

### Create Seed Data

In [None]:
!OMP_NUM_THREADS=32 mamba run -n docling python /workspace/home/lab/abhi/sdg_demo/SDG-Research/scripts/docparser.py --input-dir {data_dir} --output-dir {data_dir}

In [None]:
# Directory where the notebook writes its outputs.
output_dir = "sdg_demo_output/"
# Create it up front: writing the seed file fails if the directory is missing.
os.makedirs(output_dir, exist_ok=True)
# This is where your PDFs are stored
data_dir = 'document_collection/RBC'
# It also holds your QnA YAML file (qna.yaml)
dp = DocProcessor(data_dir, user_config_path=f'{data_dir}/qna.yaml')
seed_data = dp.get_processed_dataset()
# Build the seed-file path once. os.path.join avoids the double slash that
# f'{output_dir}/...' would produce with the trailing "/" in output_dir.
seed_data_path = os.path.join(output_dir, 'seed_data.jsonl')
seed_data.to_json(seed_data_path, orient='records', lines=True)
pretty_print_dict(seed_data_path)

### Set up the OpenAI client for interacting with the model

In [None]:
# Base URL of the OpenAI-compatible server that hosts the teacher model.
endpoint = "http://localhost:8000/v1"
openai_api_base = endpoint
# The key sent to the server is the literal string "EMPTY".
openai_api_key = "EMPTY"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Use the first model reported by the server as the teacher model.
available_models = client.models.list().data
teacher_model = available_models[0].id
print(teacher_model)

In [10]:
# YAML flow definition used for knowledge generation.
knowledge_agentic_pipeline = "/workspace/home/lab/abhi/sdg_demo/SDG-Research/src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml"
# Load the flow configuration, binding it to the OpenAI client.
flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)

# A single-pipeline SDG run with one worker and a batch size of one.
pipelines = [Pipeline(flow_cfg)]
sdg = SDG(pipelines, num_workers=1, batch_size=1, save_freq=1000)

In [None]:
# Number of seed rows to run through the pipeline in this demo.
number_of_samples = 5
seed_ds = load_dataset('json', data_files=f'{output_dir}/seed_data.jsonl', split='train')
# Shuffle with a fixed seed so the same rows are picked each run, and cap the
# count at the dataset size so a small seed set does not make `select` raise.
sample_count = min(number_of_samples, len(seed_ds))
ds = seed_ds.shuffle(seed=42).select(range(sample_count))

In [None]:
# Intermediate datasets are saved under this checkpoint directory.
checkpoint_dir = "Tmp"
generated_data = sdg.generate(ds, checkpoint_dir=checkpoint_dir)

### Run SDG from the command line (for large-scale generation)

```bash
python /home/lab/sdg/scripts/generate.py --ds_path {output_dir}/seed_data.jsonl --bs 8 --num_workers 8 --save_path {output_dir}/gen.jsonl --flow SynthKnowledgeFlow1.5 --endpoint {teacher_endpoint_url} --checkpoint_dir {output_dir}/data_checkpoints --save_freq 2
```

### Save the generated data into training format

In [None]:
# Placeholders: replace both values before running this cell.
system_prompt = "<Newest RHELAI system prompt>"
# NOTE(review): this path is assigned but not passed to postprocess_and_save
# in this cell - confirm whether it should be.
precomputed_skills_path = "<RHELAI precomputed skills path>"

# Convert the generated samples into the training format and save them.
postprocess_and_save(
    f"{output_dir}/gen.jsonl",
    dataset_save_path=f'{output_dir}',
    sys_prompt=system_prompt,
)