In [1]:
# Load IPython's autoreload extension and enable mode 2 so that edits to
# imported .py modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install SDG
 - git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research
 - pip install -r requirements.txt
 - pip install -e .
 - pip install rich datasets tabulate transformers

In [2]:
# Standard
import os
import sys

# Third Party
from datasets import load_dataset
from openai import OpenAI
import click

# First Party
from instructlab.sdg.flow import Flow
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.sdg import SDG
from instructlab.sdg.utils.docprocessor import DocProcessor

# The local `utils` package lives one directory above this notebook, so the
# parent directory must be on sys.path before it can be imported.
sys.path.append('../')
from utils.data import postprocess_and_save, pretty_print_dict

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Create Seed Data

In [3]:
# Convert the source documents with the docling parser scripts. Both commands
# run inside the mamba environment named `docling` (the scripts import the
# `docling` package) and write their output into the input directory itself.
# NOTE(review): v1 and v2 look like alternatives — presumably only one of the
# two commands needs to be run; confirm.

### Using docling v1
data_dir = 'document_collection/ibm-annual-report'
!OMP_NUM_THREADS=32 mamba run -n docling python ../scripts/docparser.py --input-dir {data_dir} --output-dir {data_dir}

### Using docling v2
# NOTE(review): `--c` is presumably the config-file option of docparser_v2.py — confirm the flag name.
data_dir = 'document_collection/ibm-annual-report'
!OMP_NUM_THREADS=32 mamba run -n docling python ../scripts/docparser_v2.py --input-dir {data_dir} --output-dir {data_dir} --c docling_v2_config.yaml

Traceback (most recent call last):
  File "/workspace/home/lab/abhi/sdg_demo/SDG-Research/examples/../scripts/docparser.py", line 8, in <module>
    from docling.datamodel.base_models import ConversionStatus
ModuleNotFoundError: No module named 'docling'

ERROR conda.cli.main_run:execute(125): `conda run python ../scripts/docparser.py --input-dir document_collection/ibm-annual-report --output-dir document_collection/ibm-annual-report` failed. (See above for error)


In [None]:
# Directory that receives the artifacts written by this notebook. The trailing
# slash is kept because other cells interpolate this value verbatim.
output_dir = "sdg_demo_output/"
# Create it up front so the write below does not depend on it already existing.
os.makedirs(output_dir, exist_ok=True)

# Folder holding the source documents together with the qna.yaml file.
# NOTE(review): the parsing cell uses 'document_collection/ibm-annual-report'
# (without '../') — confirm which location is the intended one.
data_dir = '../document_collection/ibm-annual-report'
dp = DocProcessor(data_dir, user_config_path=f'{data_dir}/qna.yaml')

# Choose exactly ONE seed source. Running both loaders would only overwrite
# `seed_data` with the second result and would fail needlessly when the
# docling JSON files are absent.
#   "markdown"   -> build the seed data from the markdown file (default)
#   "docling_v1" -> build the seed data from docling v1 JSON
# Note: For now v2 json is not supported
seed_source = "markdown"

if seed_source == "markdown":
    seed_data = dp.get_processed_markdown_dataset([f"{data_dir}/ibm-annual-report-2024.md"])
elif seed_source == "docling_v1":
    seed_data = dp.get_processed_dataset()
else:
    raise ValueError(f"Unknown seed_source: {seed_source!r}")

# Persist the seed data as JSON lines and preview what was written.
seed_data_path = f'{output_dir}/seed_data.jsonl'
seed_data.to_json(seed_data_path, orient='records', lines=True)
pretty_print_dict(seed_data_path)


### Setup OpenAI Client for interacting with the model

In [None]:
# Connection settings for the OpenAI-compatible endpoint that serves the teacher model.
openai_api_key = "EMPTY"
endpoint = "http://localhost:8000/v1"
openai_api_base = endpoint

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# The first model reported by the endpoint is used as the teacher.
teacher_model = client.models.list().data[0].id
print(teacher_model)

### Run SDG
- This creates the knowledge flow from the provided YAML file
- We run it on a small dataset for demo purposes
- For large-scale generation, please use the Python command provided further below
- You can analyze the generated data to ensure its quality is similar to the provided QnA pairs

In [13]:
# Flow definition shipped with the repository, addressed relative to this
# notebook instead of through a machine-specific absolute path.
# Assumes the notebook is run from its own directory (one level below the repo root) — TODO confirm.
knowledge_agentic_pipeline = "../src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml"

# Build the flow configuration from the YAML file using the OpenAI client.
flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)

# Wrap the flow in a single pipeline and hand it to the SDG runner.
sdg = SDG(
    [Pipeline(flow_cfg)],
    num_workers=1,
    batch_size=1,
    save_freq=1000,
)

In [5]:
# Number of seed records used for the demo run.
number_of_samples = 5

# Load the full seed set under its own name so re-running this cell always
# starts from the complete data.
seed_ds = load_dataset('json', data_files=f'{output_dir}/seed_data.jsonl', split='train')

# Shuffle with a fixed seed for a reproducible sample, and never request more
# rows than exist — selecting past the end of the dataset would raise.
sample_size = min(number_of_samples, len(seed_ds))
ds = seed_ds.shuffle(seed=42).select(range(sample_size))

In [None]:
# Run the SDG pipeline on the sampled seed data.
# Checkpoint directory is used to save the intermediate datasets
# NOTE(review): the result is only kept in `generated_data` here; nothing in
# this cell writes `{output_dir}/gen.jsonl`, which the final cell reads —
# presumably that file comes from the CLI command; confirm.
generated_data = sdg.generate(ds, checkpoint_dir="Tmp")

### Run SDG through python command (For large scale generation)

```bash
python /home/lab/sdg/scripts/generate.py --ds_path {output_dir}/seed_data.jsonl --bs 8 --num_workers 8 --save_path {output_dir}/gen.jsonl --flow ../src/instructlab/sdg/flows/generation/knowledge/synth_knowledge1.5.yaml --endpoint {teacher_endpoint_url} --checkpoint_dir {output_dir}/data_checkpoints --save_freq 2
```

### Save the generated data into training format

In [None]:
# System prompt for LAB that is passed along to the post-processing step.
system_prompt_lab = "I am a LAB Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.1-8b-base model. My primary role is to serve as a chat assistant."

# Path to pre-computed skills (placeholder — replace with the real location).
precomputed_skills_path = "<LAB precomputed skills path>"

# Post-process the generated data and save it under the output directory.
postprocess_and_save(
    f"{output_dir}/gen.jsonl",
    dataset_save_path=f'{output_dir}',
    precomputed_skills_path=precomputed_skills_path,
    sys_prompt=system_prompt_lab,
)