# Preparing Documents for Data Generation
- This notebook shows you how to parse documents
- Chunk the parsed documents
- And finally mix the chunks with the user-provided QNA (`qna.yaml`) to create seed examples

### Install SDG

```bash 
pip install sdg-hub==0.1.0a2
pip install rich datasets tabulate transformers
```

### Create Seed Examples

In [None]:
import os
from glob import glob

import yaml
from datasets import Dataset
from sdg_hub.utils.chunking import chunk_document

# Directory the generated seed data is written to.
output_dir = "sdg_demo_output/"
# Directory holding the source documents and the qna.yaml file.
# NOTE: only Markdown (*.md) files are picked up from here, so PDFs must
# already have been converted to Markdown.
data_dir = "wikipedia/"

# Load the QNA yaml file. It supplies the document outline, the domain and
# the seed examples used further down in this cell.
# Read it as UTF-8 explicitly, the same way the Markdown documents are read.
with open(os.path.join(data_dir, "qna.yaml"), "r", encoding="utf-8") as f:
    qna = yaml.safe_load(f)

# Passed to chunk_document as chunk_word_count / server_ctx_size.
chunk_size = 5000
max_model_context_length = 8000
# Accumulates one dict per document chunk.
all_ds = []
# glob returns files in arbitrary order; sort so the generated seed data is
# reproducible from run to run.
list_md_files = sorted(glob(os.path.join(data_dir, "*.md")))

# Split every Markdown document into chunks and tag each chunk with the
# document-level metadata (outline and domain) taken from the QNA file.
for md_file in list_md_files:
    print(f"Reading contents of file {md_file}")
    # Only the read needs the file handle; release it before chunking.
    with open(md_file, "r", encoding="utf-8") as f:
        text = f.read()
    chunks_mds = [
        {
            "document": e,
            "document_outline": qna["document_outline"],
            "domain": qna["domain"],
        }
        for e in chunk_document(
            text,
            server_ctx_size=max_model_context_length,
            chunk_word_count=chunk_size,
        )
    ]
    all_ds.extend(chunks_mds)

# Number of question/answer pairs copied from each seed example into the
# icl_query_N / icl_response_N columns.
num_icl_pairs = 3

# Combine every document chunk with every seed example: each output row is a
# chunk dict extended with that seed example's ICL fields.
all_ds_with_icls = []
for icl_idx, icl in enumerate(qna['seed_examples']):
    icl_dict = {'icl_document': icl['context']}

    qa_pairs = icl['questions_and_answers']
    if len(qa_pairs) < num_icl_pairs:
        # Fail with context instead of a bare "list index out of range".
        raise IndexError(
            f"seed example {icl_idx} provides {len(qa_pairs)} "
            f"question/answer pair(s) but {num_icl_pairs} are required"
        )

    # Only the first num_icl_pairs pairs are used; any extras are ignored.
    for pair_no, qa in enumerate(qa_pairs[:num_icl_pairs], start=1):
        icl_dict[f'icl_query_{pair_no}'] = qa['question']
        icl_dict[f'icl_response_{pair_no}'] = qa['answer']

    all_ds_with_icls.extend([{**e, **icl_dict} for e in all_ds])

# Build a Dataset from the combined chunk / seed-example rows.
seed_data = Dataset.from_list(all_ds_with_icls)

# Make sure the output directory exists before writing into it.
os.makedirs(output_dir, exist_ok=True)
# Write the seed data as JSON Lines (one record per line).
seed_data.to_json(os.path.join(output_dir, 'seed_data.jsonl'), orient='records', lines=True)
