
# Prepare Combined Train, Dev, Test Files for MTL-Bioinformatics-2016 - Batch Processing

This notebook scans all dataset folders inside `MTL-Bioinformatics-2016/data`, reads train/dev/test splits, and groups the datasets into **batches of 5 datasets each**.

For each batch, it creates the following files, where `X` is the batch number (starting at 1):
- `combined_train_batch_X.json`
- `combined_dev_batch_X.json`
- `combined_test_batch_X.json`

This approach helps manage memory and allows training in smaller steps.


In [None]:

!pip install datasets


In [None]:

import os
import json

# Path to the dataset directory (relative to the current working directory).
# Every sub-folder found directly inside it is treated as one dataset.
DATASET_DIR = "MTL-Bioinformatics-2016/data"

# Number of datasets per batch; the last batch may hold fewer when the total
# number of datasets is not a multiple of this value.
DATASETS_PER_BATCH = 5

def load_ner_data(folder):
    """Load NER data from the train, dev, and test splits of one dataset folder.

    Each split is read from a whitespace-separated, CoNLL-style file: one
    token per line, with the token in the first column and the tag in the
    last column, and a blank line between sentences.

    For every split, ``<split>.txt`` is tried first and ``<split>.tsv`` second;
    the dev split additionally accepts the ``devel`` base name.  A split for
    which no file is found is left empty.

    Lines that carry fewer than two fields have no tag and are skipped rather
    than aborting the whole load.

    Args:
        folder: Path of the dataset directory that contains the split files.

    Returns:
        A dict with the keys ``'train'``, ``'dev'`` and ``'test'``.  Each value
        is a list of sentences, and each sentence is a dict
        ``{'tokens': [...], 'tags': [...]}`` holding two equally long lists.
    """
    # Accepted base names per split, in lookup order.
    split_basenames = {
        'train': ('train',),
        'dev': ('dev', 'devel'),
        'test': ('test',),
    }
    extensions = ('.txt', '.tsv')

    split_data = {split: [] for split in split_basenames}
    for split, basenames in split_basenames.items():
        candidates = (
            os.path.join(folder, f'{base}{ext}')
            for base in basenames
            for ext in extensions
        )
        file_path = next((path for path in candidates if os.path.isfile(path)), None)
        if file_path is None:
            continue

        sentences = split_data[split]
        with open(file_path, 'r', encoding='utf-8') as file:
            tokens, tags = [], []
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank line: close the current sentence, if any.
                    if tokens:
                        sentences.append({'tokens': tokens, 'tags': tags})
                        tokens, tags = [], []
                    continue
                if len(fields) < 2:
                    # No tag column on this line; skip it instead of crashing.
                    continue
                # Extra middle columns (e.g. POS tags) are ignored.
                tokens.append(fields[0])
                tags.append(fields[-1])
            # The file may end without a trailing blank line.
            if tokens:
                sentences.append({'tokens': tokens, 'tags': tags})
    return split_data


In [None]:

# Every sub-directory of DATASET_DIR counts as one dataset; sort the names so
# the batch composition is the same on every run.
all_datasets = sorted(
    filter(
        lambda name: os.path.isdir(os.path.join(DATASET_DIR, name)),
        os.listdir(DATASET_DIR),
    )
)

# Slice the sorted names into consecutive groups of DATASETS_PER_BATCH
batch_starts = range(0, len(all_datasets), DATASETS_PER_BATCH)
batches = [all_datasets[start:start + DATASETS_PER_BATCH] for start in batch_starts]

# Process each batch and combine train/dev/test data
def combine_and_save_batch(batch_number, datasets):
    """Merge the splits of several datasets and write one JSON file per split.

    Args:
        batch_number: Number used in the output file names.
        datasets: Names of the dataset folders (inside ``DATASET_DIR``) that
            make up this batch.

    The files ``combined_<split>_batch_<batch_number>.json`` are written to
    the current working directory, and one summary line per file is printed.
    """
    split_names = ('train', 'dev', 'test')
    merged = {name: [] for name in split_names}

    # Append each dataset's examples to the matching split, in dataset order.
    for dataset_name in datasets:
        loaded = load_ner_data(os.path.join(DATASET_DIR, dataset_name))
        for name in split_names:
            merged[name] += loaded[name]

    # Write one JSON file per split for this batch.
    for name, examples in merged.items():
        output_file = f'combined_{name}_batch_{batch_number}.json'
        with open(output_file, 'w', encoding='utf-8') as handle:
            json.dump(examples, handle, indent=2)
        print(f"Saved {output_file} with {len(examples)} examples")

# Walk through the batches in order; batch numbers start at 1
for batch_index, dataset_batch in enumerate(batches):
    batch_number = batch_index + 1
    print(f"Processing Batch {batch_number}: {dataset_batch}")
    combine_and_save_batch(batch_number, dataset_batch)

print("✅ All batches processed and saved.")
