In [45]:
def split_conll(conll_file: str, output_dir: str, ratio: Tuple[float, float, float] = (0.8, 0.1, 0.1)):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Check if the sum of ratios is valid
    if sum(ratio) > 1.0:
        raise ValueError("The sum of the split ratios exceeds 1.0. Please provide valid ratios.")

    with open(conll_file, 'r', encoding='utf-8') as f:
        # Read the entire conll file and split by blank lines (indicating a new sentence)
        data = f.read().strip().split('\n\n')

    # Check and retain the first line (-DOCSTART- line) in all sets
    doc_start = data[0] if data[0].startswith("-DOCSTART-") else None
    if doc_start:
        data = data[1:]  # Remove -DOCSTART- for further splitting

    # Shuffle the data to randomize the split
    random.shuffle(data)

    # Calculate split indices
    total_length = len(data)
    train_end_idx = int(total_length * ratio[0])
    dev_end_idx = train_end_idx + int(total_length * ratio[1])

    # Split data based on the calculated indices and ratio values
    train_data = data[:train_end_idx] if ratio[0] > 0 else []
    dev_data = data[train_end_idx:dev_end_idx] if ratio[1] > 0 else []
    test_data = data[dev_end_idx:] if ratio[2] > 0 else []

    # Add back -DOCSTART- at the start of each split if it exists
    if doc_start:
        if train_data:
            train_data.insert(0, doc_start)
        if dev_data:
            dev_data.insert(0, doc_start)
        if test_data:
            test_data.insert(0, doc_start)

    # Write to output files based on presence of data
    files_created = []
    if train_data:
        write_to_file(os.path.join(output_dir, 'train.conll'), train_data)
        files_created.append(('train.conll', len(train_data)))
    if dev_data:
        write_to_file(os.path.join(output_dir, 'dev.conll'), dev_data)
        files_created.append(('dev.conll', len(dev_data)))
    if test_data:
        write_to_file(os.path.join(output_dir, 'test.conll'), test_data)
        files_created.append(('test.conll', len(test_data)))

    print("Split complete. Files created:")
    for file, count in files_created:
        print(f"- {file}: {count} sentences")
    
    if len(files_created) < 3:
        print("\nNote: Not all splits were created due to the provided ratios.")
        if ratio[2] == 0:
            print("The test split was not created because its ratio was set to 0.")

In [46]:
conll_file = r"c:\Users\Sakib Ahmed\Desktop\dev_combined.conll"
output_dir = r"c:\Users\Sakib Ahmed\Desktop"
split_conll(conll_file, output_dir, ratio=(0.6, 0.2, .2))  # Only training and validation, no test

Split complete. Files created:
- train.conll: 109 sentences
- dev.conll: 37 sentences
- test.conll: 37 sentences


In [47]:
conll_file = r"c:\Users\Sakib Ahmed\Desktop\dev_combined.conll"
output_dir = r"c:\Users\Sakib Ahmed\Desktop"
split_conll(conll_file, output_dir, ratio=(0.8, 0.2, .0))  # Only training and validation, no test

Split complete. Files created:
- train.conll: 145 sentences
- dev.conll: 37 sentences

Note: Not all splits were created due to the provided ratios.
The test split was not created because its ratio was set to 0.
