In [1]:
import json

def process_dataset(input_file, output_file):
    """Drop sentences that carry no annotation from a JSON Lines dataset.

    Every non-blank line of ``input_file`` is parsed as one JSON object.
    Records whose ``ner_tags`` contain at least one value other than 0 are
    kept, their ``id`` is renumbered sequentially as a string starting at
    "0", and they are written one per line to ``output_file`` as compact
    UTF-8 JSON. A summary of the counts is printed.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped rather than handed to ``json.loads``, and they are not counted
    as sentences.

    The whole input is read and parsed before the output is opened, so
    ``output_file`` may be the same path as ``input_file``.

    Args:
        input_file: Path of the JSON Lines file to read.
        output_file: Path of the file to write; overwritten if it exists.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``ner_tags`` key.
    """
    total_sentences = 0
    processed_lines = []

    # Parse everything up front: nothing is written unless the whole input
    # was readable, and in-place processing (output == input) stays safe.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # An empty line is not a record; json.loads('') would raise.
            if not line.strip():
                continue
            total_sentences += 1

            data = json.loads(line)
            # Keep only sentences that have some annotation other than 0.
            if any(tag != 0 for tag in data['ner_tags']):
                # Number of records kept so far is the next sequential id.
                data['id'] = str(len(processed_lines))
                # Compact separators keep the on-disk layout of the original.
                processed_lines.append(
                    json.dumps(data, ensure_ascii=False, separators=(',', ':'))
                )

    remaining_sentences = len(processed_lines)
    sentences_deleted = total_sentences - remaining_sentences

    # Write the processed data to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(record + '\n' for record in processed_lines)

    print(f"Total sentences found: {total_sentences}")
    print(f"Total sentences with no annotation deleted: {sentences_deleted}")
    print(f"Final remaining sentences: {remaining_sentences}")
    print(f"Processing complete. Output written to {output_file}")

## Single File

In [3]:
# Single-file run: the filtered dataset overwrites its own source file.
input_file = output_file = r"C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json"
process_dataset(input_file=input_file, output_file=output_file)

Total sentences found: 551
Total sentences with no annotation deleted: 0
Final remaining sentences: 551
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json


## Multiple Files

In [4]:
import os

# Dataset files to filter; each one is rewritten in place.
input_files = [
    r"C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\dev - Only Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\train - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\train - Only Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\test - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\test - Only Per.json",
]

# Run the filter on every file, using the source path as the destination.
for input_file in input_files:
    output_file = input_file
    process_dataset(input_file=input_file, output_file=output_file)

Total sentences found: 551
Total sentences with no annotation deleted: 0
Final remaining sentences: 551
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json
Total sentences found: 400
Total sentences with no annotation deleted: 332
Final remaining sentences: 68
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\dev - Only Per.json
Total sentences found: 3960
Total sentences with no annotation deleted: 1519
Final remaining sentences: 2441
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\train - No Per.json
Total sentences found: 1865
Total sentences with no annotation deleted: 1461
Final remaining sentences: 404
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\train - Only Per.json
Total sentences found: 850
Total sentences with no annotation deleted: 306
Final remaining sentences: 544
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\test - N