# Import required libraries

In [5]:
import json
import os
from collections import Counter

In [1]:
def process_dataset(input_file, output_file):
    """Drop sentences that carry no annotation and renumber the remaining ones.

    The input is read as JSON Lines (one JSON object per line). A record is
    kept only if its ``ner_tags`` list contains at least one value other than
    0. Each kept record gets a new ``id``: a consecutive counter starting at
    0, stored as a string. Kept records are written to ``output_file`` as
    compact JSON, one per line, and a short summary is printed.

    Args:
        input_file: Path of the JSON Lines file to read.
        output_file: Path of the file to write. May be the same path as
            ``input_file``; the input is fully read before anything is
            written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``ner_tags`` key.
        OSError: If a file cannot be read, written or replaced.
    """
    total_sentences = 0
    processed_lines = []

    # Stream the input instead of materialising every raw line first.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines are not records: skip them rather
            # than letting json.loads fail or counting them as sentences.
            if not line.strip():
                continue
            total_sentences += 1
            data = json.loads(line)
            # Check if the sentence has any annotation other than 0
            if any(tag != 0 for tag in data['ner_tags']):
                # The next id is simply the number of records kept so far.
                data['id'] = str(len(processed_lines))
                # Compact separators keep the on-disk JSON layout unchanged.
                processed_lines.append(
                    json.dumps(data, ensure_ascii=False, separators=(',', ':'))
                )

    remaining_sentences = len(processed_lines)
    sentences_deleted = total_sentences - remaining_sentences

    # Write to a sibling temporary file and swap it into place afterwards, so
    # a failure part-way through the write cannot leave output_file truncated
    # (which matters when output_file is the input file itself).
    tmp_file = f"{output_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in processed_lines)
        os.replace(tmp_file, output_file)
    except BaseException:
        # Do not leave a stray temporary file behind on failure.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        raise

    print(f"Total sentences found: {total_sentences}")
    print(f"Total sentences with no annotation deleted: {sentences_deleted}")
    print(f"Final remaining sentences: {remaining_sentences}")
    print(f"Processing complete. Output written to {output_file}")

## Single File

In [3]:
# Usage: filter a single dataset file.
# NOTE: output_file is set to the same path as input_file, so the original
# file is overwritten with the filtered, renumbered records.
input_file = r"C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json"
output_file = input_file
process_dataset(input_file, output_file)

Total sentences found: 551
Total sentences with no annotation deleted: 0
Final remaining sentences: 551
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json


## Multiple Files

In [4]:
# Define the list of input files (dev / train / test splits, each in a
# "No Per" and an "Only Per" variant)
input_files = [
    r"C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\dev - Only Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\train - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\train - Only Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\test - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\test - Only Per.json",
]

# Iterate over each file and process it.
# NOTE: every file is overwritten in place, because the output path is set
# to the input path on each iteration.
for input_file in input_files:
    output_file = input_file  # Output file is the same as input file
    process_dataset(input_file, output_file)

Total sentences found: 551
Total sentences with no annotation deleted: 0
Final remaining sentences: 551
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json
Total sentences found: 400
Total sentences with no annotation deleted: 332
Final remaining sentences: 68
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\dev - Only Per.json
Total sentences found: 3960
Total sentences with no annotation deleted: 1519
Final remaining sentences: 2441
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\train - No Per.json
Total sentences found: 1865
Total sentences with no annotation deleted: 1461
Final remaining sentences: 404
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\train - Only Per.json
Total sentences found: 850
Total sentences with no annotation deleted: 306
Final remaining sentences: 544
Processing complete. Output written to C:\Users\Sakib Ahmed\Desktop\samples\test - N

In [6]:
def count_unique_labels(file_path):
    """Print how often each ``ner_tags`` value occurs in a JSON Lines file.

    Every non-blank line of ``file_path`` is parsed as a JSON object and the
    values of its ``ner_tags`` list are tallied. The tallies are printed in
    ascending label order, followed by the number of distinct labels.

    Args:
        file_path: Path of the JSON Lines file to inspect.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``ner_tags`` key.
        OSError: If the file cannot be opened or read.
    """
    # Tally incrementally rather than collecting every tag in a list first.
    label_counts = Counter()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank / whitespace-only lines are not records; json.loads would
            # raise on them, so skip them.
            if not line.strip():
                continue
            label_counts.update(json.loads(line)['ner_tags'])

    print("Unique labels and their counts:")
    for label, count in sorted(label_counts.items()):
        print(f"Label {label}: {count} occurrences")

    print(f"\nTotal number of unique labels: {len(label_counts)}")

In [8]:
# Usage: print the label distribution of a single dataset file.
file_path = r"C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json"  # Replace with your actual file path
count_unique_labels(file_path)

Unique labels and their counts:
Label 0: 15653 occurrences
Label 1: 668 occurrences
Label 2: 462 occurrences
Label 3: 495 occurrences
Label 4: 162 occurrences
Label 5: 195 occurrences
Label 6: 344 occurrences

Total number of unique labels: 7


In [9]:
# Define the list of files whose label distributions should be reported
file_paths = [
    r"C:\Users\Sakib Ahmed\Desktop\samples\dev - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\dev - Only Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\train - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\train - Only Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\test - No Per.json",
    r"C:\Users\Sakib Ahmed\Desktop\samples\test - Only Per.json",
]

# Iterate over each file and print its label counts (read-only; the files
# are not modified by this cell)
for file_path in file_paths:
    count_unique_labels(file_path)

Unique labels and their counts:
Label 0: 15653 occurrences
Label 1: 668 occurrences
Label 2: 462 occurrences
Label 3: 495 occurrences
Label 4: 162 occurrences
Label 5: 195 occurrences
Label 6: 344 occurrences

Total number of unique labels: 7
Unique labels and their counts:
Label 0: 2271 occurrences
Label 7: 110 occurrences
Label 8: 57 occurrences

Total number of unique labels: 3
Unique labels and their counts:
Label 0: 69979 occurrences
Label 1: 3216 occurrences
Label 2: 2098 occurrences
Label 3: 2299 occurrences
Label 4: 878 occurrences
Label 5: 835 occurrences
Label 6: 1192 occurrences

Total number of unique labels: 7
Unique labels and their counts:
Label 0: 14408 occurrences
Label 7: 676 occurrences
Label 8: 465 occurrences

Total number of unique labels: 3
Unique labels and their counts:
Label 0: 15554 occurrences
Label 1: 666 occurrences
Label 2: 370 occurrences
Label 3: 537 occurrences
Label 4: 191 occurrences
Label 5: 155 occurrences
Label 6: 215 occurrences

Total number of 