In [7]:
import re
from collections import defaultdict, Counter

In [11]:
import json
import re
from collections import defaultdict, Counter

class LabelStudioEditor:
    def __init__(self, file_path, encoding='utf-8'):
        self.file_path = file_path
        self.encoding = encoding
        try:
            self.data = self._load_data()
        except UnicodeDecodeError:
            # If UTF-8 fails, try with different encodings
            encodings_to_try = ['latin1', 'iso-8859-1', 'cp1252']
            for enc in encodings_to_try:
                try:
                    self.encoding = enc
                    self.data = self._load_data()
                    print(f"Successfully loaded file using {enc} encoding")
                    break
                except UnicodeDecodeError:
                    continue
            else:
                raise UnicodeDecodeError(f"Could not read file with any of these encodings: utf-8, {', '.join(encodings_to_try)}")

    def _load_data(self):
        with open(self.file_path, 'r', encoding=self.encoding) as f:
            return json.load(f)

    def view_annotations(self):
        for item in self.data:
            text = item['data']['text']
            print(f"Document Text: {text}")
            for annotation in item['annotations']:
                for result in annotation['result']:
                    print(f"Label: {result['value']['labels'][0]}, "
                          f"Text: {text[result['value']['start']:result['value']['end']]}")
            print("-" * 50)

    def label_stats(self):
        label_counter = Counter()
        total_annotations = 0

        for item in self.data:
            for annotation in item['annotations']:
                for result in annotation['result']:
                    label = result['value']['labels'][0]
                    label_counter[label] += 1
                    total_annotations += 1

        for label, count in label_counter.items():
            print(f"Label: {label}, Count: {count}")
        
        print(f"\nTotal number of annotations found: {total_annotations}")
        print(f"Total number of unique tags: {len(label_counter)}")

    def search_by_label(self, label):
        matches = []
        
        for item in self.data:
            text = item['data']['text']
            for annotation in item['annotations']:
                label_matches = [
                    result for result in annotation['result'] 
                    if label in result['value']['labels']
                ]
                
                if label_matches:
                    for match in label_matches:
                        matched_text = text[match['value']['start']:match['value']['end']]
                        matches.append({
                            'text': matched_text,
                            'label': label
                        })

        # Print matches
        for match in matches:
            print(f"Label: {match['label']}, Text: {match['text']}")

        print(f"\nNumber of tokens found with label '{label}': {len(matches)}")

    def search_by_token(self, token):
        matches = []
        
        for item in self.data:
            text = item['data']['text']
            if token in text:
                matches.append({
                    'text': text,
                    'annotations': [
                        result for annotation in item['annotations'] 
                        for result in annotation['result']
                    ]
                })

        # Print matches
        for match in matches:
            print(f"Text containing '{token}':")
            print(match['text'])
            for ann in match['annotations']:
                print(f"  Label: {ann['value']['labels'][0]}, "
                      f"Matched Text: {match['text'][ann['value']['start']:ann['value']['end']]}")
            print("-" * 50)

        print(f"\nNumber of documents containing '{token}': {len(matches)}")

    def remove_label(self, label_to_remove):
        for item in self.data:
            for annotation in item['annotations']:
                annotation['result'] = [
                    result for result in annotation['result'] 
                    if label_to_remove not in result['value']['labels']
                ]
        print(f"Label '{label_to_remove}' removed from all annotations.")

    def merge_labels(self, labels_to_merge, new_label):
        for item in self.data:
            for annotation in item['annotations']:
                for result in annotation['result']:
                    if result['value']['labels'][0] in labels_to_merge:
                        result['value']['labels'] = [new_label]
        print(f"Labels {labels_to_merge} merged into '{new_label}'.")

    def rename_labels(self, label_mapping):
        for item in self.data:
            for annotation in item['annotations']:
                for result in annotation['result']:
                    current_label = result['value']['labels'][0]
                    if current_label in label_mapping:
                        result['value']['labels'] = [label_mapping[current_label]]
        print(f"Labels renamed according to {label_mapping}.")

    def delete_annotations_with_label(self, label_to_delete):
        docs_affected = 0
        annotations_deleted = 0

        for item in self.data:
            original_annotation_count = len(item['annotations'])
            
            # Filter out annotations with the specified label
            for annotation in item['annotations'][:]:
                annotation['result'] = [
                    result for result in annotation['result'] 
                    if label_to_delete not in result['value']['labels']
                ]
                
                # If all results are removed, mark the annotation
                if not annotation['result']:
                    item['annotations'].remove(annotation)
                    annotations_deleted += 1

            # Check if any annotations were modified
            if len(item['annotations']) != original_annotation_count:
                docs_affected += 1

        print(f"Annotations with label '{label_to_delete}' have been deleted.")
        print(f"Number of documents affected: {docs_affected}")
        print(f"Number of annotations deleted: {annotations_deleted}")

    def save(self, output_file_path):
        try:
            with open(output_file_path, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, ensure_ascii=False, indent=2)
            print(f"Conversion completed. Output saved to {output_file_path}")
        except Exception as e:
            raise Exception(f"Failed to save file: {str(e)}")

In [16]:
# Initialize the editor with the CoNLL file path
editor = LabelStudioEditor(r'c:\Users\Sakib Ahmed\Downloads\driving license_original.json')

In [17]:
# 1. View Annotations
editor.view_annotations()

Document Text: USA
Florida
DRIVER LICENSE
9CLASS E
Ad DUN B123 - 456-78-910 - 0
IJOHN
2DOE SAMPLE
81234 MAIN STREET LN
JACKSONVILLE FL12345
3 DOB 01/01/1995 15SEX M
4b EXP 01/01/2026 16HGT 5 '- 10 "
SAFE DRIVER
12 REOT NONE
9a END NONE
4aiss 01/01/2019
SDD E123456123456
- - L . aure ÃƒÂ 
Operation of a motor venicle constitutes
consent to any sobriely test required by law
Label: Country, Text: USA
Label: State, Text: Florida
Label: Address, Text: 81234 MAIN STREET LN
JACKSONVILLE FL12345
Label: Sex, Text: M
Label: Height, Text: 5 '- 10 "
Label: Restrictions, Text: NONE
Label: Endorsement, Text: NONE
Label: DD, Text: E123456123456
Label: Class, Text: E
Label: License_Number, Text: B123 - 456-78-910 - 0
Label: Last_Name, Text: JOHN
Label: First_Name, Text: DOE SAMPLE
--------------------------------------------------
Document Text: 50 - GTG
DRIVER ' S LICENSE
-
020133 DLI NO . 053274450 DOB 02/01/1995
CLASS C
EXP 02/01/2026
IONNSTATE MIDDLE
ADAMS
HZROCKWELL DR
LAGRANGE , GA 30240 - 9713 

In [18]:
# 2. Label Statistics
editor.label_stats()

Label: Country, Count: 390
Label: State, Count: 528
Label: Address, Count: 695
Label: Sex, Count: 424
Label: Height, Count: 433
Label: Restrictions, Count: 347
Label: Endorsement, Count: 348
Label: DD, Count: 316
Label: Class, Count: 383
Label: License_Number, Count: 593
Label: Last_Name, Count: 607
Label: First_Name, Count: 615
Label: Document_Number, Count: 184
Label: Eyes, Count: 412
Label: Weight, Count: 245
Label: Other, Count: 234
Label: Hair, Count: 127
Label: License_Type, Count: 30
Label: Race, Count: 1
Label: Birth_Place, Count: 111
Label: Authority, Count: 118
Label: Issuance_Number, Count: 35
Label: DL_Class, Count: 113

Total number of annotations found: 7289
Total number of unique tags: 23


In [None]:
# 3. Search Annotations with a specific label
editor.search_by_label('B-PER')

Line 3: Hulls -X- _ B-PER
Line 188: Areces -X- _ B-PER
Line 195: Juan -X- _ B-PER
Line 442: Jaime -X- _ B-PER
Line 446: Chevenement -X- _ B-PER
Line 680: Luis -X- _ B-PER
Line 749: Conchita -X- _ B-PER
Line 764: Martina -X- _ B-PER
Line 809: SuÃ¡rez -X- _ B-PER
...
Line 109121: Samani -X- _ B-PER

Number of tokens found with label 'B-PER': 1669
Number of sentences containing label 'B-PER': 1079


In [None]:
# 4. Search Annotations with a specific label
editor.search_by_token("Florida")

Line 26203: Florida -X- _ B-LOC

Number of tokens found with 'Florida': 1
Number of sentences containing 'Florida': 1


In [10]:
# 5. Remove specific label
editor.remove_label('B-PER')

Label 'B-PER' removed.


In [11]:
# 6. Merge multiple labels into one
editor.merge_labels(['B-MISC', 'I-MISC', 'B-ORG'], 'C-MISC')

Labels ['B-MISC', 'I-MISC', 'B-ORG'] merged into 'C-MISC'.


In [12]:
# Rechecking Label Statistics
editor.label_stats()

Label: O, Count: 236241
Label: C-MISC, Count: 12775
Label: B-LOC, Count: 4913
Label: I-ORG, Count: 4992
Label: I-LOC, Count: 1891
Label: I-PER, Count: 3903

Total number of labels found: 264715


In [13]:
# 7. Rename labels based on JSON mapping
editor.rename_labels({
    'I-PER':'A-MISC',
    'B-LOC':'A-LOC'
})

Labels renamed according to {'I-PER': 'A-MISC', 'B-LOC': 'A-LOC'}.


In [14]:
# Rechecking Label Statistics
editor.label_stats()

Label: O, Count: 236241
Label: C-MISC, Count: 12775
Label: A-LOC, Count: 4913
Label: I-ORG, Count: 4992
Label: I-LOC, Count: 1891
Label: A-MISC, Count: 3903

Total number of labels found: 264715


In [15]:
# 8. Delete an entire sentence containing an specific label
editor.delete_sentences_with_label("I-LOC")

Sentences containing the label 'I-LOC' have been deleted.
Number of sentences deleted: 413
Number of tokens deleted: 14029


In [16]:
# 9. Delete an entire sentence containing no label
editor.delete_sentences_without_annotations()


Sentences without annotations have been deleted successfully.
Number of sentences deleted: 2123
Number of tokens deleted: 31337
      


In [None]:
# Save the updated CoNLL file
editor.save('updated_file.json')


Updated file saved to updated_conll_file.conll
      
