In [2]:
import json
import re
from collections import Counter
import unicodedata

def analyze_special_characters(text):
    """
    Count and describe every non-ASCII character in ``text``.

    Args:
        text: String to inspect.

    Returns:
        A list with one dict per distinct non-ASCII character, in order of
        first appearance. Each dict has the keys:
            'character'    - the character itself
            'count'        - number of occurrences in ``text``
            'unicode_name' - official Unicode name, or 'UNKNOWN' when the
                             code point has no name (e.g. control characters)
            'category'     - Unicode general category such as 'Lu' or 'Cc'
            'hex_value'    - code point formatted by ``hex()``, e.g. '0xc3'
        The list is empty when ``text`` contains only ASCII.
    """
    # Everything above U+007F counts as "special"; Counter keeps the order in
    # which each character is first seen.
    char_counter = Counter(char for char in text if ord(char) > 0x7F)

    char_analysis = []
    for char, count in char_counter.items():
        char_analysis.append({
            'character': char,
            'count': count,
            # Only name() can fail for an unnamed code point, so give it a
            # default instead of discarding the category as well.
            'unicode_name': unicodedata.name(char, 'UNKNOWN'),
            # category() is defined for every code point, so unnamed control
            # characters still report their real category ('Cc').
            'category': unicodedata.category(char),
            'hex_value': hex(ord(char)),
        })

    return char_analysis

def find_special_chars_in_json(json_file):
    """
    Report the non-ASCII characters found in the string values of a JSON file.

    The file is read as UTF-8 and walked recursively. Only string *values*
    are inspected (dict keys are not); they are joined with single spaces and
    handed to ``analyze_special_characters``.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The analysis list sorted by descending 'count', or, if anything goes
        wrong, a string of the form "Error processing file: <message>".
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        def iter_strings(node):
            """Yield every string value below ``node`` in document order."""
            if isinstance(node, dict):
                for child in node.values():
                    yield from iter_strings(child)
            elif isinstance(node, list):
                for child in node:
                    yield from iter_strings(child)
            elif isinstance(node, str):
                yield node

        # One big string is enough: the analysis only counts characters.
        joined_text = ' '.join(iter_strings(payload))
        report = analyze_special_characters(joined_text)

        # Most frequent characters first; ties keep first-seen order.
        return sorted(report, key=lambda entry: entry['count'], reverse=True)

    except Exception as e:
        # Callers detect failure by checking whether the result is a string.
        return f"Error processing file: {str(e)}"

def print_analysis_report(analysis_results):
    """
    Print the character analysis as a fixed-width table on stdout.

    Args:
        analysis_results: Iterable of dicts with the keys 'character',
            'count', 'unicode_name', 'category' and 'hex_value'.
            Unicode names are cut to 30 characters so columns stay aligned.
    """
    # Header and data rows share one left-aligned column layout.
    row_layout = "{:<10} {:<8} {:<30} {:<10} {:<10}"

    print("\n=== Special Characters Analysis Report ===\n")
    print(row_layout.format('Character', 'Count', 'Unicode Name', 'Category', 'Hex Value'))
    print("-" * 70)

    for entry in analysis_results:
        print(row_layout.format(
            entry['character'],
            entry['count'],
            entry['unicode_name'][:30],
            entry['category'],
            entry['hex_value'],
        ))

def main(json_file=r"d:\OneDrive - Personal\FleetBlox\Data\Driving Licences\Final Json\combined_data.json",
         report_file='special_chars_report.json'):
    """
    Analyze one JSON file, print the report and save it as JSON.

    Args:
        json_file: Path of the JSON file to analyze. The default is the
            machine-specific path this notebook was originally run with;
            pass your own path when running elsewhere.
        report_file: Where the detailed report is written (UTF-8 JSON,
            non-ASCII characters kept as-is).

    Returns:
        None. On failure only an error line is printed and no report file
        is written.
    """
    print(f"Analyzing file: {json_file}")
    results = find_special_chars_in_json(json_file)

    # find_special_chars_in_json reports failure by returning a message string
    # instead of the result list.
    if isinstance(results, str):
        print(f"Error: {results}")
        return

    print_analysis_report(results)

    # Save results to a file
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nDetailed report saved to '{report_file}'")

# Run the analysis when this code executes as the top-level module
# (a notebook cell or a script), but not when it is imported.
if __name__ == "__main__":
    main()

Analyzing file: d:\OneDrive - Personal\FleetBlox\Data\Driving Licences\Final Json\combined_data.json

=== Special Characters Analysis Report ===

Character  Count    Unicode Name                   Category   Hex Value 
----------------------------------------------------------------------
Ã          187      LATIN CAPITAL LETTER A WITH TI Lu         0xc3      
           101      NO-BREAK SPACE                 Zs         0xa0      
Â          75       LATIN CAPITAL LETTER A WITH CI Lu         0xc2      
©          58       COPYRIGHT SIGN                 So         0xa9      
ƒ          55       LATIN SMALL LETTER F WITH HOOK Ll         0x192     
‰          18       PER MILLE SIGN                 Po         0x2030    
â          9        LATIN SMALL LETTER A WITH CIRC Ll         0xe2      
¬          8        NOT SIGN                       Sm         0xac      
€          8        EURO SIGN                      Sc         0x20ac    
¢          7        CENT SIGN                      Sc

In [27]:
import json
import re

def create_replacement_map():
    """
    Build the mapping of mis-encoded character sequences to their replacements.

    Order matters: ``clean_text_preserve_length`` applies the entries in dict
    order, so the multi-character sequences must stay ahead of the single
    characters they contain (e.g. 'Ãƒ' has to be handled before 'Ã').

    NOTE(review): several replacements are shorter than the sequence they
    replace (e.g. the two characters 'Ãƒ' become one space), so applying this
    map can shorten text. If annotation offsets depend on the original string
    length, they need to be re-checked after cleaning.

    Returns:
        dict mapping each sequence to search for to its replacement string.
    """
    return {
        # Multi-character mojibake sequences: blanked out with spaces.
        'Ã¢â€šÂ¬': '   ',
        'ÃƒÂ': ' ',
        'Ãƒâ€¡': '  ',
        'Ãƒ': ' ',
        # Single stray characters left over after the sequences above.
        # Other symbols (e.g. ©, ƒ, ‰, €, ¢) are currently left untouched.
        '¬': '-',
        'Â': 'A',
        'Ã': 'A',
        'â': 'a',
    }

def clean_text_preserve_length(text, char_map):
    """
    Replace every key of ``char_map`` found in ``text`` with its value.

    Replacements are applied one after another in the mapping's iteration
    order, so a later entry also sees the output of earlier ones. The string
    length is preserved only when each key and its replacement have the same
    length; this function does not enforce that.

    Args:
        text: The value to clean. Normally a string. A list is cleaned
            element by element (some Label Studio result types, e.g.
            TextArea, store ``text`` as a list of strings). Any other
            non-string value, such as None, is returned unchanged.
        char_map: Mapping of substring to replacement string.

    Returns:
        The cleaned string (or list), or ``text`` itself when it is neither.
    """
    if isinstance(text, list):
        return [clean_text_preserve_length(part, char_map) for part in text]
    if not isinstance(text, str):
        # Nothing to clean; avoid crashing the whole run on null/number fields.
        return text

    for special_char, replacement in char_map.items():
        text = text.replace(special_char, replacement)
    return text

def process_annotations(data):
    """
    Clean the text fields of every task in a Label Studio export.

    For each task, ``task['data']['text']`` and every
    ``annotation['result'][i]['value']['text']`` that is present is passed
    through ``clean_text_preserve_length`` using the map returned by
    ``create_replacement_map``. Missing keys are simply skipped.

    Args:
        data: Iterable of task dicts. The tasks are modified in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    replacements = create_replacement_map()

    for task in data:
        # Source text of the task.
        if 'data' in task and 'text' in task['data']:
            payload = task['data']
            payload['text'] = clean_text_preserve_length(payload['text'], replacements)

        # Text snippets copied into each annotation result.
        if 'annotations' not in task:
            continue
        for annotation in task['annotations']:
            if 'result' not in annotation:
                continue
            for region in annotation['result']:
                if 'value' not in region or 'text' not in region['value']:
                    continue
                value = region['value']
                value['text'] = clean_text_preserve_length(value['text'], replacements)

    return data

def clean_label_studio_file(input_file, output_file):
    """
    Load a Label Studio JSON export, clean it and write the result.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path the cleaned JSON is written to (UTF-8, 2-space
            indent, non-ASCII characters kept as-is).

    Returns:
        None. Success or failure is reported with a printed message; any
        exception is caught and printed rather than raised.
    """
    try:
        # Load the export.
        with open(input_file, 'r', encoding='utf-8') as source:
            tasks = json.load(source)

        # Clean task and annotation text.
        cleaned_tasks = process_annotations(tasks)

        # Persist the cleaned export.
        with open(output_file, 'w', encoding='utf-8') as sink:
            json.dump(cleaned_tasks, sink, indent=2, ensure_ascii=False)

        print(f"Successfully cleaned file and saved to {output_file}")

    except Exception as e:
        print(f"Error processing file: {str(e)}")

# Example usage: clean one Label Studio export and write the result next to
# the notebook (the output path is relative to the current working directory).
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path; change it before running
    # on another machine.
    input_file = r"d:\OneDrive - Personal\FleetBlox\Data\Driving Licences\Final Json\Final Combined_Edited.json"
    output_file = "cleaned_label_studio_data.json"
    clean_label_studio_file(input_file, output_file)

Successfully cleaned file and saved to cleaned_label_studio_data.json
