In [1]:
import json
import re
from collections import Counter
import unicodedata

def analyze_special_characters(text):
    """
    Summarise every non-ASCII character that occurs in ``text``.

    Args:
        text: The string to inspect.

    Returns:
        A list with one dict per distinct non-ASCII character, in order of
        first appearance. Each dict has the keys ``character``, ``count``,
        ``unicode_name``, ``category`` and ``hex_value``. ``unicode_name`` is
        ``'UNKNOWN'`` for code points that have no name in the Unicode
        database (for example C1 control characters); ``category`` is always
        the real two-letter general category.
    """
    # Everything above U+007F is outside ASCII. Counter keeps first-seen order.
    char_counter = Counter(ch for ch in text if ord(ch) > 0x7F)

    char_analysis = []
    for char, count in char_counter.items():
        char_analysis.append({
            'character': char,
            'count': count,
            # name() raises ValueError for unnamed code points unless a
            # default is supplied, so pass the placeholder directly.
            'unicode_name': unicodedata.name(char, 'UNKNOWN'),
            # category() is defined for every code point, so it is reported
            # even when the character has no name.
            'category': unicodedata.category(char),
            'hex_value': hex(ord(char)),
        })

    return char_analysis

def find_special_chars_in_json(json_file):
    """
    Report the non-ASCII characters found in the string values of a JSON file.

    The file is read as UTF-8. Every string stored as a dict value or list
    item (at any depth) is collected; dict keys are not inspected. The strings
    are joined with single spaces and handed to ``analyze_special_characters``.

    Args:
        json_file: Path of the JSON file to inspect (the author uses it on
            Label Studio exports).

    Returns:
        The analysis entries sorted by ``count``, most frequent first. If
        anything goes wrong, a string of the form
        ``"Error processing file: <reason>"`` is returned instead of raising.
    """
    def iter_strings(node):
        """Yield each string value reachable from ``node``, depth first."""
        if isinstance(node, str):
            yield node
        elif isinstance(node, dict):
            for child in node.values():
                yield from iter_strings(child)
        elif isinstance(node, list):
            for child in node:
                yield from iter_strings(child)

    try:
        with open(json_file, 'r', encoding='utf-8') as handle:
            document = json.load(handle)

        joined_text = ' '.join(iter_strings(document))
        report = analyze_special_characters(joined_text)

        # Stable sort: characters with equal counts keep first-seen order.
        return sorted(report, key=lambda entry: entry['count'], reverse=True)

    except Exception as exc:
        # Callers tell success from failure by checking for a str result.
        return f"Error processing file: {str(exc)}"

def print_analysis_report(analysis_results):
    """
    Write the special-character findings to stdout as a fixed-width table.

    Args:
        analysis_results: Iterable of dicts carrying the keys ``character``,
            ``count``, ``unicode_name``, ``category`` and ``hex_value``.
            Unicode names longer than 30 characters are truncated to fit
            their column.
    """
    # One layout shared by the header and every data row keeps columns aligned.
    row_layout = "{:<10} {:<8} {:<30} {:<10} {:<10}"

    print("\n=== Special Characters Analysis Report ===\n")
    print(row_layout.format('Character', 'Count', 'Unicode Name', 'Category', 'Hex Value'))
    print("-" * 70)

    for entry in analysis_results:
        print(row_layout.format(
            entry['character'],
            entry['count'],
            entry['unicode_name'][:30],
            entry['category'],
            entry['hex_value'],
        ))

def main(json_file=None, report_file='special_chars_report.json'):
    """
    Analyse one JSON file, print the report and save it as JSON.

    Args:
        json_file: Path of the JSON file to analyse. When omitted, the
            original hard-coded location is used so that ``main()`` keeps
            working as before.
        report_file: Where the detailed JSON report is written. Defaults to
            ``'special_chars_report.json'`` in the current working directory.
    """
    if json_file is None:
        # NOTE(review): machine-specific absolute path kept only as the
        # backward-compatible fallback; pass json_file explicitly elsewhere.
        json_file = r"c:\Users\Sakib Ahmed\Downloads\final_project-33-at-2024-11-26-23-07-87c5857c.json"

    print(f"Analyzing file: {json_file}")
    results = find_special_chars_in_json(json_file)

    # find_special_chars_in_json signals failure by returning a message string.
    if isinstance(results, str):
        print(f"Error: {results}")
    else:
        print_analysis_report(results)

        # ensure_ascii=False keeps the special characters readable in the report.
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"\nDetailed report saved to '{report_file}'")

# Run the analysis only when this code is executed as the top-level program,
# not when it is imported from another module.
if __name__ == "__main__":
    main()

Analyzing file: c:\Users\Sakib Ahmed\Downloads\final_project-33-at-2024-11-26-23-07-87c5857c.json

=== Special Characters Analysis Report ===

Character  Count    Unicode Name                   Category   Hex Value 
----------------------------------------------------------------------
           82       NO-BREAK SPACE                 Zs         0xa0      
©          58       COPYRIGHT SIGN                 So         0xa9      
‰          18       PER MILLE SIGN                 Po         0x2030    
¬          8        NOT SIGN                       Sm         0xac      
¢          7        CENT SIGN                      Sc         0xa2      
š          7        LATIN SMALL LETTER S WITH CARO Ll         0x161     
§          5        SECTION SIGN                   Po         0xa7      
¨          4        DIAERESIS                      Sk         0xa8      
‡          4        DOUBLE DAGGER                  Po         0x2021    
‚          3        SINGLE LOW-9 QUOTATION MARK    Ps   

In [None]:
def fix_encoding(text):
    """
    Replace common mojibake sequences and special symbols with ASCII stand-ins.

    Patterns are applied longest first, so a multi-character sequence such as
    'Ã©' is handled by its own entry instead of being broken up by the
    shorter 'Ã' rule. Afterwards runs of spaces are collapsed to one space.
    Newlines and tabs are left in place, and line endings are normalised
    to '\\n'.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Its length may differ from the input because
        several replacements change the number of characters.
    """
    # Mapping of common problematic character sequences
    encoding_fixes = {
        # Mojibake left behind by text decoded with the wrong codec
        'ÃƒÂ': 'A',
        'Ã': 'A',
        'ƒÂ': '',
        'â€™': "'",  # Smart quote
        'â€œ': '"',  # Left double quote
        'â€': '"',   # Right double quote
        'Ã©': 'e',
        'Ã¨': 'e',
        'Â': ' ',

        # Space and formatting characters
        '\xa0': ' ',    # NO-BREAK SPACE
        '\u2028': ' ',  # LINE SEPARATOR
        '\u2029': ' ',  # PARAGRAPH SEPARATOR

        # Common symbols
        '©': '(c)',     # COPYRIGHT SIGN
        '‰': '%',       # PER MILLE SIGN
        '¬': '-',       # NOT SIGN
        '¢': 'c',       # CENT SIGN
        'š': 's',       # LATIN SMALL LETTER S WITH CARON
        '§': 'S',       # SECTION SIGN
        '¨': '',        # DIAERESIS
        '‡': '+',       # DOUBLE DAGGER
        '‚': "'",       # SINGLE LOW-9 QUOTATION MARK
        'ƒ': 'f',       # LATIN SMALL LETTER F WITH HOOK
        '´': "'",       # ACUTE ACCENT
        'ª': 'a',       # FEMININE ORDINAL INDICATOR
        '¡': '!',       # INVERTED EXCLAMATION MARK
        'ˆ': '^',       # MODIFIER LETTER CIRCUMFLEX ACCENT
        'â': 'a',       # LATIN SMALL LETTER A WITH CIRCUMFLEX

        # Additional clean-up for multiple spaces and line breaks
        '  ': ' ',      # Double space to single space
        '\r\n': '\n',   # Normalize line endings
        '\r': '\n',     # Carriage return to newline
    }

    # Longest patterns first: in plain dict order 'Ã' would be rewritten
    # before 'Ã©' / 'Ã¨' are tried, so those entries could never match.
    # sorted() is stable, so equally long patterns keep their listed order.
    for bad_char in sorted(encoding_fixes, key=len, reverse=True):
        text = text.replace(bad_char, encoding_fixes[bad_char])

    # Replacements can create new double spaces; collapse runs of spaces only.
    # Splitting on all whitespace would also flatten the newlines that were
    # just normalised above.
    while '  ' in text:
        text = text.replace('  ', ' ')

    return text

def clean_json_with_annotations(input_file, output_file):
    """
    Copy a JSON file, passing every string value through ``fix_encoding``.

    The nesting of dicts and lists is reproduced unchanged. Dict keys and
    non-string values (numbers, booleans, null) are left as they are.
    Annotation offsets are not recalculated by this version.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path the cleaned JSON is written to (UTF-8, 2-space
            indent, non-ASCII characters kept literal).
    """
    import json

    def scrub(node):
        """Return a cleaned copy of ``node``, recursing into containers."""
        if isinstance(node, str):
            return fix_encoding(node)
        if isinstance(node, list):
            return [scrub(element) for element in node]
        if isinstance(node, dict):
            return {key: scrub(value) for key, value in node.items()}
        return node

    with open(input_file, 'r', encoding='utf-8') as source:
        raw_data = json.load(source)

    # Clean before opening the output so a failure leaves no partial file.
    cleaned_data = scrub(raw_data)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(cleaned_data, sink, ensure_ascii=False, indent=2)

    print(f"File cleaned and saved to {output_file}")

# Usage: clean one JSON file and write the result to output_file, which is
# resolved relative to the current working directory.
# NOTE(review): input_file is an absolute, machine-specific Windows path —
# point it at your own copy of the file before running this cell.
input_file = r'd:\OneDrive - Personal\final.json'
output_file = 'cleaned_pastde.json'
clean_json_with_annotations(input_file, output_file)

File cleaned and saved to 10.json


In [30]:
def fix_encoding(text, annotations=None, fixes=None):
    """
    Fix encoding issues in ``text`` and shift annotation offsets to match.

    The text is scanned once from left to right. At each position the longest
    matching pattern from ``fixes`` is replaced; otherwise the character is
    copied (NO-BREAK SPACE, LINE SEPARATOR and PARAGRAPH SEPARATOR become a
    plain space). While scanning, every original offset is mapped to its
    offset in the cleaned text.

    Args:
        text: The string to clean.
        annotations: Optional list of dicts with integer ``'start'`` and
            ``'end'`` offsets into ``text``. Presumably ``'end'`` is
            exclusive — verify against the producer of the data.
        fixes: Optional ``{pattern: replacement}`` mapping. Defaults to the
            module-level ``encoding_fixes`` table.

    Returns:
        The cleaned string when ``annotations`` is ``None``. Otherwise a
        ``(cleaned_text, adjusted_annotations)`` tuple, also for an empty
        list. The adjusted annotations are shallow copies with ``'start'``
        and ``'end'`` remapped. An offset that falls inside a replaced
        sequence maps to the start of its replacement; an offset outside
        ``0..len(text)`` is left unchanged.
    """
    if fixes is None:
        # Looked up at call time, so the table may be defined after this
        # function in the same cell.
        fixes = encoding_fixes

    # Longest patterns first so 'Ã©' is not pre-empted by the shorter 'Ã'.
    # Empty patterns are skipped: they would match without consuming input.
    ordered_fixes = sorted(
        ((bad, good) for bad, good in fixes.items() if bad),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    text_len = len(text)
    # offset_map[i] is the cleaned-text offset of original offset i. The extra
    # slot covers i == len(text), used by annotations ending at the very end.
    offset_map = [0] * (text_len + 1)
    pieces = []
    new_pos = 0

    i = 0
    while i < text_len:
        for bad_char, replacement in ordered_fixes:
            if text.startswith(bad_char, i):
                consumed = len(bad_char)
                emitted = replacement
                break
        else:
            # No pattern matched: copy one character, normalising odd spaces.
            consumed = 1
            emitted = ' ' if text[i] in ('\xa0', '\u2028', '\u2029') else text[i]

        # Every offset inside the consumed span points at the replacement start.
        for original_pos in range(i, i + consumed):
            offset_map[original_pos] = new_pos

        pieces.append(emitted)
        new_pos += len(emitted)
        i += consumed

    offset_map[text_len] = new_pos
    cleaned_text = ''.join(pieces)

    if annotations is None:
        return cleaned_text

    def remap(position):
        """Translate an original offset; unknown values pass through."""
        if isinstance(position, int) and 0 <= position <= text_len:
            return offset_map[position]
        return position

    adjusted_annotations = []
    for ann in annotations:
        adjusted_ann = ann.copy()
        adjusted_ann.update({'start': remap(ann['start']), 'end': remap(ann['end'])})
        adjusted_annotations.append(adjusted_ann)

    return cleaned_text, adjusted_annotations

def clean_json_with_annotations(input_file, output_file):
    """
    Clean encoding while preserving JSON structure and annotations.

    Every string value in the document is passed through ``fix_encoding``.
    A dict holding a string ``'text'`` and a list ``'annotations'`` is treated
    as an annotated entry: both are cleaned together so the offsets follow
    the text, and the entry's remaining values are cleaned recursively.
    Dict keys are never modified.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path the cleaned JSON is written to (UTF-8, 2-space
            indent, non-ASCII characters kept literal).
    """
    import json

    # Read the original file
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def clean_annotated_entry(obj):
        """Clean one {'text': str, 'annotations': list, ...} dict."""
        result = fix_encoding(obj['text'], obj['annotations'])
        if isinstance(result, tuple):
            cleaned_text, adjusted_annotations = result
        else:
            # fix_encoding may return only the text when there is nothing to
            # adjust (e.g. an empty list); unpacking that string would split
            # it into characters or raise.
            cleaned_text, adjusted_annotations = result, obj['annotations']

        cleaned_entry = {}
        for key, value in obj.items():
            if key == 'text':
                cleaned_entry[key] = cleaned_text
            elif key == 'annotations':
                cleaned_entry[key] = adjusted_annotations
            else:
                # Sibling fields are cleaned like any other value.
                cleaned_entry[key] = deep_clean(value)
        return cleaned_entry

    def deep_clean(obj):
        """Return a cleaned copy of ``obj``, recursing into dicts and lists."""
        if isinstance(obj, dict):
            # Only the str + list shape is an annotated entry; anything else
            # goes through the generic recursion below.
            if isinstance(obj.get('text'), str) and isinstance(obj.get('annotations'), list):
                return clean_annotated_entry(obj)
            return {k: deep_clean(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [deep_clean(item) for item in obj]
        elif isinstance(obj, str):
            return fix_encoding(obj)
        else:
            return obj

    # Clean the entire data structure
    cleaned_data = deep_clean(data)

    # Write the cleaned data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

    print(f"File cleaned and saved to {output_file}")

# Replacement table read by fix_encoding: {problem sequence: replacement}.
# Order matters when patterns are tried in dict order: a pattern must come
# before any shorter pattern that is a prefix of it ('Ã©' before 'Ã',
# '\r\n' before '\r'), otherwise the longer entry can never match.
encoding_fixes = {
    # Mojibake left behind by text decoded with the wrong codec
    'ÃƒÂ': 'A',
    'Ã©': 'e',
    'Ã¨': 'e',
    'Ã': 'A',
    'ƒÂ': '',
    'â€™': "'",  # Smart quote
    'â€œ': '"',  # Left double quote
    'â€': '"',   # Right double quote
    'Â': ' ',

    # Space and formatting characters
    '\xa0': ' ',    # NO-BREAK SPACE
    '\u2028': ' ',  # LINE SEPARATOR
    '\u2029': ' ',  # PARAGRAPH SEPARATOR

    # Common symbols
    '©': '(c)',     # COPYRIGHT SIGN
    '‰': '%',       # PER MILLE SIGN
    '¬': '-',       # NOT SIGN
    '¢': 'c',       # CENT SIGN
    'š': 's',       # LATIN SMALL LETTER S WITH CARON
    '§': 'S',       # SECTION SIGN
    '¨': '',        # DIAERESIS
    '‡': '+',       # DOUBLE DAGGER
    '‚': "'",       # SINGLE LOW-9 QUOTATION MARK
    'ƒ': 'f',       # LATIN SMALL LETTER F WITH HOOK
    '´': "'",       # ACUTE ACCENT
    'ª': 'a',       # FEMININE ORDINAL INDICATOR
    '¡': '!',       # INVERTED EXCLAMATION MARK
    'ˆ': '^',       # MODIFIER LETTER CIRCUMFLEX ACCENT
    'â': 'a',       # LATIN SMALL LETTER A WITH CIRCUMFLEX

    # Additional clean-up for multiple spaces and line breaks
    '  ': ' ',      # Double space to single space
    '\r\n': '\n',   # Normalize line endings
    '\r': '\n',     # Carriage return to newline
}

# Usage: clean one JSON file and write the result to output_file, which is
# resolved relative to the current working directory.
# NOTE(review): input_file is an absolute, machine-specific Windows path —
# point it at your own copy of the file before running this cell.
input_file = r'd:\OneDrive - Personal\FleetBlox\Data\Driving Licences\Final Json\train_USA.json'
output_file = '1.json'
clean_json_with_annotations(input_file, output_file)


File cleaned and saved to 1.json
