# Import necessary libraries

In [1]:
import re
from collections import Counter
import logging
import unicodedata

# Defining necessary functions

In [3]:
def read_text_file(file_path):
    """Return the full contents of a UTF-8 text file, or ``None`` on failure.

    Args:
        file_path: Path of the file to open for reading.

    Returns:
        The decoded file contents as a string. If opening or reading raises
        any exception, the error is logged and ``None`` is returned instead.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as e:
        # Best-effort read: report the problem and signal failure with None.
        logging.error(f"Error reading file: {e}")
        contents = None
    return contents

def check_specific_special_chars(text):
    """Report which of a fixed list of German/Greek letters occur in *text*.

    Args:
        text: The string to search.

    Returns:
        A set holding each character from the fixed list that appears at
        least once in *text* (empty when none do). Every hit is also logged
        together with its Unicode name.
    """
    # The fixed list of characters this check looks for.
    specific_chars = 'äöüßÄÖÜαβγδεζηθικλμνξοπρστυφχψωΔ'
    found_chars = {candidate for candidate in specific_chars if candidate in text}

    logging.info("\n=== Specific Special Characters Found ===")
    if not found_chars:
        logging.info("None of the specific special characters were found in the text")
        return found_chars

    # Sorted so the report order is stable from run to run.
    for char in sorted(found_chars):
        char_name = unicodedata.name(char)
        logging.info(f"Found '{char}' - {char_name}")
    return found_chars

# Matches one whole word (a maximal run of ``\w`` characters) that contains at
# least one of the targeted German/Greek letters. The negative lookbehind makes
# the engine attempt a match only at the start of a word; without it the
# pattern is retried from every offset inside long runs that contain no
# special character (e.g. "_______" rules), which is quadratic in run length.
_SPECIAL_TOKEN_RE = re.compile(
    r"""
    (?<!\w)                                      # start only at the beginning of a word
    \w*[äöüßÄÖÜαβγδεζηθικλμνξοπρστυφχψωΔ]+\w*    # Words with special chars
    """,
    re.VERBOSE,
)


def analyze_text_tokens(text):
    """Find words containing German/Greek special letters and log a summary.

    A token is a maximal run of word characters that contains at least one of
    the targeted letters. Tokens that contain any digit are discarded.

    Args:
        text: The string to analyse.

    Returns:
        A set of ``(char, unicode_name, unicode_category)`` tuples, one for
        each distinct non-ASCII character occurring in the kept tokens. The
        name is ``'UNKNOWN'`` for characters without a Unicode name.
    """
    # No-op when the root logger is already configured; kept so the function
    # still emits output when it is called on its own.
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logger = logging.getLogger(__name__)

    logger.info("\n=== Input Text Statistics ===")
    logger.info(f"Input text length: {len(text)} characters")

    # Tokens consist solely of \w characters, so no quote/space stripping is
    # needed; only the digit filter remains.
    tokens = [
        token
        for token in _SPECIAL_TOKEN_RE.findall(text)
        if not any(char.isdigit() for char in token)
    ]

    # Collect every non-ASCII character seen in the kept tokens.
    special_chars = set()
    special_char_count = 0
    for token in tokens:
        for char in token:
            if char.isascii():
                continue
            special_char_count += 1
            special_chars.add(
                (char, unicodedata.name(char, 'UNKNOWN'), unicodedata.category(char))
            )

    logger.info("\n=== Token Analysis Summary ===")
    logger.info(f"Total tokens detected: {len(tokens)}")
    logger.info(f"Unique tokens: {len(set(tokens))}")
    logger.info(f"Total special characters found: {special_char_count}")

    if special_chars:
        logger.info("\n=== Special Characters Details ===")
        for char, name, category in sorted(special_chars):
            logger.info(f"Character: '{char}' - Unicode: {name}, Category: {category}")

    # Per-token frequency report, in sorted token order.
    token_counts = Counter(tokens)
    if token_counts:
        logger.info("\n=== Tokens with Special Characters ===")
        for token, count in sorted(token_counts.items()):
            if count > 1:
                logger.info(f"'{token}' appears {count} times")
            else:
                logger.info(f"'{token}' appears once")

    return special_chars

def detect_character_set(text):
    """Log and return every distinct non-ASCII, non-digit character in *text*.

    Args:
        text: The string to inspect.

    Returns:
        A set of ``(char, unicode_name, unicode_category)`` tuples. Characters
        that have no Unicode name (for example C1 control characters) are
        included with the name ``'UNKNOWN'`` rather than being dropped, which
        matches how ``analyze_text_tokens`` reports such characters.
    """
    logging.info("\n=== Character Set Analysis ===")
    # Digits (as judged by str.isdigit) are excluded before collecting the
    # distinct non-ASCII characters.
    special_chars = sorted(
        {char for char in text if not char.isdigit() and not char.isascii()}
    )

    found_special_chars = set()
    for char in special_chars:
        # name() with a default never raises; category() never raises either.
        char_name = unicodedata.name(char, None)
        char_category = unicodedata.category(char)
        if char_name is None:
            logging.info(f"Character: '{char}' - Unknown Unicode name")
            char_name = 'UNKNOWN'
        else:
            logging.info(f"Character: '{char}' - Unicode Name: {char_name}, Category: {char_category}")
        found_special_chars.add((char, char_name, char_category))

    return found_special_chars

# Running the analysis functions

In [4]:
if __name__ == "__main__":
    # Script entry point: read one text file and run the three analyses on it.

    # Configure logging so that only the bare message text is emitted.
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    
    # Specify your input file path (hard-coded; edit before running).
    file_path = r'd:\OneDrive - Green Energy\Desktop\HF_NER_Med\data\kitchen\reports\ocr_texts.txt'  # Replace with your file path
    
    # Read the text file; read_text_file returns None if reading failed.
    text = read_text_file(file_path)
    
    # NOTE: this truthiness check skips the analyses both when reading failed
    # (None) and when the file is empty ("").
    if text:
        # Check for specific special characters first
        specific_chars_found = check_specific_special_chars(text)
        
        # Then analyze the character set (all non-ASCII, non-digit characters)
        char_set_results = detect_character_set(text)
        
        # Finally process the tokens (words containing the special characters)
        token_results = analyze_text_tokens(text)



=== Specific Special Characters Found ===
Found 'Ä' - LATIN CAPITAL LETTER A WITH DIAERESIS
Found 'Ö' - LATIN CAPITAL LETTER O WITH DIAERESIS
Found 'Ü' - LATIN CAPITAL LETTER U WITH DIAERESIS
Found 'ß' - LATIN SMALL LETTER SHARP S
Found 'ä' - LATIN SMALL LETTER A WITH DIAERESIS
Found 'ö' - LATIN SMALL LETTER O WITH DIAERESIS
Found 'ü' - LATIN SMALL LETTER U WITH DIAERESIS
Found 'δ' - GREEK SMALL LETTER DELTA
Found 'μ' - GREEK SMALL LETTER MU

=== Character Set Analysis ===
Character: '«' - Unicode Name: LEFT-POINTING DOUBLE ANGLE QUOTATION MARK, Category: Pi
Character: '°' - Unicode Name: DEGREE SIGN, Category: So
Character: '»' - Unicode Name: RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK, Category: Pf
Character: 'Ä' - Unicode Name: LATIN CAPITAL LETTER A WITH DIAERESIS, Category: Lu
Character: 'Ö' - Unicode Name: LATIN CAPITAL LETTER O WITH DIAERESIS, Category: Lu
Character: 'Ü' - Unicode Name: LATIN CAPITAL LETTER U WITH DIAERESIS, Category: Lu
Character: 'ß' - Unicode Name: LATIN SMA