In [1]:
import os

# Path to the corpus file downloaded from https://lang.org.ua/en/ubertext/
# Set the UBERTEXT_CORPUS_PATH environment variable to point at your copy of the file;
# when it is not set, the file is looked up in the current user's Downloads folder.
corpus_path = os.environ.get(
    'UBERTEXT_CORPUS_PATH',
    os.path.expanduser('~/Downloads/ubertext-cleaned.txt'),
)

# The total number of news items in the Uber Text 2.0 News Cleaned corpus at the time of download.
# It is the denominator of the progress percentage printed while the corpus is processed.
total_news_count = 14232850

# Optional step: recount the news items in the corpus so the progress percentage is exact.
# NOTE(review): the processing cells import this helper from `sources.helpers.read_news_stream`;
# confirm which module path is the right one before uncommenting.
# from helpers.read_news_stream import read_news_stream
#
# total_news_count = 0
# for news_text in read_news_stream(corpus_path):
#     total_news_count += 1

# 4.2 Phone Number Normalization Results
## 4.2.1 Methodology

In [4]:
import re
from sources.normalizers.ukrainian_phone_normalizer import UkrainianPhoneNormalizer
from sources.helpers.read_news_stream import read_news_stream
from helpers.format_phone_number import format_phone_number

# Occurrence counters keyed by the string returned by format_phone_number:
# one for the matched text as found, one for the normalizer's replacement.
stats_before_normalization = dict()
stats_after_normalization = dict()

# How often (in news items) progress is reported, and the running item count.
progress_step = 100_000
current_news_number = 0

def calculate_different_phone_numbers_format(news_text: str):
    """Tally the phone-number formats found in ``news_text`` before and after normalization.

    Each pattern in ``UkrainianPhoneNormalizer.REGULAR_PHONE_PATTERNS`` and then in
    ``UkrainianPhoneNormalizer.SPECIAL_PHONE_PATTERNS`` is substituted over the text in turn.
    For every match that ``UkrainianPhoneNormalizer.should_not_match`` does not reject, the
    ``format_phone_number`` of the matched text is counted in the module-level
    ``stats_before_normalization`` dict, and the ``format_phone_number`` of the normalized
    replacement is counted in ``stats_after_normalization``.

    Args:
        news_text: Text of a single news item.

    Returns:
        None. The normalized text is discarded; only the two counters are updated.
    """
    normalized_line = news_text

    def make_replacer(format_match):
        """Build the ``re.sub`` callback that normalizes matches with ``format_match``."""
        def replace(match):
            # `normalized_line` is looked up when the callback runs, so the check sees the
            # text that the current `re.sub` call is scanning.
            if UkrainianPhoneNormalizer.should_not_match(normalized_line, match):
                return match.group(0)

            formatted_phone = format_phone_number(match.group(0))
            stats_before_normalization[formatted_phone] = stats_before_normalization.get(formatted_phone, 0) + 1

            result = format_match(match)

            formatted_result = format_phone_number(result)
            stats_after_normalization[formatted_result] = stats_after_normalization.get(formatted_result, 0) + 1

            return result

        return replace

    # Regular patterns are applied first, then the special ones, each with its own formatter.
    pattern_groups = (
        (UkrainianPhoneNormalizer.REGULAR_PHONE_PATTERNS, UkrainianPhoneNormalizer.format_regular_phone_number),
        (UkrainianPhoneNormalizer.SPECIAL_PHONE_PATTERNS, UkrainianPhoneNormalizer.format_special_phone_number),
    )

    for patterns, format_match in pattern_groups:
        replace = make_replacer(format_match)
        for pattern in patterns:
            normalized_line = re.sub(pattern, replace, normalized_line)

# Stream the corpus one news item at a time, tallying phone formats; every `progress_step`
# items print the ten most frequent pre-normalization formats and the share processed.
for news_text in read_news_stream(corpus_path):
    current_news_number = current_news_number + 1
    calculate_different_phone_numbers_format(news_text)

    if current_news_number % progress_step != 0:
        continue

    print("Current stats:")
    ranked_formats = sorted(stats_before_normalization.items(), key=lambda item: item[1], reverse=True)
    for phone_format, count in ranked_formats[:10]:
        print(f'{phone_format}: {count}')

    percent_done = current_news_number / total_news_count * 100
    print(f'\nProcessed ({percent_done:.2f}%) \n')
    

Current stats:
(0XX) XXX-XX-XX: 23
(0XX) XXX XXXX: 12
0XX-XXX-XX-XX: 10
(0XX) XXX XX XX: 8
0XX XXX XX XX: 6
0XX-XX-XXX-XX: 4
(0XXX)XX-XX-XX: 4
(0XXX) XX-XX-XX: 3
(0XX)XXX XXXX: 3
+38 0XX XXX XX XX: 2

Processed (0.70%) 
Current stats:
(0XX) XXX-XX-XX: 33
0XX-XXX-XX-XX: 20
(0XX) XXX XXXX: 13
(0XX) XXX XX XX: 12
0XX XXX XX XX: 8
+38 (0XX) XXX-XX-XX: 6
+38 0XX XXX XX XX: 5
0XXXXXXXXX: 5
0XX-XX-XXX-XX: 4
(0XXX)XX-XX-XX: 4

Processed (1.41%) 
Current stats:
(0XX) XXX-XX-XX: 63
0XX-XXX-XX-XX: 21
(0XX)XXXXXXX: 14
(0XX) XXX XX XX: 13
(0XX) XXX XXXX: 13
(0XXX) XX-XX-XX: 11
0XX XXX XX XX: 9
+38 0XX XXX XX XX: 8
(0XX)XXX-XX-XX: 8
(0XXX) XXXXXX: 8

Processed (2.11%) 
Current stats:
(0XX) XXX-XX-XX: 70
0XX-XXX-XX-XX: 21
(0XX)XXXXXXX: 14
(0XX) XXX XX XX: 13
(0XX) XXX XXXX: 13
(0XXX) XX-XX-XX: 11
+38 0XX XXX XX XX: 9
0XX XXX XX XX: 9
(0XX)XXX-XX-XX: 8
(0XXX) XXXXXX: 8

Processed (2.81%) 
Current stats:
(0XX) XXX-XX-XX: 87
0XX-XXX-XX-XX: 23
(0XX) XXX XX XX: 17
(0XX)XXXXXXX: 14
(0XX) XXX XXXX: 13
(0XXX

## 4.2.2 Distribution Before Normalization

In [18]:
import pandas as pd

# Number of most frequent formats that are summarized and displayed.
TOP_FORMATS_COUNT = 10

# One row per distinct pre-normalization format, most frequent first, with each format's
# share of all counted phone numbers.
stats_before_normalization_df = (
    pd.DataFrame(stats_before_normalization.items(), columns=['Phone Number', 'Count'])
    .assign(Percentage=lambda frame: frame['Count'] / frame['Count'].sum() * 100)
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

# Count total number of distinct formats
total_formats = len(stats_before_normalization_df)

# Count formats that appear only once
formats_appearing_once = len(stats_before_normalization_df[stats_before_normalization_df['Count'] == 1])

# Calculate percentage of the top formats
top_10_percentage = (
    stats_before_normalization_df.head(TOP_FORMATS_COUNT)['Count'].sum()
    / stats_before_normalization_df['Count'].sum() * 100
)

# Calculate the percentage for the remaining formats
remaining_percentage = 100 - top_10_percentage

# Clamp at zero so the label stays meaningful when fewer formats exist than are shown.
remaining_formats = max(total_formats - TOP_FORMATS_COUNT, 0)

print(f"Total distinct formats: {total_formats}")
print(f"Formats appearing only once: {formats_appearing_once}")
print(f"Top {TOP_FORMATS_COUNT} formats percentage: {top_10_percentage:.1f}%")
print(f"Remaining {remaining_formats} formats percentage: {remaining_percentage:.1f}%")

stats_before_normalization_df.head(TOP_FORMATS_COUNT)

Total distinct formats: 302
Formats appearing only once: 73
Top 10 formats percentage: 69.2%
Remaining 292 formats percentage: 30.8%


Unnamed: 0,Phone Number,Count,Percentage
0,(0XX) XXX-XX-XX,11827,19.258451
1,0XX-XXX-XX-XX,6407,10.432814
2,0XX XXX XX XX,4964,8.083111
3,0XXXXXXXXX,4232,6.891161
4,+380 (XXX) XX-XX-XX,4047,6.589917
5,(0XX) XXX XX XX,3443,5.606396
6,(0XXX) XX-XX-XX,2474,4.028529
7,+38 0XX XXX XX XX,1751,2.851234
8,+380XXXXXXXXX,1723,2.805641
9,0 XXX XXX XXX,1616,2.631408


## 4.2.3 Distribution After Normalization

In [19]:
# One row per distinct post-normalization format, most frequent first, with each format's
# share of all normalized phone numbers.
stats_after_normalization_df = (
    pd.DataFrame(stats_after_normalization.items(), columns=['Phone Number', 'Count'])
    .assign(Percentage=lambda frame: frame['Count'] / frame['Count'].sum() * 100)
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

stats_after_normalization_df


Unnamed: 0,Phone Number,Count,Percentage
0,+380 (XX) XXX-XX-XX,51489,83.84192
1,+380 (XXX) XX-XX-XX,9923,16.15808


# Apostrophe and Quotation Mark Normalization Results Calculation

In [24]:
from sources.normalizers.constants import Constants
from sources.normalizers.apostrophe_normalizer import ApostropheNormalizer
from sources.normalizers.quotation_marks_normalizer import QuotationMarksNormalizer
from sources.normalizers.redundant_apostrophe_spaces_normalizer import RedundantApostropheSpacesNormalizer

# Per-symbol occurrence counters; keys are added by setup_stats() and the counts are
# accumulated while the corpus is processed.
apostrophe_stats_before_normalization = dict()
quotation_marks_stats_before_normalization = dict()
apostrophe_stats_after_apostrophe_normalization = dict()
quotation_marks_stats_after_quotation_marks_normalization = dict()

# Running totals of the warnings and errors reported by each normalizer.
warning_apostrophe_count = error_apostrophe_count = 0
warning_quotation_mark_count = error_quotation_mark_count = 0

# How often (in news items) progress is reported, and the running item count.
progress_step = 100_000
current_news_number = 0

def setup_stats():
    """Print the tracked symbol sets and set every per-symbol counter to zero.

    Each symbol in ``Constants.APOSTROPHES`` gets a zero entry in both apostrophe counters,
    and each symbol in ``Constants.QUOTATION_MARKS`` gets a zero entry in both
    quotation-mark counters (the module-level ``*_stats_*`` dicts).
    """
    print("Setting up stats")
    print("Apostrophes:", Constants.APOSTROPHES)
    print("Quotes:", Constants.QUOTATION_MARKS)

    apostrophe_counters = (
        apostrophe_stats_before_normalization,
        apostrophe_stats_after_apostrophe_normalization,
    )
    quotation_mark_counters = (
        quotation_marks_stats_before_normalization,
        quotation_marks_stats_after_quotation_marks_normalization,
    )

    for counter in apostrophe_counters:
        counter.update(dict.fromkeys(Constants.APOSTROPHES, 0))

    for counter in quotation_mark_counters:
        counter.update(dict.fromkeys(Constants.QUOTATION_MARKS, 0))

def calculate_apostrophe_and_quotation_marks_normalization(news_text: str):
    """Run one news item through the normalizers and accumulate symbol statistics.

    The text is passed through ``RedundantApostropheSpacesNormalizer``, then
    ``ApostropheNormalizer``, then ``QuotationMarksNormalizer``. Occurrences of every tracked
    symbol are added to the module-level counters: both "before" counters after the
    redundant-space step, the apostrophe "after" counter after the apostrophe step, and the
    quotation-mark "after" counter after the quotation-mark step. The lengths of elements 1
    and 2 of each normalizer result are added to the matching warning and error totals
    (presumably collections of warnings and errors; confirm against the normalizers).

    Args:
        news_text: Text of a single news item.
    """
    global warning_apostrophe_count, error_apostrophe_count, warning_quotation_mark_count, error_quotation_mark_count

    def tally(counter, text):
        """Add to each symbol already tracked in ``counter`` its occurrences in ``text``."""
        for symbol in counter:
            counter[symbol] += text.count(symbol)

    # Redundant apostrophe spaces are removed first; only the text of the result is used.
    current_text, _, _ = RedundantApostropheSpacesNormalizer.normalize(news_text)

    # Initial statistics, taken before any symbol is replaced.
    tally(apostrophe_stats_before_normalization, current_text)
    tally(quotation_marks_stats_before_normalization, current_text)

    # Apostrophe normalization.
    apostrophe_result = ApostropheNormalizer.normalize(current_text)
    current_text = apostrophe_result[0]
    warning_apostrophe_count += len(apostrophe_result[1])
    error_apostrophe_count += len(apostrophe_result[2])
    tally(apostrophe_stats_after_apostrophe_normalization, current_text)

    # Quotation-mark normalization, applied to the apostrophe-normalized text.
    quotation_result = QuotationMarksNormalizer.normalize(current_text)
    current_text = quotation_result[0]
    warning_quotation_mark_count += len(quotation_result[1])
    error_quotation_mark_count += len(quotation_result[2])
    tally(quotation_marks_stats_after_quotation_marks_normalization, current_text)

# Zero the per-symbol counters, then stream the whole corpus through the normalizers,
# reporting the share processed every `progress_step` news items.
setup_stats()

for news_text in read_news_stream(corpus_path):
    current_news_number = current_news_number + 1
    calculate_apostrophe_and_quotation_marks_normalization(news_text)

    if current_news_number % progress_step != 0:
        continue

    percent_done = current_news_number / total_news_count * 100
    print(f'\nProcessed ({percent_done:.2f}%) \n')

print("Finished processing")


Setting up stats
Apostrophes: ["'", 'ʹ', 'ʻ', 'ʼ', '‘', '’', '`']
Quotes: ['"', '«', '»', '“', '”', '‟', '„', '❝', '❞']

Processed (0.70%) 

Processed (1.41%) 

Processed (2.11%) 

Processed (2.81%) 

Processed (3.51%) 

Processed (4.22%) 

Processed (4.92%) 

Processed (5.62%) 

Processed (6.32%) 

Processed (7.03%) 

Processed (7.73%) 

Processed (8.43%) 

Processed (9.13%) 

Processed (9.84%) 

Processed (10.54%) 

Processed (11.24%) 

Processed (11.94%) 

Processed (12.65%) 

Processed (13.35%) 

Processed (14.05%) 

Processed (14.75%) 

Processed (15.46%) 

Processed (16.16%) 

Processed (16.86%) 

Processed (17.56%) 

Processed (18.27%) 

Processed (18.97%) 

Processed (19.67%) 

Processed (20.38%) 

Processed (21.08%) 

Processed (21.78%) 

Processed (22.48%) 

Processed (23.19%) 

Processed (23.89%) 

Processed (24.59%) 

Processed (25.29%) 

Processed (26.00%) 

Processed (26.70%) 

Processed (27.40%) 

Processed (28.10%) 

Processed (28.81%) 

Processed (29.51%) 

Processed (

# 4.3 Apostrophe Normalization Results

In [30]:
import pandas as pd
import unicodedata
from helpers.unicode import unicode

# Assuming these variables already contain the count data:
# apostrophe_stats_before_normalization = {...}
# apostrophe_stats_after_apostrophe_normalization = {...}
# warning_apostrophe_count, error_apostrophe_count = ...

# Create a dataframe for apostrophe statistics
def create_apostrophe_results_df(before_stats, after_stats):
    """Build the apostrophe before/after table, sorted by frequency, with a totals row.

    Args:
        before_stats: Mapping of apostrophe symbol to its count before normalization.
        after_stats: Mapping of apostrophe symbol to its count after normalization; it must
            contain every key of ``before_stats``.

    Returns:
        A ``(df, publication_table)`` tuple. ``df`` has one row per symbol, ordered by
        descending 'Before Normalization' count, followed by a 'Total' row; percentages are
        rounded to 4 decimals. ``publication_table`` holds the same rows with the columns
        ordered for publication. Rows are indexed 0..n in display order.
    """
    rows = [
        {
            'Unicode': unicode(symbol),  # code-point label produced by the project helper
            'Symbol': symbol,
            'Before Normalization': before_count,
            'After Normalization': after_stats[symbol],
        }
        for symbol, before_count in before_stats.items()
    ]

    df = pd.DataFrame(rows)

    # Calculate total counts
    total_before = df['Before Normalization'].sum()
    total_after = df['After Normalization'].sum()

    # Add percentage columns
    df['Percentage Before'] = (df['Before Normalization'] / total_before * 100).round(4)
    df['Percentage After'] = (df['After Normalization'] / total_after * 100).round(4)

    # Sort by count before normalization
    df = df.sort_values(by='Before Normalization', ascending=False)

    # Add a row for totals
    totals_row = pd.DataFrame([{
        'Unicode': 'Total',
        'Symbol': '',
        'Before Normalization': total_before,
        'After Normalization': total_after,
        'Percentage Before': 100,
        'Percentage After': 100
    }])

    # ignore_index renumbers the rows; without it the totals row keeps index label 0 and
    # duplicates the label of one of the symbol rows.
    df = pd.concat([df, totals_row], ignore_index=True)

    # Reformat the table for readability in publication
    publication_table = df[['Unicode', 'Symbol', 'Before Normalization', 'Percentage Before',
                            'After Normalization', 'Percentage After']]

    return df, publication_table

# Calculate additional statistics for analysis
def calculate_apostrophe_statistics(df, warning_count, error_count):
    """Print and return summary statistics for the apostrophe normalization table.

    Args:
        df: Table with 'Unicode', 'Before Normalization', 'After Normalization',
            'Percentage Before' and 'Percentage After' columns. A row whose 'Unicode' value
            is 'Total' is ignored. At least two symbol rows are required, because the two
            most common symbols are reported.
        warning_count: Number of warnings to report.
        error_count: Number of errors to report.

    Returns:
        A dict with the totals, the U+02BC shares before and after, the two most common
        symbols before normalization with their shares, the share of apostrophes that
        disappeared between the two totals, the warning and error counts, and the share of
        non-U+02BC symbols left after normalization. Shares that cannot be computed (zero
        total, or no U+02BC row) are reported as 0.0.
    """
    standard_code = 'U+02BC'

    # Exclude the 'Total' row for these calculations
    df_no_total = df[df['Unicode'] != 'Total']

    def percentage(part, whole):
        """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is zero."""
        return (part / whole) * 100 if whole else 0.0

    def standard_share(column):
        """Return the U+02BC value of ``column``, or 0.0 when the table has no such row."""
        values = df_no_total.loc[df_no_total['Unicode'] == standard_code, column].values
        return values[0] if len(values) else 0.0

    total_before = df_no_total['Before Normalization'].sum()
    total_after = df_no_total['After Normalization'].sum()

    # Standard apostrophe (U+02BC) percentage before and after
    standard_apostrophe_before = standard_share('Percentage Before')
    standard_apostrophe_after = standard_share('Percentage After')

    # Most common apostrophes and their percentages
    top_apostrophes = df_no_total.sort_values(by='Percentage Before', ascending=False).head(2)
    top1_symbol = top_apostrophes.iloc[0]['Unicode']
    top1_percentage = top_apostrophes.iloc[0]['Percentage Before']
    top2_symbol = top_apostrophes.iloc[1]['Unicode']
    top2_percentage = top_apostrophes.iloc[1]['Percentage Before']

    # Estimate of the conversion to quotation marks: the drop in the total symbol count.
    converted_to_quotes = total_before - total_after
    quotes_percentage = percentage(converted_to_quotes, total_before)

    # Symbols other than U+02BC that are still present after normalization.
    unchanged_symbols = df_no_total.loc[df_no_total['Unicode'] != standard_code, 'After Normalization'].sum()
    unchanged_percentage = percentage(unchanged_symbols, total_after)

    # Print statistics in a format suitable for paper text
    print("Apostrophe Normalization Statistics:")
    print(f"Total apostrophes before normalization: {total_before:,}")
    print(f"Total apostrophes after normalization: {total_after:,}")
    print(f"U+02BC percentage before: {standard_apostrophe_before:.2f}%")
    print(f"U+02BC percentage after: {standard_apostrophe_after:.2f}%")
    print(f"Most common apostrophe symbols: {top1_symbol} ({top1_percentage:.1f}%) and {top2_symbol} ({top2_percentage:.1f}%)")
    print(f"Apostrophes converted to quotation marks: ~{quotes_percentage:.2f}%")
    print(f"Warning count: {warning_count:,} (related to non-standard patterns)")
    print(f"Error count: {error_count:,} (related to ambiguity resolution)")
    print(f"Unchanged apostrophes percentage: {unchanged_percentage:.4f}%")

    # Return the statistics as a dictionary for further use
    return {
        'total_before': total_before,
        'total_after': total_after,
        'standard_before_pct': standard_apostrophe_before,
        'standard_after_pct': standard_apostrophe_after,
        'top1_symbol': top1_symbol,
        'top1_percentage': top1_percentage,
        'top2_symbol': top2_symbol,
        'top2_percentage': top2_percentage,
        'converted_to_quotes_pct': quotes_percentage,
        'warning_count': warning_count,
        'error_count': error_count,
        'unchanged_percentage': unchanged_percentage
    }


# Build the apostrophe table from the corpus counters and derive the summary statistics.
df, publication_table = create_apostrophe_results_df(
    before_stats=apostrophe_stats_before_normalization,
    after_stats=apostrophe_stats_after_apostrophe_normalization,
)
stats = calculate_apostrophe_statistics(
    df,
    warning_count=warning_apostrophe_count,
    error_count=error_apostrophe_count,
)

# Stats preview: one "name: value" line per returned statistic.
for key in stats:
    value = stats[key]
    print(f"{key}: {value}")

publication_table

Apostrophe Normalization Statistics:
Total apostrophes before normalization: 12,337,416
Total apostrophes after normalization: 12,310,319
U+02BC percentage before: 1.17%
U+02BC percentage after: 99.94%
Most common apostrophe symbols: U+0027 (60.5%) and U+2019 (38.0%)
Apostrophes converted to quotation marks: ~0.22%
Error count: 10,733 (related to ambiguity resolution)
Unchanged apostrophes percentage: 0.0597%
total_before: 12337416
total_after: 12310319
standard_before_pct: 1.1714
standard_after_pct: 99.9403
top1_symbol: U+0027
top1_percentage: 60.5074
top2_symbol: U+2019
top2_percentage: 38.0461
converted_to_quotes_pct: 0.21963270104534044
error_count: 10733
unchanged_percentage: 0.05971413088482923


Unnamed: 0,Unicode,Symbol,Before Normalization,Percentage Before,After Normalization,Percentage After
0,U+0027,',7465048,60.5074,5519,0.0448
5,U+2019,’,4693904,38.0461,1226,0.01
3,U+02BC,ʼ,144521,1.1714,12302968,99.9403
6,U+0060,`,20895,0.1694,397,0.0032
4,U+2018,‘,12824,0.1039,209,0.0017
1,U+02B9,ʹ,214,0.0017,0,0.0
2,U+02BB,ʻ,10,0.0001,0,0.0
0,Total,,12337416,100.0,12310319,100.0


# 4.4 Quotation Marks Normalization Results

In [33]:
import pandas as pd
import unicodedata

# Assuming these variables already contain the count data from your code
# quotation_marks_stats_before_normalization = {...}
# quotation_marks_stats_after_quotation_marks_normalization = {...}
# warning_quotation_mark_count, error_quotation_mark_count = ...

# Define a function to create a readable symbol description
def get_symbol_description(char):
    """Return ``char`` followed by its Unicode character name in parentheses.

    Args:
        char: The character to describe.

    Returns:
        ``"<char> (<UNICODE NAME>)"``, or ``char`` unchanged when no name can be looked up.
    """
    try:
        name = unicodedata.name(char)
    except (ValueError, TypeError):
        # ValueError: the character has no name assigned.
        # TypeError: the argument is not a single-character string.
        return char
    return f"{char} ({name})"

# Create a dataframe for quotation mark statistics
def create_quotation_results_df(before_stats, after_stats):
    """Build the quotation-mark before/after table, sorted by frequency, with a totals row.

    Args:
        before_stats: Mapping of quotation-mark character to its count before normalization.
        after_stats: Mapping of quotation-mark character to its count after normalization;
            it must contain every key of ``before_stats``.

    Returns:
        A ``(df, publication_table)`` tuple. ``df`` has one row per symbol, ordered by
        descending 'Before Normalization' count, followed by a 'Total' row; percentages are
        rounded to 3 decimals. ``publication_table`` holds the same rows with the columns
        ordered for publication. Rows are indexed 0..n in display order.
    """
    rows = [
        {
            'Unicode': f"U+{ord(symbol):04X}",  # code point as U+XXXX
            'Symbol': symbol,
            'Before Normalization': before_count,
            'After Normalization': after_stats[symbol],
        }
        for symbol, before_count in before_stats.items()
    ]

    df = pd.DataFrame(rows)

    # Calculate total counts
    total_before = df['Before Normalization'].sum()
    total_after = df['After Normalization'].sum()

    # Add percentage columns
    df['Percentage Before'] = (df['Before Normalization'] / total_before * 100).round(3)
    df['Percentage After'] = (df['After Normalization'] / total_after * 100).round(3)

    # Sort by count before normalization
    df = df.sort_values(by='Before Normalization', ascending=False)

    # Add a row for totals
    totals_row = pd.DataFrame([{
        'Unicode': 'Total',
        'Symbol': '',
        'Before Normalization': total_before,
        'After Normalization': total_after,
        'Percentage Before': 100,
        'Percentage After': 100
    }])

    # ignore_index renumbers the rows; without it the totals row keeps index label 0 and
    # duplicates the label of one of the symbol rows.
    df = pd.concat([df, totals_row], ignore_index=True)

    # Reformat the table for readability in publication
    publication_table = df[['Unicode', 'Symbol', 'Before Normalization', 'Percentage Before',
                            'After Normalization', 'Percentage After']]

    return df, publication_table

# Calculate additional statistics for analysis
def calculate_quotation_statistics(df, warning_count, error_count):
    """Print and return summary statistics for the quotation-mark normalization table.

    Args:
        df: Table with 'Unicode', 'Before Normalization' and 'After Normalization' columns.
            A row whose 'Unicode' value is 'Total' is ignored.
        warning_count: Number of warnings to report.
        error_count: Number of errors to report.

    Returns:
        A dict with the totals, the before/after shares of straight quotes (U+0022),
        guillemets (U+00AB, U+00BB) and curly quotes (U+201C, U+201D), the balance
        improvement factors for guillemets and curly quotes, and the warning and error
        counts with their share of the total after normalization. Symbols missing from the
        table count as 0, and shares of a zero total are reported as 0.0.
    """
    before_column = 'Before Normalization'
    after_column = 'After Normalization'

    # Exclude the 'Total' row for these calculations
    df_no_total = df[df['Unicode'] != 'Total']

    def count(codes, column):
        """Sum ``column`` over the rows whose code point is in ``codes`` (0 if none match)."""
        return df_no_total.loc[df_no_total['Unicode'].isin(codes), column].sum()

    def percentage(part, whole):
        """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is zero."""
        return (part / whole) * 100 if whole else 0.0

    def balance(opening, closing):
        """Return |opening / closing - 1| (0 = perfectly paired), or inf with no closing marks."""
        if closing > 0:
            return abs(opening / closing - 1)
        return float('inf')

    def improvement(balance_before, balance_after):
        """Return how many times smaller the imbalance became (values below 1 mean worse)."""
        if balance_before == balance_after:
            # No change, including "perfect before and after" and "undefined before and after".
            return 1.0
        if balance_after == 0:
            return float('inf')
        return balance_before / balance_after

    total_before = df_no_total[before_column].sum()
    total_after = df_no_total[after_column].sum()

    # Group symbols by type and calculate their shares
    guillemet_codes = ['U+00AB', 'U+00BB']
    curly_codes = ['U+201C', 'U+201D']
    straight_codes = ['U+0022']

    guillemets_pct_before = percentage(count(guillemet_codes, before_column), total_before)
    guillemets_pct_after = percentage(count(guillemet_codes, after_column), total_after)

    curly_quotes_pct_before = percentage(count(curly_codes, before_column), total_before)
    curly_quotes_pct_after = percentage(count(curly_codes, after_column), total_after)

    straight_quotes_pct_before = percentage(count(straight_codes, before_column), total_before)
    straight_quotes_pct_after = percentage(count(straight_codes, after_column), total_after)

    # Balance between opening and closing marks, before and after normalization
    guillemets_balance_before = balance(count(['U+00AB'], before_column), count(['U+00BB'], before_column))
    guillemets_balance_after = balance(count(['U+00AB'], after_column), count(['U+00BB'], after_column))
    curly_balance_before = balance(count(['U+201C'], before_column), count(['U+201D'], before_column))
    curly_balance_after = balance(count(['U+201C'], after_column), count(['U+201D'], after_column))

    # Improvement factors
    guillemets_improvement = improvement(guillemets_balance_before, guillemets_balance_after)
    curly_improvement = improvement(curly_balance_before, curly_balance_after)

    warning_percentage = percentage(warning_count, total_after)
    error_percentage = percentage(error_count, total_after)

    # Print statistics in a format suitable for paper text
    print("Quotation Mark Normalization Statistics:")
    print(f"Total quotation marks before normalization: {total_before:,}")
    print(f"Total quotation marks after normalization: {total_after:,}")
    print(f"Straight quotes (U+0022) before: {straight_quotes_pct_before:.2f}%")
    print(f"Guillemets (« ») before: {guillemets_pct_before:.2f}%")
    print(f"Curly quotes (\" \") before: {curly_quotes_pct_before:.2f}%")
    print(f"Straight quotes (U+0022) after: {straight_quotes_pct_after:.2f}%")
    print(f"Guillemets (« ») after: {guillemets_pct_after:.2f}%")
    print(f"Curly quotes (\" \") after: {curly_quotes_pct_after:.2f}%")
    print(f"Guillemets balance improvement: {guillemets_improvement:.2f}x")
    print(f"Curly quotes balance improvement: {curly_improvement:.2f}x")
    print(f"Warning count: {warning_count:,}, {warning_percentage:.2f}% of total quotes after normalization")
    print(f"Error count: {error_count:,} ({error_percentage:.2f}% of total quotes after normalization)")

    # Return the statistics as a dictionary for further use
    return {
        'total_before': total_before,
        'total_after': total_after,
        'straight_quotes_pct_before': straight_quotes_pct_before,
        'guillemets_pct_before': guillemets_pct_before,
        'curly_quotes_pct_before': curly_quotes_pct_before,
        'straight_quotes_pct_after': straight_quotes_pct_after,
        'guillemets_pct_after': guillemets_pct_after,
        'curly_quotes_pct_after': curly_quotes_pct_after,
        'guillemets_improvement': guillemets_improvement,
        'curly_improvement': curly_improvement,
        'warning_count': warning_count,
        'warning_percentage': warning_percentage,
        'error_count': error_count,
        'error_percentage': error_percentage
    }

# Build the quotation-mark table from the corpus counters and derive the summary statistics.
df, publication_table = create_quotation_results_df(
    before_stats=quotation_marks_stats_before_normalization,
    after_stats=quotation_marks_stats_after_quotation_marks_normalization,
)
stats = calculate_quotation_statistics(
    df,
    warning_count=warning_quotation_mark_count,
    error_count=error_quotation_mark_count,
)

# Stats preview: one "name: value" line per returned statistic.
for key in stats:
    value = stats[key]
    print(f"{key}: {value}")

publication_table

Quotation Mark Normalization Statistics:
Total quotation marks before normalization: 56,051,936
Total quotation marks after normalization: 56,076,689
Straight quotes (U+0022) before: 51.74%
Guillemets (« ») before: 44.30%
Curly quotes (" ") before: 3.77%
Straight quotes (U+0022) after: 0.01%
Guillemets (« ») after: 95.82%
Curly quotes (" ") after: 4.16%
Guillemets balance improvement: 6.40x
Curly quotes balance improvement: 51.96x
Error count: 351 (0.00% of total quotes after normalization)
total_before: 56051936
total_after: 56076689
straight_quotes_pct_before: 51.74266237655021
guillemets_pct_before: 44.29581343987833
curly_quotes_pct_before: 3.7666638312011202
straight_quotes_pct_after: 0.013873857638064188
guillemets_pct_after: 95.82446638388369
curly_quotes_pct_after: 4.161659758478251
guillemets_improvement: 6.402931671576889
curly_improvement: 51.95758609453572
error_count: 351
error_percentage: 0.0006259285386838728


Unnamed: 0,Unicode,Symbol,Before Normalization,Percentage Before,After Normalization,Percentage After
0,U+0022,"""",29002764,51.743,7780,0.014
1,U+00AB,«,12495485,22.293,26895177,47.961
2,U+00BB,»,12333176,22.003,26840011,47.863
4,U+201D,”,1065203,1.9,1166659,2.08
3,U+201C,“,1046085,1.866,1167062,2.081
6,U+201E,„,109199,0.195,0,0.0
5,U+201F,‟,16,0.0,0,0.0
7,U+275D,❝,4,0.0,0,0.0
8,U+275E,❞,4,0.0,0,0.0
0,Total,,56051936,100.0,56076689,100.0
