In [1]:
# Path of the input text file that the cells below read and analyse
filename = "girkin_crying.txt"

### Text reading and preprocessing

In [2]:
def get_text(_filename):
    """Read an entire text file and return its contents as a string.

    Args:
        _filename: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        The full contents of the file as a single ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even if read() raises.
    with open(_filename, "r", encoding='utf-8') as f:
        return f.read()

def transform_symbol(_c):
    """Normalize one symbol to the reduced Russian alphabet used for counting.

    Mapping:
        * lowercase 'а'..'я'  -> returned unchanged
        * uppercase 'А'..'Я'  -> lowercased
        * 'Ё' / 'ё'           -> 'е'
        * anything else       -> a single space
    """
    if 'а' <= _c <= 'я':
        return _c
    if 'А' <= _c <= 'Я':
        return _c.lower()
    # 'Ё' and 'ё' lie outside both contiguous ranges above, so they are
    # handled explicitly and folded into 'е'.
    if _c in ('Ё', 'ё'):
        return 'е'
    return ' '
    
def preprocess_text(_text):
    """Normalize raw text for frequency analysis.

    Every character of ``_text`` is passed through ``transform_symbol``;
    the result is then split on whitespace and re-joined with single
    spaces, which collapses runs of spaces and drops leading/trailing ones.

    Args:
        _text: The raw text to normalize.

    Returns:
        The normalized text as a ``str``.
    """
    # Change symbols according to requirements.
    # str.join builds the result in one pass instead of repeated `+=`.
    text_formatted = "".join(transform_symbol(c) for c in _text)

    # Remove consecutive spaces
    return ' '.join(text_formatted.split())

In [3]:
# Read the input file and normalize it with preprocess_text; `text` is the
# string that all the counting cells below operate on.
text = preprocess_text(get_text(filename))

### Text processing (single-character counts and bigram counts)

In [4]:
def count_chars(_text):
    """Count how many times each symbol occurs in ``_text``.

    Returns:
        A dict mapping symbol -> number of occurrences, with the keys
        inserted in sorted order.
    """
    tally = {}
    for symbol in _text:
        tally[symbol] = tally.get(symbol, 0) + 1

    # Re-insert the entries in sorted key order so iteration is alphabetical.
    ordered = {}
    for symbol in sorted(tally):
        ordered[symbol] = tally[symbol]
    return ordered

# Bigrams with intersection, i.e. overlapping pairs (ex: [1, 2], [2, 3], [3, 4])
def count_bigrams_w_i(_text):
    """Count overlapping bigrams: every pair of adjacent symbols in ``_text``.

    For "abcd" the counted bigrams are "ab", "bc", "cd".

    Args:
        _text: The string to scan.

    Returns:
        A dict mapping bigram -> number of occurrences, with keys inserted
        in sorted order. Texts shorter than two symbols (including the
        empty string) yield an empty dict.
    """
    b_count = {}
    # zip pairs each symbol with its successor; it produces nothing for
    # empty or one-symbol input, so no index lookup can fail.
    for first, second in zip(_text, _text[1:]):
        bg = first + second
        b_count[bg] = b_count.get(bg, 0) + 1

    return dict(sorted(b_count.items()))

# Bigrams without intersection, i.e. non-overlapping pairs (ex: [1, 2], [3, 4])
def count_bigrams_wo_i(_text):
    """Count non-overlapping bigrams: symbols (0,1), (2,3), (4,5), ...

    A trailing unpaired symbol (odd-length text) is ignored.

    Returns:
        A dict mapping bigram -> number of occurrences, with keys inserted
        in sorted order.
    """
    tally = {}
    # `start` walks the even positions that still have a right-hand neighbour.
    for start in range(0, len(_text) - 1, 2):
        pair = _text[start] + _text[start + 1]
        tally[pair] = tally.get(pair, 0) + 1

    return dict(sorted(tally.items()))

In [5]:
# Symbol counts for the whole text (the space symbol included).
chars_freq_wspaces = count_chars(text)
# Same counts without the space symbol. pop() with a default is used instead
# of `del` so that a text containing no spaces does not raise KeyError.
chars_freq_wospaces = chars_freq_wspaces.copy()
chars_freq_wospaces.pop(' ', None)

print(chars_freq_wspaces)

# Bigram counts: overlapping pairs and non-overlapping pairs respectively.
bigrams_freq_w_intersect = count_bigrams_w_i(text)
bigrams_freq_wo_intersect = count_bigrams_wo_i(text)

{' ': 86793, 'а': 37904, 'б': 8174, 'в': 22994, 'г': 7676, 'д': 14226, 'е': 43426, 'ж': 4720, 'з': 7408, 'и': 37250, 'й': 6380, 'к': 16426, 'л': 18174, 'м': 15420, 'н': 36030, 'о': 56742, 'п': 15740, 'р': 24918, 'с': 26566, 'т': 31194, 'у': 12712, 'ф': 1612, 'х': 5356, 'ц': 2422, 'ч': 6828, 'ш': 3240, 'щ': 1846, 'ъ': 164, 'ы': 8952, 'ь': 7650, 'э': 1236, 'ю': 3406, 'я': 9048}


### Show symbol frequencies

In [6]:
# Print one "symbol : count" line per entry, in the dictionary's key order
# (count_chars inserts keys sorted, so the space symbol comes first).
for k, v in chars_freq_wspaces.items():
    print(f"{k} : {v}")

  : 86793
а : 37904
б : 8174
в : 22994
г : 7676
д : 14226
е : 43426
ж : 4720
з : 7408
и : 37250
й : 6380
к : 16426
л : 18174
м : 15420
н : 36030
о : 56742
п : 15740
р : 24918
с : 26566
т : 31194
у : 12712
ф : 1612
х : 5356
ц : 2422
ч : 6828
ш : 3240
щ : 1846
ъ : 164
ы : 8952
ь : 7650
э : 1236
ю : 3406
я : 9048


### Calculate $H_1$ and $H_2$

In [7]:
import math

# H1: Shannon entropy (in bits) of the single-symbol distribution,
# computed from the counts that include the space symbol.
char_amount = sum(chars_freq_wspaces.values())
t1 = [count / char_amount for count in chars_freq_wspaces.values()]
H1 = -sum(p * math.log2(p) for p in t1)

# H2: Shannon entropy of the overlapping-bigram distribution, divided by 2
# because each bigram spans two symbols.
bg_amount = sum(bigrams_freq_w_intersect.values())
t2 = [count / bg_amount for count in bigrams_freq_w_intersect.values()]
H2 = -sum(p * math.log2(p) for p in t2) / 2

print(f"H1 = {H1}\nH2 = {H2}")

H1 = 4.385129362944809
H2 = 3.9881215496418245
