imports

In [79]:
import random
random.seed(202)
from bitarray import bitarray, bits2bytes
import pandas as pd
import plotly.express as px
import math


In [2]:
with open('../Jack_London_-_The_Sea_Wolf_ascii.txt', 'r') as file:
    text = file.read()


In [3]:
# sorted(list(set(text)))


In [4]:
# Implementation based on https://www.cs.helsinki.fi/u/tpkarkka/publications/jacm05-revised.pdf and https://mailund.dk/posts/skew-python-go/

import numpy as np
import numba
from typing import Tuple


@numba.jit()
def merge(x: np.array, SA12: np.array, SA3: np.array) -> np.array:
    "Merge the suffixes in sorted SA12 and SA3."
    ISA = np.zeros((len(x),), dtype='int')
    for i in range(len(SA12)):
        ISA[SA12[i]] = i
    SA = np.zeros((len(x),), dtype='int')
    idx = 0
    i, j = 0, 0
    while i < len(SA12) and j < len(SA3):
        if less(x, SA12[i], SA3[j], ISA):
            SA[idx] = SA12[i]
            idx += 1
            i += 1
        else:
            SA[idx] = SA3[j]
            idx += 1
            j += 1
    if i < len(SA12):
        SA[idx:len(SA)] = SA12[i:]
    elif j < len(SA3):
        SA[idx:len(SA)] = SA3[j:]
    return SA


@numba.jit()
def u_idx(i: int, m: int) -> int:
    "Map indices in u back to indices in the original string."
    if i < m:
        return 1 + 3 * i
    else:
        return 2 + 3 * (i - m - 1)


@numba.jit()
def safe_idx(x: np.array, i: int) -> int:
    "Hack to get zero if we index beyond the end."
    return 0 if i >= len(x) else x[i]


@numba.jit()
def symbcount(x: np.array, asize: int) -> np.array:
    "Count how often we see each character in the alphabet."
    counts = np.zeros((asize,), dtype="int")
    for c in x:
        counts[c] += 1
    return counts


@numba.jit()
def cumsum(counts: np.array) -> np.array:
    "Compute the cumulative sum from the character count."
    res = np.zeros((len(counts, )), dtype='int')
    acc = 0
    for i, k in enumerate(counts):
        res[i] = acc
        acc += k
    return res


@numba.jit()
def bucket_sort(x: np.array, asize: int,
                idx: np.array, offset: int = 0) -> np.array:
    "Sort indices in idx according to x[i + offset]."
    sort_symbs = np.array([safe_idx(x, i + offset) for i in idx])
    counts = symbcount(sort_symbs, asize)
    buckets = cumsum(counts)
    out = np.zeros((len(idx),), dtype='int')
    for i in idx:
        bucket = safe_idx(x, i + offset)
        out[buckets[bucket]] = i
        buckets[bucket] += 1
    return out


@numba.jit()
def radix3(x: np.array, asize: int, idx: np.array) -> np.array:
    "Sort indices in idx according to their first three letters in x."
    idx = bucket_sort(x, asize, idx, 2)
    idx = bucket_sort(x, asize, idx, 1)
    return bucket_sort(x, asize, idx)


@numba.jit()
def triplet(x: np.array, i: int) -> Tuple[int, int, int]:
    "Extract the triplet (x[i],x[i+1],x[i+2])."
    return safe_idx(x, i), safe_idx(x, i + 1), safe_idx(x, i + 2)


@numba.jit()
def collect_alphabet(x: np.array, idx: np.array) -> Tuple[np.array, int]:
    "Map the triplets starting at idx to a new alphabet."
    alpha = np.zeros((len(x),), dtype='int')
    value = 1
    last_trip = -1, -1, -1
    for i in idx:
        trip = triplet(x, i)
        if trip != last_trip:
            value += 1
            last_trip = trip
        alpha[i] = value
    return alpha, value - 1


@numba.jit()
def build_u(x: np.array, alpha: np.array) -> np.array:
    "Construct u string, using 1 as central sentinel."
    a = np.array([alpha[i] for i in range(1, len(x), 3)] +
                 [1] +
                 [alpha[i] for i in range(2, len(x), 3)])
    return a


@numba.jit()
def less(x: np.array, i: int, j: int, ISA: np.array) -> bool:
    "Check if x[i:] < x[j:] using the inverse suffix array for SA12."
    a: int = safe_idx(x, i)
    b: int = safe_idx(x, j)
    if a < b:
        return True
    if a > b:
        return False
    if i % 3 != 0 and j % 3 != 0:
        return ISA[i] < ISA[j]
    return less(x, i + 1, j + 1, ISA)


@numba.jit()
def skew_rec(x: np.array, asize: int) -> np.array:
    "skew/DC3 SA construction algorithm."

    SA12 = np.array([i for i in range(len(x)) if i % 3 != 0])

    SA12 = radix3(x, asize, SA12)
    new_alpha, new_asize = collect_alphabet(x, SA12)
    if new_asize < len(SA12):
        # Recursively sort SA12
        u = build_u(x, new_alpha)
        sa_u = skew_rec(u, new_asize + 2)
        m = len(sa_u) // 2
        SA12 = np.array([u_idx(i, m) for i in sa_u if i != m])

    if len(x) % 3 == 1:
        SA3 = np.array([len(x) - 1] + [i - 1 for i in SA12 if i % 3 == 1])
    else:
        SA3 = np.array([i - 1 for i in SA12 if i % 3 == 1])
    SA3 = bucket_sort(x, asize, SA3)
    return merge(x, SA12, SA3)


def get_suffix_array(x: str) -> np.array:
    if "$" in x:
        raise ValueError('Text should not contain $')
    str_to_int = {
        "$": 0,  # End of strig
    }
    str_to_int = str_to_int | {
        c: n+1
        for (n, c) in enumerate(sorted(list(set(x))))
    }
    return skew_rec(np.array([str_to_int[y] for y in x]), len(str_to_int))


In [5]:
def print_sa(sa, t):
    for x in sa:
        print(t[x:]+t[0:x])
    print('-'*10)


dna_string = 'AGCTN4ACTGN'
suffix_array = get_suffix_array(dna_string)
print_sa(suffix_array, dna_string)


4ACTGNAGCTN
ACTGNAGCTN4
AGCTN4ACTGN
CTGNAGCTN4A
CTN4ACTGNAG
GCTN4ACTGNA
GNAGCTN4ACT
NAGCTN4ACTG
N4ACTGNAGCT
TGNAGCTN4AC
TN4ACTGNAGC
----------


In [6]:
def get_sort_canon_repr(s):
    """Returns cononical representation of sort by string s
    e.g. [3,1,0,2]"""
    sort_info = [None]*len(s)
    for new_place, (c, old_place) in enumerate(sorted([(c, i) for i, c
                                                       in enumerate(s)])):
        sort_info[old_place] = new_place
    return sort_info


def apply_permutation(s, perm):
    res = [None]*len(s)
    for old_place, new_place in enumerate(perm):
        res[new_place] = s[old_place]
    return res


def inverse_permutation(canon_repr):
    res = [None]*len(canon_repr)
    for old_place, new_place in enumerate(canon_repr):
        res[new_place] = old_place
    return res


ban_bwt = 'а#ннБннБаааа'
inverse_permutation(get_sort_canon_repr(ban_bwt))


[1, 4, 7, 0, 8, 9, 10, 11, 2, 3, 5, 6]

In [7]:
SHARP = '#'


class BWT:
    def encode(t: str):
        if SHARP not in t:
            raise ValueError(f"{SHARP}  is not found in text")
        bwt = [None]*len(t)
        sa = get_suffix_array(t)
        # print_sa(sa, t)
        for i in range(len(t)):
            bwt[i] = t[sa[i]-1]
        return ''.join(bwt)

    def decode(bwt: str):
        sigma = get_sort_canon_repr(bwt)
        inversed_sigma = inverse_permutation(sigma)
        res = [None]*len(bwt)
        i = bwt.index(SHARP)
        index_in_first_col = inversed_sigma[i]
        for j, c in enumerate(bwt):
            res[j] = bwt[index_in_first_col]
            index_in_first_col = inversed_sigma[index_in_first_col]
        return ''.join(res)


def _shift(alphabet, up, lo):
    for i in range(lo, up-1, -1):
        alphabet[i+1] = alphabet[i]
    return alphabet


class mtf:
    def get_alphabet():
        # return list('#Ban')
        return [chr(i) for i in range(ord('z')+1)]

    def update_alphabet(alphabet, ind, c):
        if ind > 1:
            _shift(alphabet, 1, ind-1)
            alphabet[1] = c
        if ind == 1:
            alphabet[1] = alphabet[0]
            alphabet[0] = c

    def encode(t: str):
        alphabet = mtf.get_alphabet()
        diff = set(t)-set(alphabet)
        if diff:
            raise ValueError(
                f'Found chars in text that are not presented in alphabet: {diff}')
        res = []
        for c in t:
            ind = alphabet.index(c)
            res.append(ind)
            mtf.update_alphabet(alphabet, ind, c)

        return res

    def decode(encoded):
        alphabet = mtf.get_alphabet()
        res = []
        for ind in encoded:
            c = alphabet[ind]
            res.append(c)
            mtf.update_alphabet(alphabet, ind, c)
        return ''.join(res)


class rle:
    def encode(ar):
        res = []
        cntrs = []
        prev_is_zero = False
        for x in ar:
            if x != 0:
                res.append(x)
                prev_is_zero = False
                continue
            if prev_is_zero:
                cntrs[-1] += 1
            else:
                res.append(0)
                cntrs.append(1)
            prev_is_zero = True
        # print(f"max in cntrs = {max(cntrs)}")
        return res, cntrs

    def decode(rle_encoded, cntrs):
        res = []
        for x in rle_encoded:
            if x != 0:
                res.append(x)
                continue
            res.extend([0]*cntrs.pop(0))
        return res


In [8]:
rle.decode(*rle.encode([0, 0, 0, 4, 5, 0, 0, 7, 0]))


[0, 0, 0, 4, 5, 0, 0, 7, 0]

In [9]:
mtf.encode('a#nnBnnBaaaa')


[97, 36, 110, 1, 68, 0, 0, 1, 4, 1, 0, 0]

In [10]:
eng_example = 'BanBanaxxxxxxxxnana#'  # 'БанБананана#'
mtf_encoded = mtf.encode(BWT.encode(eng_example))
mtf.decode(mtf_encoded)


'a#nnBnBnaaxaxxxxxxxa'

In [11]:
BWT.decode(mtf.decode(mtf.encode(BWT.encode(text[:10000]+SHARP))))


'  Table of Contents \n \n  About London: \n \n\t Jack London (January 12, 1876 - November 22, 1916), was an American author who wrote The Call of the Wild and other books. A pioneer in the then-burgeoning world of commercial magazine fiction, he was one of the first Americans to make a huge financial success from writing. Source: Wikipedia \n\n  Also available on Feedbooks for \t The Call of the Wild (1903)\n \t The Little Lady of the Big House (1916)\n \t White Fang (1906)\n \t The Road (1907)\n \t The Son of the Wolf (1900)\n \t Before Adam (1907)\n \t The Scarlet Plague (1912)\n \t The Game (1905)\n \t South Sea Tales (1911)\n \t The Iron Heel (1908)\n \n \n  Copyright: \n This work is available for countries where copyright is Life+70 and in the USA. \n  Note: \n This book is brought to you by Feedbooks\n http://www.feedbooks.com\n Strictly for personal use, do not use this file for commercial purposes.   Chapter 1 \n\n\t_\n \n\n\n \tI scarcely know where to begin, though I someti

In [12]:
len(text)


577875

In [13]:
part_text = text
rle_compressed, cntrs = rle.encode(mtf.encode(BWT.encode(part_text+SHARP)))
(len(rle_compressed) + len(cntrs))/len(part_text)


0.6981630975556997

Пример с нулями на выходе mtf

In [14]:
# mtf.encode(BWT.encode('BannanananananananananaBananana'+SHARP))


In [15]:
import random
import string


def random_subtext(text, length):
    i = random.randrange(0, len(text) - length - 2)
    return text[i:i+length]


def randomword(length):
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))


def test_bwt_encode_decode():
    for i in range(1000):
        w = random_subtext(text, 1000)+SHARP
        # print(w)
        assert BWT.decode(BWT.encode(w)) == w


def test_mtf():
    for i in range(1000):
        w = random_subtext(text, 1000)+SHARP
        # print(w)
        assert mtf.decode(mtf.encode(w)) == w


def test_rle():
    for i in range(100):
        w = random_subtext(text, 1000)+SHARP
        ar = mtf.encode(BWT.encode(w))
        # print(ar)
        assert rle.decode(*rle.encode(ar)) == ar


# test_bwt_encode_decode()
# test_mtf()
test_rle()
ban = 'БанБананана#'
# BWT.decode(BWT.encode(ban))
# BWT.encode('БанБананана')


In [16]:
from itertools import permutations


def test_permutation():
    a = list('abbced')
    for perm in list(permutations(list(range(len(a)))))[:100]:
        assert a == apply_permutation(
            apply_permutation(a, perm), inverse_permutation(perm))


test_permutation()


In [17]:
# Huffman Coding in python

string = 'BCAADDDCCACACAC'


# Creating tree nodes
class NodeTree(object):

    def __init__(self, left=None, right=None):
        self.left = left
        self.right = right

    def children(self):
        return (self.left, self.right)

    def nodes(self):
        return (self.left, self.right)

    def __str__(self):
        return '%s_%s' % (self.left, self.right)


# Main function implementing huffman coding
def huffman_code_tree(node, left=True, binString=''):
    if type(node) is str:
        return {node: binString}
    (l, r) = node.children()
    d = dict()
    d.update(huffman_code_tree(l, True, binString + '0'))
    d.update(huffman_code_tree(r, False, binString + '1'))
    return d


# Calculating frequency
freq = {}
for c in string:
    if c in freq:
        freq[c] += 1
    else:
        freq[c] = 1

freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)

nodes = freq

while len(nodes) > 1:
    (key1, c1) = nodes[-1]
    (key2, c2) = nodes[-2]
    nodes = nodes[:-2]
    node = NodeTree(key1, key2)
    nodes.append((node, c1 + c2))

    nodes = sorted(nodes, key=lambda x: x[1], reverse=True)

huffmanCode = huffman_code_tree(nodes[0][0])

print(' Char | Huffman code ')
print('----------------------')
for (char, frequency) in freq:
    print(' %-4r |%12s' % (char, huffmanCode[char]))


 Char | Huffman code 
----------------------
 'C'  |           0
 'A'  |          11
 'D'  |         101
 'B'  |         100


In [18]:
huffmanCode


{'C': '0', 'B': '100', 'D': '101', 'A': '11'}

In [19]:
from adaptive import AdaptiveHuffman
# content = bytes('xx', 'utf-8')
content = bytes([12, 12, 12, 14, 15, 16])
print(f'content = {content}')
ada_huff = AdaptiveHuffman(content)
code = ada_huff.encode()
print(f"len code = {code}")
add_huff_decoder = AdaptiveHuffman(code)
add_huff_decoder.decode()


content = b'\x0c\x0c\x0c\x0e\x0f\x10'
len code = bitarray('1010000110011000001110000000111100000010000')


[12, 12, 12, 14, 15, 16]

In [20]:
class Huffman:
    def encode(l: list):
        ada_huff = AdaptiveHuffman(bytes(l))
        return ada_huff.encode()

    def decode(code: bitarray):
        add_huff_decoder = AdaptiveHuffman(code)
        return add_huff_decoder.decode()
    

def test_huffman():
    for i in range(10):
        l = [random.randint(3,9) for _ in range(10)]
        assert l == Huffman.decode(Huffman.encode(l))

test_huffman()

Archiver

In [183]:
def gamma_code(number_bits):
    number = int(''.join(map(str, number_bits)), 2)
    # number += 1  # кодируем число на 1 больше чтобы могли подавать нули
    i = math.floor(math.log(number, 2))
    return bitarray([0]*i + [int(c) for c in "{0:b}".format(number)[-(i+1):]])


def delta_code(number_bits):
    number = int(''.join(map(str, number_bits)), 2)
    i = math.floor(math.log(number, 2))
    try:
        diap = gamma_code("{0:b}".format(i+1))
    except:
        raise ValueError(i)
    return diap+bitarray([int(c) for c in "{0:b}".format(number)[-(i+1):]])


def list_int_to(l, func_code):
    res = bitarray()
    for c in l:
        res.extend(func_code("{0:b}".format(c)))
    return res

In [190]:


def gamma_to_list_int(ar: bitarray):
    j = 0
    res = []
    zeros_cnt = 0
    while j < len(ar):
        if ar[j] == 0:
            zeros_cnt += 1
            j += 1
            continue
        number_bits = ar[j:j+zeros_cnt+1]
        number = int(number_bits.to01(), 2)
        res.append(number)
        j = j + zeros_cnt+1
        zeros_cnt = 0
    return res


def list_int_to_gamma(l):
    res = bitarray()
    for c in l:
        gamma_coded = gamma_code("{0:b}".format(c))
        res.extend(gamma_coded)
    return res


def overflow_code(x, scheme):
    """scheme == [4, 8, 8] ~ 4 + 8 + 8"""
    exp = scheme[0]
    upper_bound = 2**exp - 2
    if 0 <= x <= upper_bound:
        return "{0:b}".format(x).rjust(exp, '0')
    ones_cnt = exp  # кол-во единиц перед началом числа
    for exp in scheme[1:]:
        prev_upper_bound = upper_bound
        upper_bound = upper_bound + 2**exp - 1
        if x <= upper_bound:
            return '1'*ones_cnt + "{0:b}".format(x-(prev_upper_bound+1)).rjust(exp, '0')
        ones_cnt += exp
    raise ValueError(f"Scheme {scheme} is not enough to code {x}")


def overflow_array(l):
    res = bitarray()
    for x in l:
        res.extend(overflow_code(x, [2, 4, 16]))
    return res


def get_header(x):
    return bitarray("{0:b}".format(x).rjust(20, '0'))


# gamma_code('1111111010')
# len(list_int_to_gamma(cntrs)), len(cntrs)
# # get_header(16_000)
# overflow_code(2200, [2,4,16])
# overflow_array([2,4])

def test_gamma():
    for i in range(10):
        l = [random.randint(3, 9) for _ in range(100)]
        assert l == gamma_to_list_int(list_int_to_gamma(l))

test_gamma()


In [156]:
len(global_cntrs)

72229

In [157]:
tiny_cntrs = [1]*73000
len(list_int_to(tiny_cntrs, delta_code)), len(list_int_to(tiny_cntrs, gamma_code))

(146000, 73000)

In [158]:
# rle_compressed, cntrs = rle.encode(mtf.encode(BWT.encode(part_text+SHARP)))
from functools import partial
global_cntrs = None
class Archiver:
    def encode(text):
        if '$' in text:
            raise ValueError('Text should not contain $')
        if '#' in text:
            raise ValueError('Text should not contain #')
        t = text + SHARP
        rle_compressed, cntrs = rle.encode(mtf.encode(BWT.encode(t)))
        huffman_compressed = Huffman.encode(rle_compressed)
        header = get_header(len(huffman_compressed))
        # cntrs_compressed = list_int_to_gamma(cntrs)
        global global_cntrs
        global_cntrs = cntrs
        # cntrs_compressed = list_int_to(cntrs, delta_code)
        cntrs_compressed = overflow_array(cntrs)
        # fig = px.histogram(cntrs)
        # fig.show()
        return header+huffman_compressed+cntrs_compressed
        # return mtf.encode(BWT.encode(t))

    def decode(huffman_encoded: list, cntrs):
        rle_compressed = Huffman.decode(huffman_encoded)
        mtf_encoded = rle.decode(rle_compressed, cntrs)
        return BWT.decode(mtf.decode(mtf_encoded))[:-1]


In [159]:
tiny_text = text#[:1_000]#'ababcabca'
bitar = Archiver.encode(tiny_text)
# decompressed = Archiver.decode(*)
# cntrs
(len(bitar)/8)/len(tiny_text)


0.31092212848799483

In [117]:
np.log(2*8*(1024)**2)

16.635532333438686

In [66]:
import re
max([len(x) for x in re.findall(r'0+', str(bitar))])

24

In [61]:
# re.findall(r'(01){2}', str(bitar))

In [133]:
# fig = px.histogram(cntrs)
# fig.show()
# # pd.Series(cntrs).hist()

In [134]:
pd.Series(global_cntrs).quantile(0.8)

4.0

In [163]:
max(global_cntrs)

2157