In [1]:
from static_huff import static_huffman
from adaptive_huff import adaptive_huffman
from bitarray import bitarray

import static_huff
import adaptive_huff

## Static Huffman Showcase

In [2]:
static_huffman("abracadabra")

-----HUFFMAN TREE-----
#11
 0 -> #5 => a
 1 -> #6
  0 -> #2
   0 -> #1 => c
   1 -> #1 => d
  1 -> #4
   0 -> #2 => r
   1 -> #2 => b

In [3]:
text = "abracadabra"
encoded, root = static_huff.encode(text)
decoded = static_huff.decode(encoded, root)

print(decoded)

abracadabra


## Adaptive huffman showcase

In [4]:
adaptive_huff.adaptive_huffman(text, N = 11)

-----HUFFMAN TREE-----
#{ W=11, N=11 }
 0 -> #{ W=5, N=9 } => a
 1 -> #{ W=6, N=10 }
  0 -> #{ W=2, N=7 }
   0 -> #{ W=1, N=3 }
    0 -> #{ W=0, N=1 } => NYT
    1 -> #{ W=1, N=2 } => d
   1 -> #{ W=1, N=4 } => c
  1 -> #{ W=4, N=8 }
   0 -> #{ W=2, N=5 } => r
   1 -> #{ W=2, N=6 } => b

In [5]:
bits = adaptive_huff.encode(text)
decoded = adaptive_huff.decode(bits)

print(decoded)

abracadabra


## Generating test files

In [37]:
import string
import random
import os
import timeit
import pandas as pd

def get_size(path):
    return os.path.getsize(path) / (2 ** 10)

def rand_char():
    return random.choice(string.ascii_lowercase)

def generate_txt_file(filename, size):
    characters = size
    characters_per_line = 100
    with open(filename, "w", encoding = 'ascii') as file:
        counter = 1
        while counter < (characters - 1):
            line = ''
            while ((counter % characters_per_line) != 0) and counter < (characters - 1):
                counter += 1
                line += rand_char()    

            line += '\n'
            counter += 1
            file.write(line)


test_files = ['./resources/file1.txt', './resources/file2.txt', './resources/file3.txt', './resources/file4.txt']

compress_to_static_huff = ['./resources/file1.shf', './resources/file2.shf', './resources/file3.shf', './resources/file4.shf']

decompress_to_static_huff = ['./resources/file1_shf.txt', './resources/file2_shf.txt', './resources/file3_shf.txt', './resources/file4_shf.txt']

compress_to_adaptive_huff = ['./resources/file1.ahf', './resources/file2.ahf', './resources/file3.ahf', './resources/file4.ahf']

decompress_to_adaptive_huff = ['./resources/file1_ahf.txt', './resources/file2_ahf.txt', './resources/file3_ahf.txt', './resources/file4_ahf.txt']

for i, file in enumerate(test_files):
    generate_txt_file(file, (2 ** 10) * (10 ** i))

## Code for test suites

In [38]:
def test_suite_static_huffman():
    columns = ['Size [kB]', 'Compression Time [s]', 'Compressed Size [kB]', 'Compression Rate [%]', 'Decompression Time [s]']
    df = pd.DataFrame(columns = columns)
    for i in range(4):
        row = test_static_huff(i, columns)
        df = df.append(row, ignore_index = False)

    return df

def test_static_huff(i, columns):
    in_file = test_files[i]
    in_file_size = get_size(in_file)

    compress_to = compress_to_static_huff[i]
    decompress_to = decompress_to_static_huff[i]

    def compress_closure():
        return static_huff.compress_file(in_file, compress_to)
    
    compr_time = timeit.timeit(compress_closure, number=1)
    compr_size = get_size(compress_to)
    compr_rate = (compr_size / in_file_size) * 100

    def decompress_closure():
        return static_huff.decompress_file(compress_to, decompress_to)

    decompr_time = timeit.timeit(decompress_closure, number=1)

    data = {
        columns[0]: in_file_size,
        columns[1]: compr_time,
        columns[2]: compr_size,
        columns[3]: compr_rate,
        columns[4]: decompr_time
    }
    
    return pd.Series(data = data, name = '\u2713')

In [39]:
def test_suite_adaptive_huffman():
    columns = ['Size [kB]', 'Compression Time [s]', 'Compressed Size [kB]', 'Compression Rate [%]', 'Decompression Time [s]']
    df = pd.DataFrame(columns = columns)
    for i in range(4):
        row = test_adaptive_huff(i, columns)
        df = df.append(row, ignore_index = False)

    return df

def test_adaptive_huff(i, columns):
    in_file = test_files[i]
    in_file_size = get_size(in_file)

    compress_to = compress_to_adaptive_huff[i]
    decompress_to = decompress_to_adaptive_huff[i]

    def compress_closure():
        return adaptive_huff.compress_file(in_file, compress_to)
    
    compr_time = timeit.timeit(compress_closure, number=1)
    compr_size = get_size(compress_to)
    compr_rate = (compr_size / in_file_size) * 100

    def decompress_closure():
        return adaptive_huff.decompress_file(compress_to, decompress_to)

    decompr_time = timeit.timeit(decompress_closure, number=1)

    data = {
        columns[0]: in_file_size,
        columns[1]: compr_time,
        columns[2]: compr_size,
        columns[3]: compr_rate,
        columns[4]: decompr_time
    }
    
    return pd.Series(data = data, name = '\u2713')

## Observations & Notes
    - Binary file format descriptions are in source files. I figured it will make more sense with code beneath.
    - Difference in compression rate for static huffman test between 1kB and more
      comes from somewhat inefficient binary format (which comes from the fact that
      python allows us to write (one or more) byte chunks to file - of course there is a way to go around it, but I did not want to clutter implementation) for code table.
      This cost becomes negligable when actual encoding becomes bigger (files with sizes grater than 1kB).
    - We can see that compression rate approaches a limit of roughly 59% when files get bigger.
    - When input file contains characters with similar distribution (which is the case because I generated file by choosing random characters from alphabet),
      adaptive huffman requires a lot of swapping which adds a lot of overhead and as result running time is quite substantial.
      It would seem as reading a file twice is not that big problem. But there are other situation where such compression might be useful e.g live streaming.
    - We would expect that adaptive huffman's compression rate will be higher (because we do not have information about overall frequencies), but again
      because characters have roughly the same distribution, both static and adaptive apporach the same compressionn rate. 

In [40]:
test_suite_static_huffman()

Unnamed: 0,Size [kB],Compression Time [s],Compressed Size [kB],Compression Rate [%],Decompression Time [s]
✓,1.009766,0.015093,0.680664,67.408124,0.013746
✓,10.099609,0.016032,6.084961,60.249468,0.033462
✓,100.999023,0.057797,60.15332,59.558319,0.202704
✓,1009.999023,0.346723,601.146484,59.519511,1.658021


In [41]:
test_suite_adaptive_huffman()

Unnamed: 0,Size [kB],Compression Time [s],Compressed Size [kB],Compression Rate [%],Decompression Time [s]
✓,1.009766,0.013316,0.634766,62.862669,0.02731
✓,10.099609,0.149527,6.067383,60.075421,0.122545
✓,100.999023,1.003829,60.637695,60.037903,1.124813
✓,1009.999023,10.506341,602.500977,59.65362,11.225053
