## Laboratorium 2


### Huffman's static algorithm

In [285]:
import time
import heapq
from bitarray import bitarray
from bitarray.util import ba2int, int2ba
import numpy as np
import os

Algorytm do utworzenia drzewa dla skompresowania danych - Huffman statyczny 

In [313]:
BITS_IN_BYTE = 8
BIT = 1
ONE = bitarray("00")
TWO = bitarray("01")
THREE = bitarray("10")
FOUR = bitarray("11")

class Node():
    def __init__(self, value, char = None, parent = None,
                  left = None, right = None ):
        self.left = left
        self.right = right
        self.value = value
        self.char = char
        self.huff = ''
        self.parent = parent

    def  __lt__(self, nxt):
        return self.value < nxt.value

    def __str__(self) -> str:
        return "{} {}".format(str(self.value),str(self.huff))


class StaticHuffmanTree():

    def __init__(self, filename):
        self.root = None
        self.filename = filename
        self.text = ""
        self.char_freq = dict()
        self.prefix_code = dict()
        self.bits_to_save = bitarray("10000000")
        self.bit_counter = 0


    def createHuffmanTree(self):
        self.__readAllFile()
        self.__loadChars()
        
        Q = []
        for item in self.char_freq.items():
            heapq.heappush(Q, (item[1], Node(item[1], item[0])))

        n = len(Q)
        for _ in range(n-1):
            left = heapq.heappop(Q)
            left[1].huff = 0b0
            right = heapq.heappop(Q)
            right[1].huff = 0b1
            value = left[0] + right[0]
            z = Node(value, left=left[1], right=right[1])
            heapq.heappush(Q, (value, z))
            
        self.root = heapq.heappop(Q)[1]
        
        self.__createHuffmanCode(self.root, bitarray())



    def compressData(self, file_to_write):
        self.__saveHuffmanCode(self.root)
        for i in self.text:
            self.bits_to_save.extend(self.prefix_code[i])
        self.__lastBits(len(self.bits_to_save))
        with open(file_to_write,"wb") as f:
            self.bits_to_save.tofile(f)



    def __createHuffmanCode(self, node : Node, huff_code : bitarray):
        if node.char is not None:
            self.prefix_code[node.char] = huff_code
            return
        
        code = huff_code.copy()
        if node.left is not None:
            code.append(0)
            self.__createHuffmanCode(node.left, code)

        if node.right is not None:
            code = huff_code.copy()
            code.append(1)
            self.__createHuffmanCode(node.right, code)



    def __saveHuffmanCode(self, node: Node):
        if node.char is not None:
            self.bits_to_save.append(1)
            ba = bitarray()
            ba.frombytes(node.char.encode("utf-8"))
            utf_len = len(ba) // BITS_IN_BYTE
            match utf_len:
                case 1:
                    self.bits_to_save.extend(ONE)
                case 2:
                    self.bits_to_save.extend(TWO)
                case 3:
                    self.bits_to_save.extend(THREE)
                case 4:
                    self.bits_to_save.extend(FOUR)

            self.bits_to_save.extend(ba)
            return

        if node.left is not None:
            self.bits_to_save.append(0)
            self.__saveHuffmanCode(node.left)
            
        if node.right is not None:
            self.bits_to_save.append(0)
            self.__saveHuffmanCode(node.right)


    
    def __lastBits(self, num : int):
        # kalamarnica to ladna ryba :"DDDD
        x = int2ba(self.bits_to_save.fill())
        i = x.fill()
        x >>= i
        self.bits_to_save[:BITS_IN_BYTE] = x


    def __readAllFile(self):
        with open(self.filename, "r") as f:
            self.text = f.read()
    


    # decompressing

    def decompressData(self, file_to_write):
        data = bitarray()
        with open(self.filename, "rb") as f:
            data.fromfile(f)
            self.__restoreHuffman(data, file_to_write)
    

    def __restoreHuffman(self, data: bitarray, file_to_write):
        
        byte = self.__getNBytes(data, 1)
        last_bits = ba2int(byte)
        n = len(data) - last_bits

        self.root = Node(0)
        self.__recHuffTree(data, self.root)
        self.bit_counter -= 1
        self.prefix_code = dict()
        self.__createHuffmanCode(self.root, bitarray())
        self.__ba2Txt(data, n, file_to_write)



    def __recHuffTree(self, data: bitarray, vert: Node):
        if vert.left is not None and vert.right is not None:
            return
        bit = self.__getBit(data)
        if bit == 1:
            len_bytes = ba2int(self.__getNBit(data, 2)) + 1
            byte = self.__getNBytes(data, len_bytes)
            vert.char = bitarray(byte).tobytes().decode("utf-8")
            return

        if vert.left is None:
            new_node = Node(0)
            vert.left = new_node
            self.__recHuffTree(data, new_node)


        bit = self.__getBit(data)
        
        if vert.right is None:
            new_node = Node(0)
            vert.right = new_node
            self.__recHuffTree(data, new_node)
        else:
            len_bytes = ba2int(self.__getNBit(data, 2)) + 1
            byte = self.__getNBytes(data, len_bytes)
            vert.char = bitarray(byte).tobytes().decode("utf-8")
            return
    
    
    def __ba2Txt(self, data: bitarray, length: int, file_to_write):
        vert = self.root
        with open(file_to_write, "w") as f:
            while self.bit_counter < length:
                bit = self.__getBit(data)
                if vert.char is not None:
                    f.write(vert.char)
                    vert = self.root
                if bit == 0:
                    vert = vert.left
                else:
                    vert = vert.right

        

    def __getNBytes(self, tab : bitarray, n: int):
        x = tab[self.bit_counter : self.bit_counter + n*BITS_IN_BYTE]
        self.bit_counter += n*BITS_IN_BYTE
        return x
    


    def __getBit(self, tab: bitarray):
        x = tab[self.bit_counter : self.bit_counter + 1]
        self.bit_counter += BIT
        return x[0]
    
    def __getNBit(self, tab: bitarray, n : int):
        x = tab[self.bit_counter : self.bit_counter + n*BIT]
        self.bit_counter += n*BIT
        return x



    def __loadChars(self):
        for char in self.text:
            if char in self.char_freq:
                self.char_freq[char] += 1
            else:
                self.char_freq[char] = 1



    def __printResult(self, tmp):
        print(tmp)
        
        if tmp.left is not None:
            self.__printResult(tmp.left)
        if tmp.right is not None:
            self.__printResult(tmp.right)


    def __str__(self):
        self.__printResult(self.root)
        return ""




### Adaptive Huffman - FGK algorithm

In [287]:
class Node2:
    def __init__(self, weight=0, index=0, char=None, left=None, right=None, parent=None):
        self.weight = weight
        self.index = index
        self.char = char
        self.left = left
        self.right = right
        self.parent = parent


class AdaptiveHuffmanTree:
    def __init__(self, filename: str):
        self.filename = filename
        self.index = 520
        self.text = ""
        self.nyt = Node2(weight=0, index=self.index + 1, char='nyt')
        self.root = self.nyt
        self.bits_to_save = bitarray("10000000")
        self.bit_counter = 0
        self.leaves = {"nyt": self.root}
        self.weights = {0: {self.root}, 1: set()}

    def __insertNode(self, char: str):
        node = self.nyt
        left_node = Node2(weight=0, index=self.index - 1, parent=node, char='nyt')
        right_node = Node2(weight=1, index=self.index, parent=node, char=char)
        node.left = left_node
        node.right = right_node
        node.char = None
        self.index -= 2
        self.nyt = left_node
        self.weights[0].add(left_node)
        self.weights[1].add(right_node)
        self.leaves[char] = right_node
        self.leaves["nyt"] = left_node
        self.__processTree(node)


    def compressData(self, file_to_write):
        self.__createHuffmanAdaptiveTree()
        self.__lastBits(len(self.bits_to_save))
        # print(self.bits_to_save)

        with open(file_to_write, "wb") as f:
            self.bits_to_save.tofile(f)


    def __processTree(self, node: Node2):
        while node != self.root:
            node = node.parent
            node_max_index = max(self.weights[node.weight], key=lambda nd: nd.index)

            if node != node_max_index:
                node.index, node_max_index.index = node_max_index.index, node.index

                if node.parent == node_max_index.parent:
                    if node == node.parent.left:
                        node.parent.right = node
                        node.parent.left = node_max_index
                    else:
                        node.parent.right = node_max_index
                        node.parent.left = node

                else:
                    if node == node.parent.left:
                        node.parent.left = node_max_index
                    else:
                        node.parent.right = node_max_index

                    if node_max_index.parent.left == node_max_index:
                        node_max_index.parent.left = node
                    else:
                        node_max_index.parent.right = node

                    if node.parent != node_max_index.parent:
                        node_max_index.parent, node.parent = node.parent, node_max_index.parent

            self.weights[node.weight].remove(node)
            node.weight += 1

            if node.weight not in self.weights:
                self.weights[node.weight] = set()
                
            self.weights[node.weight].add(node)
    

    def __bottomUpHuffCode(self, char: str):
        ba = bitarray()
        node = self.leaves[char]

        while node.parent is not None:
            parent_node = node.parent
            if parent_node.right is node:
                ba.append(1)
            else:
                ba.append(0)
            node = parent_node
        ba.reverse()
        return ba
    


    def __createHuffmanAdaptiveTree(self):
        self.__readAllFile()

        for char in self.text:
            if char in self.leaves:
                self.bits_to_save.extend(self.__bottomUpHuffCode(char))
                self.__processTree(self.leaves[char])
            else:
                ba = bitarray()
                ba.frombytes(char.encode("utf-8"))
                utf_len = len(ba) // BITS_IN_BYTE

                self.bits_to_save.extend(self.__bottomUpHuffCode('nyt'))

                match utf_len:
                    case 1:
                        self.bits_to_save.extend(ONE)
                    case 2:
                        self.bits_to_save.extend(TWO)
                    case 3:
                        self.bits_to_save.extend(THREE)
                    case 4:
                        self.bits_to_save.extend(FOUR)

                self.bits_to_save.extend(ba)
                self.__insertNode(char)


    # decomressing


    def decompressData(self, file_to_write):
        data = bitarray()
        with open(self.filename, "rb") as f:
            # data = f.read()
            data.fromfile(f)
            # print(len(data))
            self.__restoreAdaptiveHuffman(data, file_to_write)    

    def __ba2Text(self, data: bitarray, length: int, file_to_write):
        node = self.root
        with open(file_to_write, "w") as f:
            while self.bit_counter < length:
                while not (node.left is None and node.right is None):
                    bit = self.__getBit(data)
                    if not bit:
                        node = node.left
                    else:
                        node = node.right

                if node.char == "nyt":
                    n = ba2int(self.__getNBit(data, 2)) + 1
                    byte = self.__getNBytes(data, n)
                    char = bitarray(byte).tobytes().decode("utf-8")
                    self.__insertNode(char)
                else:
                    char = node.char
                    self.__processTree(self.leaves[char])
                f.write(char)
                node = self.root

    def __restoreAdaptiveHuffman(self, data, file_to_write):
        byte = self.__getNBytes(data, 1)
        last_bits = ba2int(byte)
        n = len(data) - last_bits

        self.__ba2Text(data, n, file_to_write)


    def __readAllFile(self):
        with open(self.filename, "r") as f:
            self.text = f.read()

    def __getNBytes(self, tab : bitarray, n: int):
        x = tab[self.bit_counter:self.bit_counter + n*BITS_IN_BYTE]
        self.bit_counter += n*BITS_IN_BYTE
        return x
    
    

    def __getBit(self, tab: bitarray):
        x = tab[self.bit_counter: self.bit_counter + BIT]
        self.bit_counter += BIT
        return x
    

    def __getNBit(self, tab: bitarray, n : int):
        x = tab[self.bit_counter:self.bit_counter + n*BIT]
        self.bit_counter += n*BIT
        return x    


    def __lastBits(self, num: int):
        # kalamarnica to ladna ryba :"DDDD
        x = int2ba(self.bits_to_save.fill())
        i = x.fill()
        x >>= i
        self.bits_to_save[:BITS_IN_BYTE] = x

File generator

In [288]:
def fileTxtGenerator(size, filename: str):
    # size  -> number of kB to generate from ASCII code
    # filename -> name of file to save generated word
    new_size = int(1024 * size)
    with open(filename, "w") as f:
        # for i in range(new_size):
        i = 0
        while i <= new_size:
            x = chr(np.random.randint(0, 256))
            i += len(x.encode("utf-8"))
            f.write(x)
    print(os.stat(filename).st_size)


Checking % of compression

In [318]:
def compareCompressions(general_names, compressFunction):
    SIZES = ("1kB", "10kB", "100kB", "1MB")
    if str(compressFunction.__name__) == "compressStaticFile":
        print("STATIC HUFFMAN COMPRESSION")
    else:
        print("ADAPTIVE HUFFMAN COMPRESSION")
    for name in general_names:
        for size in SIZES:
            time_start = time.perf_counter()
            compressFunction(
                "input_files/{}_{}.txt".format(size, name),
                "bin_files/{}_{}.bin".format(size, name),
            )
            time_end = time.perf_counter() - time_start
            print(
                "{: >10} of {: >10} file -> {: >10} % compression  {: >10} s execute".format(
                    size,
                    name,
                    "{:.3f}".format(os.stat("bin_files/{}_{}.bin".format(size, name)).st_size
                    / os.stat("input_files/{}_{}.txt".format(size, name)).st_size * 100),
                     "{:.3f}".format(time_end),
                )
            )
    


In [290]:
def compressStaticFile(file_to_read, file_to_compress):
    huff = StaticHuffmanTree(file_to_read)
    huff.createHuffmanTree()
    huff.compressData(file_to_compress)

In [291]:
def decompressStaticFile(file_compressed, file_to_write):
    huff_dec = StaticHuffmanTree(file_compressed)
    huff_dec.decompressData(file_to_write)

In [292]:
def compressAdaptiveFile(file_to_read, file_to_compress):
    huff = AdaptiveHuffmanTree(file_to_read)
    huff.compressData(file_to_compress)

In [293]:
def decompressAdaptiveFile(file_compressed, file_to_write):
    huff_dec = AdaptiveHuffmanTree(file_compressed)
    huff_dec.decompressData(file_to_write)

In [112]:
TEXT_NAMES = ["gutenberg", "linux", "random"]

Time and percentage for compression

In [319]:
compareCompressions(TEXT_NAMES, compressStaticFile)

STATIC HUFFMAN COMPRESSION
       1kB of  gutenberg file ->     68.190 % compression       0.004 s execute
      10kB of  gutenberg file ->     59.409 % compression       0.014 s execute
     100kB of  gutenberg file ->     59.636 % compression       0.054 s execute
       1MB of  gutenberg file ->     55.247 % compression       0.248 s execute
       1kB of      linux file ->     71.942 % compression       0.003 s execute
      10kB of      linux file ->     65.458 % compression       0.003 s execute
     100kB of      linux file ->     64.220 % compression       0.023 s execute
       1MB of      linux file ->     65.331 % compression       0.246 s execute
       1kB of     random file ->    116.098 % compression       0.003 s execute
      10kB of     random file ->     71.302 % compression       0.003 s execute
     100kB of     random file ->     67.223 % compression       0.017 s execute
       1MB of     random file ->     66.672 % compression       0.160 s execute


In [321]:
compareCompressions(TEXT_NAMES, compressAdaptiveFile)


ADAPTIVE HUFFMAN COMPRESSION
       1kB of  gutenberg file ->     72.504 % compression       0.010 s execute
      10kB of  gutenberg file ->     70.811 % compression       0.067 s execute
     100kB of  gutenberg file ->     70.708 % compression       0.621 s execute
       1MB of  gutenberg file ->     67.372 % compression       6.508 s execute
       1kB of      linux file ->     76.911 % compression       0.007 s execute
      10kB of      linux file ->     73.579 % compression       0.059 s execute
     100kB of      linux file ->     72.039 % compression       0.567 s execute
       1MB of      linux file ->     72.865 % compression       7.114 s execute
       1kB of     random file ->    109.659 % compression       0.010 s execute
      10kB of     random file ->     70.647 % compression       0.059 s execute
     100kB of     random file ->     67.202 % compression       0.597 s execute
       1MB of     random file ->     66.732 % compression       6.336 s execute


Time of decompression

In [316]:
def compareDecompressionTime(general_names, compressFunction, decompressFunction):
    SIZES = ("1kB", "10kB", "100kB", "1MB")
    if str(decompressFunction.__name__) == "decompressStaticFile":
        print("STATIC HUFFMAN DECOMPRESSION")
    else:
        print("ADAPTIVE HUFFMAN DECOMPRESSION")
    for name in general_names:
        for size in SIZES:
            compressFunction(
                "input_files/{}_{}.txt".format(size, name),
                "bin_files/{}_{}.bin".format(size, name),
            )
            time_start = time.perf_counter()
            decompressFunction(
                "bin_files/{}_{}.bin".format(size, name),
                "output_files/{}_{}.txt".format(size, name),
            )
            time_end = time.perf_counter() - time_start
            print(
                "{: >10} of {: >10} file -> {: >10} s of decompression".format(
                    size,
                    name,
                    "{:.3f}".format(time_end),
                )
            )
    

In [322]:
compareDecompressionTime(TEXT_NAMES, compressStaticFile, decompressStaticFile)

STATIC HUFFMAN DECOMPRESSION
       1kB of  gutenberg file ->      0.003 s of decompression
      10kB of  gutenberg file ->      0.029 s of decompression
     100kB of  gutenberg file ->      0.190 s of decompression
       1MB of  gutenberg file ->      1.586 s of decompression
       1kB of      linux file ->      0.003 s of decompression
      10kB of      linux file ->      0.020 s of decompression
     100kB of      linux file ->      0.184 s of decompression
       1MB of      linux file ->      1.920 s of decompression
       1kB of     random file ->      0.003 s of decompression
      10kB of     random file ->      0.021 s of decompression
     100kB of     random file ->      0.202 s of decompression
       1MB of     random file ->      2.045 s of decompression


In [315]:
compareDecompressionTime(TEXT_NAMES, compressAdaptiveFile, decompressAdaptiveFile)

ADAPTIVE HUFFMAN DECOMPRESSION
       1kB of  gutenberg file ->      0.026 s of decompression
      10kB of  gutenberg file ->      0.194 s of decompression
     100kB of  gutenberg file ->      0.852 s of decompression
       1MB of  gutenberg file ->     14.163 s of decompression
       1kB of      linux file ->      0.016 s of decompression
      10kB of      linux file ->      0.093 s of decompression
     100kB of      linux file ->      0.957 s of decompression
       1MB of      linux file ->     14.971 s of decompression
       1kB of     random file ->      0.017 s of decompression
      10kB of     random file ->      0.089 s of decompression
     100kB of     random file ->      0.905 s of decompression
       1MB of     random file ->     15.747 s of decompression
