<center>
<h1><b>Information Theory</b></h1>
<h3>Lab 6: Lossless compression – Huffman coding</h3>
<h4>Maksymilian Norkiewicz 160267</h4>
<h4>Lecturer: Iwo Błądek</h4>
</center>

# Libraries

In [14]:
from heapq import heappush, heappop, heapify
from collections import defaultdict
from bitarray import bitarray
import os

# HuffmanCoding class

In [3]:
class HuffmanCoding:
    """Huffman encoder/decoder whose code table is built from a sample text.

    The table is computed once in the constructor and stored in
    ``self._huffman_dict`` (symbol -> ``bitarray`` code). It is not written to
    encoded files, so a file must be decoded by an instance built from the
    same text that was used for encoding.

    Encoded file layout: 8 header bits holding the number of pad bits (0-7),
    followed by the encoded bits; ``bitarray.tofile`` then pads the last byte.
    """

    def __init__(self, text:str):
        """Build the symbol -> code table from the characters of ``text``."""
        self._huffman_dict = self._create_huffman_dict(text)


    @staticmethod
    def _build_code_strings(text:str) -> dict:
        """Return ``{symbol: code}`` where each code is a string of '0'/'1'.

        An empty text yields an empty table. A text with a single distinct
        symbol gets the one-bit code '0', because a Huffman tree with one leaf
        would otherwise assign it an empty (unusable) code.
        """
        # Frequency
        freq = defaultdict(int)
        for ch in text:
            freq[ch] += 1

        if not freq:
            return {}
        if len(freq) == 1:
            return {next(iter(freq)): '0'}

        # Heap entries are [weight, [symbol, code], [symbol, code], ...].
        # Lists compare lexicographically, so equal weights fall back to
        # comparing the [symbol, code] pairs, which keeps the result deterministic.
        heap = [[fq, [sym, '']] for sym, fq in freq.items()]
        heapify(heap)

        while len(heap) > 1:
            right = heappop(heap)
            left = heappop(heap)
            # Merging two subtrees prepends one more bit to every code below them.
            for pair in right[1:]:
                pair[1] = '0' + pair[1]
            for pair in left[1:]:
                pair[1] = '1' + pair[1]
            heappush(heap, [right[0] + left[0]] + right[1:] + left[1:])

        # The single remaining entry holds every [symbol, code] pair.
        return {sym: code for sym, code in heap[0][1:]}


    @staticmethod
    def _padding_length(bit_count:int) -> int:
        """Return how many bits (0-7) round ``bit_count`` up to a whole byte."""
        return (-bit_count) % 8


    @staticmethod
    def _strip_padding(bits):
        """Drop the 8-bit padding header and the trailing pad bits from ``bits``.

        Raises:
            ValueError: if ``bits`` is shorter than the 8-bit header.
        """
        if len(bits) < 8:
            raise ValueError("encoded data is missing the 8-bit padding header")

        padding = int(bits[:8].to01(), 2)
        # The end index is computed explicitly: bits[8:-padding] would be empty
        # when padding == 0, because -0 == 0.
        return bits[8:len(bits) - padding]


    def _create_huffman_dict(self, text:str) -> dict:
        """Return the ``{symbol: bitarray}`` prefix-code table for ``text``."""
        code_strings = self._build_code_strings(text)
        return {sym: bitarray(code) for sym, code in code_strings.items()}


    def encode_text(self, text:str) -> bitarray:
        """Encode ``text`` with the instance's code table and return the bits."""
        encoded_text = bitarray()
        # Skipping empty input avoids handing bitarray an empty code table
        # when the coder itself was built from an empty text.
        if text:
            encoded_text.encode(self._huffman_dict, text)
        return encoded_text


    def encode_file(self, file_path:str, encoded_file_path:str):
        """Read ``file_path`` as text, encode it and write it to ``encoded_file_path``.

        The output starts with 8 bits holding the number of pad bits needed to
        reach a byte boundary, followed by the encoded bits.
        """
        with open(file_path, 'r') as f:
            text = f.read()

        encoded_text = self.encode_text(text)

        # The header is exactly one byte, so it does not change how many pad
        # bits the payload needs.
        padding = self._padding_length(len(encoded_text))
        header = bitarray(format(padding, '08b'))
        encoded_text = header + encoded_text

        with open(encoded_file_path, 'wb') as f:
            encoded_text.tofile(f)


    def decode_text(self, encoded_text:bitarray) -> str:
        """Decode ``encoded_text`` with the instance's code table."""
        if not encoded_text:
            return ''
        decoded_text = encoded_text.decode(self._huffman_dict)
        return ''.join(decoded_text)


    def decode_text_from_file(self, encoded_file_path:str) -> str:
        """Read a file written by ``encode_file`` and return the decoded text.

        Raises:
            ValueError: if the file is too short to contain the padding header.
        """
        encoded_text = bitarray()

        with open(encoded_file_path, 'rb') as f:
            encoded_text.fromfile(f)

        # Read padding and remove unnecessary bits
        encoded_text = self._strip_padding(encoded_text)
        decoded_text = self.decode_text(encoded_text)

        return decoded_text


    def decode_file(self, encoded_file_path:str, decoded_file_path:str):
        """Decode ``encoded_file_path`` and write the text to ``decoded_file_path``."""
        decoded_text = self.decode_text_from_file(encoded_file_path)

        with open(decoded_file_path, 'w') as f:
            f.write(decoded_text)

# Encoding

In [8]:
# Paths of the text file to compress and of the Huffman-encoded binary output.
original_file_path = "./data/norm_wiki_sample.txt"
encoded_file_path = "./data/encoded_file.bin"

In [9]:
# Load the whole input; `text` is kept for the round-trip check at the end.
with open(original_file_path, 'r') as source_file:
    text = source_file.read()

# The coder derives its code table from this text at construction time.
huffman_coding = HuffmanCoding(text)

In [16]:
# List every symbol next to its code written as a string of 0s and 1s.
print("Binary codes:")
code_table = huffman_coding._huffman_dict
for sign in code_table:
    binary_code = code_table[sign]
    print(f"{sign}: {binary_code.to01()}")

Binary codes:
e: 000
m: 00100
y: 001010
k: 0010110
4: 001011100
x: 001011101
5: 001011110
3: 001011111
s: 0011
w: 010000
b: 010001
c: 01001
r: 0101
o: 0110
n: 0111
i: 1000
d: 10010
2: 10011000
9: 10011001
v: 1001101
g: 100111
t: 1010
p: 101100
f: 101101
l: 10111
a: 1100
h: 11010
8: 110110000
j: 110110001
0: 11011001
q: 1101101000
z: 1101101001
6: 1101101010
7: 1101101011
1: 11011011
u: 110111
 : 111


In [17]:
huffman_coding.encode_file(original_file_path, encoded_file_path)

# Query each file size once and report it in both bytes and bits.
original_size = os.path.getsize(original_file_path)
encoded_size = os.path.getsize(encoded_file_path)

size_report = "{} size: {:,} bytes ({:,} bits)\n{} size: {:,} bytes ({:,} bits)".format(
    original_file_path.split("/")[-1],
    original_size,
    original_size * 8,
    encoded_file_path.split('/')[-1],
    encoded_size,
    encoded_size * 8
)

# Swap the thousands separator from "," to "'".
print(size_report.replace(",", "\'"))

norm_wiki_sample.txt size: 10'788'941 bytes (86'311'528 bits)
encoded_file.bin size: 5'811'216 bytes (46'489'728 bits)


# Decoding

In [18]:
# Decode with the same coder instance: the encoded file holds only the padding
# header and the encoded bits, not the code table.
decoded_text = huffman_coding.decode_text_from_file(encoded_file_path)

In [19]:
# Round-trip check: the decoded text should equal the text that was encoded.
text == decoded_text

True