Unary Code
* The symbol x is represented as x-1 "1" bits, followed by a single "0" bit.

In [1]:
from tabulate import tabulate
from math import log2, ceil

In [2]:
def Unary_Code(i):
    """
    Return the unary code for the symbol at index i.

    The codeword is made of (i - 1) "1" bits followed by a single
    terminating "0" bit, so its length is exactly i bits.

    Parameters:
        i - 1-based index of the symbol (1 <= i).
    Returns:
        The codeword as a string of '0' / '1' characters.
    Raises:
        ValueError - if i < 1. Without this check such an index would
                     silently yield "0", the codeword of index 1.
    """
    if i < 1:
        raise ValueError("Unary code is defined only for i >= 1, got {}".format(i))
    return "1" * (i - 1) + "0"

In [3]:
# Unary Code Example: tabulate the codewords of the first n symbol indexes

n = 9
ALPHABET_Indexes = list(range(1, n + 1)) # 1-based symbol indexes
C                = [Unary_Code(index) for index in ALPHABET_Indexes]
print(tabulate({'Symbol Index': ALPHABET_Indexes, 'Unary Code': C},
               headers="keys",
               tablefmt="pretty"))

+--------------+------------+
| Symbol Index | Unary Code |
+--------------+------------+
|      1       |     0      |
|      2       |     10     |
|      3       |    110     |
|      4       |    1110    |
|      5       |   11110    |
|      6       |   111110   |
|      7       |  1111110   |
|      8       |  11111110  |
|      9       | 111111110  |
+--------------+------------+


Binary Code:
* Simple Binary Code - every symbol is assigned a codeword of exactly $\lceil \log_2 n \rceil$ bits
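* Minimal Binary Code - for an alphabet of n symbols, a minimal binary code contains $2^{\lceil \log_2 n \rceil} - n$ codewords that are $\lfloor \log_2 n \rfloor$ bits long, and the remaining $2n - 2^{\lceil \log_2 n \rceil}$ codewords are $\lceil \log_2 n \rceil$ bits long. For example, n = 9 gives 7 codewords of 3 bits and 2 codewords of 4 bits.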

In [4]:
def Simple_Binary_Code(i, n):
    """
    Return the simple (fixed-length) binary code for the symbol at
    index i in an alphabet of n symbols.

    Encoding: every symbol is assigned a codeword of exactly
              ceil(log2(n)) bits, holding the binary value of i - 1
              (codeword values start from 0 while indexes start from 1).

    Parameters:
        i - 1-based index of the symbol (1 <= i <= n).
        n - alphabet size (integer, 1 <= n).
    Returns:
        The codeword as a string of '0' / '1' characters.
    Raises:
        ValueError - if n < 1 or i is outside 1..n. Without this check
                     i = 0 would produce a malformed string and i > n
                     could exceed the fixed codeword width.
    """
    if n < 1:
        raise ValueError("alphabet size n must be >= 1, got {}".format(n))
    if not 1 <= i <= n:
        raise ValueError("index i must satisfy 1 <= i <= n, got i={}, n={}".format(i, n))

    # (n - 1).bit_length() equals ceil(log2(n)) for every integer n >= 1 and,
    # unlike the float based log2, stays exact for arbitrarily large n.
    width = (n - 1).bit_length()
    return bin(i - 1)[2:].zfill(width)

In [5]:
# Simple Binary Code Example: fixed-length codewords for an alphabet of n symbols

n = 9
ALPHABET_Indexes = list(range(1, n + 1)) # 1-based symbol indexes
C                = [Simple_Binary_Code(index, n) for index in ALPHABET_Indexes]
print(tabulate({'Symbol Index': ALPHABET_Indexes, 'Simple Binary Code': C},
               headers="keys",
               tablefmt="pretty"))

+--------------+--------------------+
| Symbol Index | Simple Binary Code |
+--------------+--------------------+
|      1       |        0000        |
|      2       |        0001        |
|      3       |        0010        |
|      4       |        0011        |
|      5       |        0100        |
|      6       |        0101        |
|      7       |        0110        |
|      8       |        0111        |
|      9       |        1000        |
+--------------+--------------------+


In [6]:
def Minimal_Binary_Code(i, n):
    """
    Return the minimal binary code for the symbol at index i in an
    alphabet of n symbols.

    Encoding: the first [2^(ceil(log2(n))) - n] symbols are assigned
              codewords of exactly floor(log2(n)) bits, and the rest
              are assigned codewords of ceil(log2(n)) bits.

    Parameters:
        i - 1-based index of the symbol (1 <= i <= n).
        n - alphabet size (integer, 1 <= n).
    Returns:
        The codeword as a string of '0' / '1' characters.
    Raises:
        ValueError - if n < 1 or i is outside 1..n.
    """
    if n < 1:
        raise ValueError("alphabet size n must be >= 1, got {}".format(n))
    if not 1 <= i <= n:
        raise ValueError("index i must satisfy 1 <= i <= n, got i={}, n={}".format(i, n))

    # Integer-only equivalents of ceil(log2(n)) and floor(log2(n)); the float
    # based log2 can round up to the next integer when n is just below a
    # large power of two, which would give the short codewords a wrong length.
    num_of_bits_for_long  = (n - 1).bit_length()
    num_of_bits_for_short = n.bit_length() - 1
    max_index_for_short   = 2 ** num_of_bits_for_long - n

    if i <= max_index_for_short:
        return bin(i - 1)[2:].zfill(num_of_bits_for_short)

    # To keep the code prefix-free, skip the long-length leaves that are
    # covered by the short codewords (two leaves per short codeword) and
    # continue numbering from the first leaf that is still free:
    # 2 * max_index_for_short + (i - max_index_for_short - 1).
    return bin(max_index_for_short + i - 1)[2:].zfill(num_of_bits_for_long)

In [7]:
# Minimal Binary Code Example: short and long codewords for an alphabet of n symbols

n = 9
ALPHABET_Indexes = list(range(1, n + 1)) # 1-based symbol indexes
C                = [Minimal_Binary_Code(index, n) for index in ALPHABET_Indexes]
print(tabulate({'Symbol Index': ALPHABET_Indexes, 'Minimal Binary Code': C},
               headers="keys",
               tablefmt="pretty"))

+--------------+---------------------+
| Symbol Index | Minimal Binary Code |
+--------------+---------------------+
|      1       |         000         |
|      2       |         001         |
|      3       |         010         |
|      4       |         011         |
|      5       |         100         |
|      6       |         101         |
|      7       |         110         |
|      8       |        1110         |
|      9       |        1111         |
+--------------+---------------------+


Elias Codes:
* C-gama - 
    * First part: unary code for the number of bits in x
    * Second part: a binary code for x within the range established by the unary part
* C-delta - 
    * First part: C-gama code for the number of bits in x
    * Second part: a binary code for x within the range established by the C-gama part 

In [8]:
def C_gama(i):
    """
    Return the C-gama Elias code for the symbol at index i.

    Encoding:
        first part  - Unary Code for the number of bits in i
        second part - the binary representation of i without its
                      leading (always '1') bit
    Note: 1 <= i
    Note2: For readability (just for this example) a space (' ')
           separates the two parts of the code. For i = 1 the second
           part is empty, so the result is "0" followed by that space.

    Raises:
        ValueError - if i < 1.
    """
    if i < 1:
        raise ValueError("Elias C-gama code is defined only for i >= 1, got {}".format(i))

    # Take the bit count from the binary string itself, so the unary part
    # always agrees with the second part (float log2 can be off by one for
    # very large i).
    binary = bin(i)[2:]
    return Unary_Code(len(binary)) + ' ' + binary[1:]

In [9]:
# Elias C-gama Code Example: codewords of the first n symbol indexes

n = 9
ALPHABET_Indexes = list(range(1, n + 1)) # 1-based symbol indexes
C                = [C_gama(index) for index in ALPHABET_Indexes]
print(tabulate({'Symbol Index': ALPHABET_Indexes, 'Elias C-gama Code': C},
               headers="keys",
               tablefmt="pretty"))

+--------------+-------------------+
| Symbol Index | Elias C-gama Code |
+--------------+-------------------+
|      1       |         0         |
|      2       |       10 0        |
|      3       |       10 1        |
|      4       |      110 00       |
|      5       |      110 01       |
|      6       |      110 10       |
|      7       |      110 11       |
|      8       |     1110 000      |
|      9       |     1110 001      |
+--------------+-------------------+


In [10]:
def C_delta(i):
    """
    Return the C-delta Elias code for the symbol at index i.

    Encoding:
        first part  - C-gama Code for the number of bits in i
        second part - the binary representation of i without its
                      leading (always '1') bit
    Note: 1 <= i
    Note2: For readability (just for this example) a space (' ')
           separates the two parts of the code.
    Note3: C_gama inserts its own readability space, which is removed
           here so that only the C-delta separator remains.

    Raises:
        ValueError - if i < 1.
    """
    if i < 1:
        raise ValueError("Elias C-delta code is defined only for i >= 1, got {}".format(i))

    # Take the bit count from the binary string itself, so the C-gama part
    # always agrees with the second part (float log2 can be off by one for
    # very large i).
    binary = bin(i)[2:]
    return C_gama(len(binary)).replace(' ', '') + ' ' + binary[1:]

In [11]:
# Elias C-delta Code Example: codewords of the first n symbol indexes

n = 9
ALPHABET_Indexes = list(range(1, n + 1)) # 1-based symbol indexes
C                = [C_delta(index) for index in ALPHABET_Indexes]
print(tabulate({'Symbol Index': ALPHABET_Indexes, 'Elias C-delta Code': C},
               headers="keys",
               tablefmt="pretty"))

+--------------+--------------------+
| Symbol Index | Elias C-delta Code |
+--------------+--------------------+
|      1       |         0          |
|      2       |       100 0        |
|      3       |       100 1        |
|      4       |       101 00       |
|      5       |       101 01       |
|      6       |       101 10       |
|      7       |       101 11       |
|      8       |     11000 000      |
|      9       |     11000 001      |
+--------------+--------------------+


In [12]:
# A summary example: all the codes side by side for an alphabet of n symbols
n = 30
ALPHABET_Indexes = list(range(1, n + 1)) # 1-based symbol indexes
Unary_C          = [Unary_Code(index) for index in ALPHABET_Indexes]
Simple_Binary_C  = [Simple_Binary_Code(index, n) for index in ALPHABET_Indexes]
Minimal_Binary_C = [Minimal_Binary_Code(index, n) for index in ALPHABET_Indexes]
C_gama_C         = [C_gama(index) for index in ALPHABET_Indexes]
C_delta_C        = [C_delta(index) for index in ALPHABET_Indexes]

# One column per code; the dict keys become the table headers.
print(tabulate(
    {
        'Symbol Index': ALPHABET_Indexes,
        'Unary Code': Unary_C,
        'Simple Binary Code': Simple_Binary_C,
        'Minimal Binary Code': Minimal_Binary_C,
        'C-gama Code': C_gama_C,
        'C-delta Code': C_delta_C,
    },
    headers="keys",
    tablefmt="pretty",
))

+--------------+--------------------------------+--------------------+---------------------+-------------+--------------+
| Symbol Index |           Unary Code           | Simple Binary Code | Minimal Binary Code | C-gama Code | C-delta Code |
+--------------+--------------------------------+--------------------+---------------------+-------------+--------------+
|      1       |               0                |       00000        |        0000         |      0      |      0       |
|      2       |               10               |       00001        |        0001         |    10 0     |    100 0     |
|      3       |              110               |       00010        |        00100        |    10 1     |    100 1     |
|      4       |              1110              |       00011        |        00101        |   110 00    |    101 00    |
|      5       |             11110              |       00100        |        00110        |   110 01    |    101 01    |
|      6       |        

Shannon-Fano Algorithm:
* Arrange the character set in order of decreasing probability
* While a probability class contains more than one symbol:
    * Divide the probability class in two
        * so that the probabilities in the two halves are as nearly as possible equal
    * Assign '0' to the first probability class, and '1' to the second.

In [13]:
from decimal import Decimal

def shannon_fano_recursion(li):
    """
    Recursively extend the codewords of one probability class.

    Parameters:
        li - list of (symbol, probability, codeword) triples, expected
             in order of decreasing probability.
    Returns:
        A new list of triples, in the same symbol order as li, where
        every codeword was extended until each symbol is alone in its
        class. A class of zero or one symbols is returned unchanged.

    The class is divided into a left part (gets '0') and a right part
    (gets '1') so that their total probabilities are as nearly equal as
    possible; both parts are then divided recursively.
    """
    # A class with at most one symbol cannot be divided any further.
    if len(li) <= 1:
        return list(li)

    # gap = (probability still on the right) - (probability already on the
    # left). Decimal(str(p)) keeps the comparisons free of float drift.
    gap = sum(Decimal(str(p)) for _, p, _ in li)

    # Move symbols to the left part while doing so makes the two parts more
    # balanced (gap > p). The first symbol always goes left and the last one
    # always stays right, so both parts are non-empty and the recursion
    # terminates even when some probabilities are zero.
    split = 0
    while split < len(li) - 1:
        weight = Decimal(str(li[split][1]))
        if split > 0 and gap <= weight:
            break
        gap -= 2 * weight
        split += 1

    left = [(s, p, c + '0') for s, p, c in li[:split]]
    right = [(s, p, c + '1') for s, p, c in li[split:]]

    # Always left before right, so the symbols keep their input order.
    return shannon_fano_recursion(left) + shannon_fano_recursion(right)



def shannon_fano(S, P):
    """
    Build a Shannon-Fano code for the symbols S with probabilities P.

    Parameters:
        S - sequence of symbols.
        P - sequence of probabilities, P[k] belongs to S[k].
    Returns:
        A list of (symbol, probability, codeword) triples in order of
        decreasing probability (ties keep their input order).
        An empty alphabet gives an empty list; a single-symbol alphabet
        gets the one-bit codeword '0'.
    Raises:
        ValueError - if S and P do not have the same length.
    """
    symbols = list(S)
    probabilities = list(P)
    if len(symbols) != len(probabilities):
        raise ValueError("S and P must have the same length, got {} and {}".format(
            len(symbols), len(probabilities)))

    if not symbols:
        return []
    if len(symbols) == 1:
        # Nothing to divide; give the only symbol a one-bit codeword.
        return [(symbols[0], probabilities[0], '0')]

    # make the initial list with the structure [(symbol, probability, codeword)]
    triples = zip(symbols, probabilities, [''] * len(symbols))

    # The algorithm needs decreasing probabilities. sorted() returns a new
    # list, so its result has to be kept.
    ordered = sorted(triples, key=lambda triple: triple[1], reverse=True)

    # return the list [(symbol, probability, codeword)] with full codewords.
    return shannon_fano_recursion(ordered)

In [14]:
# Function from lecture 1

def ECL(P, C):
    """
    Return E(P, C): the sum, over every codeword C[k], of
    P[k] * len(C[k]).

    The sum is accumulated with Decimal (built from str(P[k])) and
    converted to float only at the end.
    """
    expected_length = Decimal(0)
    for index, codeword in enumerate(C):
        expected_length += Decimal(str(P[index])) * len(codeword)
    return float(expected_length)

In [15]:
# Shannon-Fano Example from PDF 3 slide 22

S = ['a', 'b', 'c', 'd', 'e', 'f']
P = [0.67, 0.11, 0.07, 0.06, 0.05, 0.04]

# Rebuild S and P from the result so that every row of the table
# (symbol, probability, codeword) comes from the same triple.
s_p_c = shannon_fano(S, P)
S, P, C = map(list, zip(*s_p_c))

print(tabulate({'simbol': S, 'prob': P, 'codeword': C},
               headers="keys",
               tablefmt="pretty"))
print("E(P, C) = {}".format(ECL(P, C)))

+--------+------+----------+
| simbol | prob | codeword |
+--------+------+----------+
|   a    | 0.67 |    0     |
|   b    | 0.11 |   100    |
|   c    | 0.07 |   101    |
|   d    | 0.06 |   110    |
|   e    | 0.05 |   1110   |
|   f    | 0.04 |   1111   |
+--------+------+----------+
E(P, C) = 1.75


In [16]:
# Shannon-Fano Example from PDF 3 slide 23

S = ['a', 'b', 'c', 'd', 'e', 'f', 'G']
P = [0.4, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

# Rebuild S and P from the result so that every row of the table
# (symbol, probability, codeword) comes from the same triple.
s_p_c = shannon_fano(S, P)
S, P, C = map(list, zip(*s_p_c))

print(tabulate({'simbol': S, 'prob': P, 'codeword': C},
               headers="keys",
               tablefmt="pretty"))
print("E(P, C) = {}".format(ECL(P, C)))

+--------+------+----------+
| simbol | prob | codeword |
+--------+------+----------+
|   a    | 0.4  |    00    |
|   b    | 0.1  |    01    |
|   c    | 0.1  |   100    |
|   d    | 0.1  |   101    |
|   e    | 0.1  |   110    |
|   f    | 0.1  |   1110   |
|   G    | 0.1  |   1111   |
+--------+------+----------+
E(P, C) = 2.7
