Terminology:
  * Source alphabet S = [s1, s2, s3, ..., sn]

  * Probability     P = [p1, p2, p3, ..., pn]
    * Prob(si) == pi for each 1 <= i <= n
    * SUM_i=1->n(pi) = 1
    * p1 >= p2 >= p3 >= ... >= pn

  * Codewords       C = [c1, c2, c3, ..., cn]
    * Codewords cost |C| = [|c1|, |c2|, |c3|, ..., |cn|]
    * Encode(si) = ci for each 1 <= i <= n

  * Expected codeword length E(P, C) = SUM_i=1->n(pi * |ci|)

In [1]:
from decimal import Decimal
from tabulate import tabulate

In [2]:
def ECL(P, C):
    """
    Return the expected codeword length E(P, C) = SUM_i(pi * |ci|).

    P is the sequence of symbol probabilities and C the sequence of
    codewords; P[i] is paired with C[i] for every index of C.  The
    products are accumulated as Decimal values and the total is
    converted to float on return.
    """
    total = Decimal(0)
    for i, codeword in enumerate(C):
        total += Decimal(str(P[i])) * Decimal(len(codeword))
    return float(total)


In [3]:
# Example from PDF 1 slide 21 and 23
# Compare the expected codeword length of two codes over one distribution.

headers = ['si', 'p1', 'Code 1', 'Code 2']
table = [
    ['a', 0.67, '000', '00'],
    ['b', 0.11, '001', '01'],
    ['c', 0.07, '010', '100'],
    ['d', 0.06, '011', '101'],
    ['e', 0.05, '100', '110'],
    ['f', 0.04, '101', '111'],
]
n = len(table)

# Split the table into its probability column and one column per code.
P = [row[1] for row in table]
C1 = [row[2] for row in table]
C2 = [row[3] for row in table]

# Summary row: the expected length of each code.
table.append(['Expected length', ''] + [ECL(P, code) for code in (C1, C2)])

print(tabulate(table, headers=headers, tablefmt="pretty"))

+-----------------+------+--------+--------+
|       si        |  p1  | Code 1 | Code 2 |
+-----------------+------+--------+--------+
|        a        | 0.67 |  000   |   00   |
|        b        | 0.11 |  001   |   01   |
|        c        | 0.07 |  010   |  100   |
|        d        | 0.06 |  011   |  101   |
|        e        | 0.05 |  100   |  110   |
|        f        | 0.04 |  101   |  111   |
| Expected length |      |  3.0   |  2.22  |
+-----------------+------+--------+--------+


In [4]:
# Example from PDF 1 slide 12
# Four candidate codes for the same four-symbol distribution.

headers = ['si', 'p1', 'Code 1', 'Code 2', 'Code 3', 'Code 4']
table = [
    ['a', 0.5, '0', '0', '0', '0'],
    ['b', 0.25, '0', '1', '10', '01'],
    ['c', 0.125, '1', '00', '110', '011'],
    ['d', 0.125, '10', '11', '111', '0111'],
]
n = len(table)

# Split the table into its probability column and one column per code.
P = [row[1] for row in table]
C1 = [row[2] for row in table]
C2 = [row[3] for row in table]
C3 = [row[4] for row in table]
C4 = [row[5] for row in table]

# Summary row: the expected length of each code.
table.append(
    ['Expected length', ''] + [ECL(P, code) for code in (C1, C2, C3, C4)]
)

print(tabulate(table, headers=headers, tablefmt="pretty"))

+-----------------+-------+--------+--------+--------+--------+
|       si        |  p1   | Code 1 | Code 2 | Code 3 | Code 4 |
+-----------------+-------+--------+--------+--------+--------+
|        a        |  0.5  |   0    |   0    |   0    |   0    |
|        b        | 0.25  |   0    |   1    |   10   |   01   |
|        c        | 0.125 |   1    |   00   |  110   |  011   |
|        d        | 0.125 |   10   |   11   |  111   |  0111  |
| Expected length |       | 1.125  |  1.25  |  1.75  | 1.875  |
+-----------------+-------+--------+--------+--------+--------+


In [5]:
# Example from PDF 1 slide 20
# A variable-length code for the six-symbol distribution.

headers = ['si', 'p1', 'Code 3']
table = [
    ['a', 0.67, '0'],
    ['b', 0.11, '100'],
    ['c', 0.07, '101'],
    ['d', 0.06, '110'],
    ['e', 0.05, '1110'],
    ['f', 0.04, '1111'],
]
n = len(table)

# Split the table into its probability column and the code column.
P = [row[1] for row in table]
C3 = [row[2] for row in table]

# Summary row: the expected length of the code.
table.append(['Expected length', '', ECL(P, C3)])

print(tabulate(table, headers=headers, tablefmt="pretty"))

+-----------------+------+--------+
|       si        |  p1  | Code 3 |
+-----------------+------+--------+
|        a        | 0.67 |   0    |
|        b        | 0.11 |  100   |
|        c        | 0.07 |  101   |
|        d        | 0.06 |  110   |
|        e        | 0.05 |  1110  |
|        f        | 0.04 |  1111  |
| Expected length |      |  1.75  |
+-----------------+------+--------+



Terminology:
* A UD (uniquely decodable) code is a code for which any given
    sequence of codewords can be decoded in only a single way

* Let 'a' and 'b' be two binary codewords where |'a'|=k
    bits and |'b'|=n bits, k<n. If the first k bits of 'b'
    are identical to 'a' then 'a' is called a PREFIX of
    'b'. The last n - k bits are called the DANGLING SUFFIX.

    * Example: a=010, b=01011, dangling-suffix=11

* UD Test is an algorithm to test if given C is a UD Code
    * Examine all pairs of codewords:
    1. Construct a list of all codewords.
    2. If there exists a codeword, 'a', which is a prefix
        of another codeword, 'b', add the dangling suffix
        to the list (if it is not there already), until:
        I. You get a dangling suffix that is an ORIGINAL
            codeword --> the code is not UD
        II. There are no more unique dangling suffixes -->
            the code is UD

In [6]:
def dangling_suffix(c1, c2):
    """
    Return what is left of c2 after removing the prefix c1.

    The order of the arguments matters: c1 is the candidate prefix.
    When c2 does not start with c1 the result is None; when c1 and c2
    are equal the result is the empty sequence.
    """
    prefix_len = len(c1)
    if c2[:prefix_len] != c1:
        return None
    return c2[prefix_len:]



def left_quotient(S, T):
    """
    Return the left quotient of T by S as a list without duplicates.

    The result holds every non-empty dangling suffix obtained by
    stripping a word s of S from the front of a word t of T.  The order
    of the arguments matters, and the order of the returned list is
    not specified (it comes from a set).
    """
    candidates = (dangling_suffix(s, t) for s in S for t in T)
    # None (s is not a prefix of t) and '' (s equals t) are both discarded.
    return list({suffix for suffix in candidates if suffix})



def UD_test(C):
    """
    Run the Sardinas-Patterson algorithm for UD test
    Return: True if the given codeword list C is UD,
            or False if C isn't UD

    The codeword list and every dangling-suffix set are printed as
    "List:  [...]" lines while the test runs (suffixes are sorted so
    the output is deterministic).

    S1 holds the dangling suffixes between pairs of codewords, and
    S(i+1) = left_quotient(C, Si) + left_quotient(Si, C).  The code is
    not UD as soon as some Si (S1 included) contains a codeword.  It is
    UD once an Si equals any earlier set, because the sequence of sets
    then only cycles through sets that were already checked.
    """
    print("List: ", C)
    codewords = set(C)

    # A repeated codeword or an empty codeword is ambiguous by itself,
    # and neither one yields a non-empty dangling suffix to expose it.
    if len(codewords) != len(C) or "" in codewords:
        return False

    seen = set()  # every suffix set examined so far
    current = frozenset(left_quotient(C, C))
    while True:
        print("List: ", sorted(current))

        # A dangling suffix that is itself a codeword means two different
        # parsings of the same bit string exist.
        if not codewords.isdisjoint(current):
            return False

        # Comparing against all earlier sets (not only the previous one)
        # guarantees termination even when the sets cycle with period > 1.
        if current in seen:
            return True
        seen.add(current)

        current = frozenset(left_quotient(C, current)) | frozenset(
            left_quotient(current, C)
        )

In [7]:
# Example from PDF 1 slide 17
# Run the UD test on the code and report the verdict.

C = ['0', '01', '11']
print("UD" if UD_test(C) else "Not UD")

List:  ['0', '01', '11']
List:  ['1']
List:  ['1']
UD


In [8]:
# Example from PDF 1 slide 18
# Run the UD test on the code and report the verdict.

C = ['0', '01', '10']
print("UD" if UD_test(C) else "Not UD")

List:  ['0', '01', '10']
List:  ['1']
List:  ['0']
Not UD


In [9]:
# Example of student in the class
# Run the UD test on the code and report the verdict.

C = ['01', '10', '0', '11']
print("UD" if UD_test(C) else "Not UD")

List:  ['01', '10', '0', '11']
List:  ['1']
List:  ['0', '1']
Not UD


Terminology:
* Shannon in 1948: The amount of information contained in
    a symbol si of probability pi is:
    I(si) = -log2(pi)
    * A code should be able to be devised such that the
    codeword for si contains I(si) bits

* Entropy is the average of the information per symbol.
    * Given a probability distribution P with alphabet size
    n, define:
    H(P) = -SUM_i=1->n(pi * log2(pi))
    H(P) = SUM_i=1->n(pi * I(si))
    * For all unambiguous codes C:
    H(P) <= E(P, C)

* Kraft sum of code C is:
    K(C) = SUM_i=1->n(2^(-|ci|))
    * Kraft inequality: every UD code C satisfies K(C) <= 1

In [10]:
from math import log2

In [11]:
def I(p):
    """
    Return the information content, in bits, of a symbol s
    with probability p: I(s) = -log2(p).  It means that the
    codeword for s should be represented with about -log2(p)
    bits for the best encoding.

    p must be positive; log2 raises ValueError for p <= 0.
    For p == 1 the result is 0.0 rather than the -0.0 that
    negating log2(1) would give.
    """
    if p == 1:
        # -log2(1) evaluates to -0.0; a certain symbol carries exactly 0 bits.
        return 0.0
    return -log2(p)



def H(P):
    """
    Return the entropy H(P) = SUM_i(pi * I(si)) of the distribution P.

    Symbols with probability 0 contribute nothing to the sum (the usual
    convention 0 * log2(0) = 0) and are skipped, which avoids the math
    domain error that log2 raises for 0.  The terms are accumulated as
    Decimal values and the total is converted to float on return.
    """
    return float(
        sum(Decimal(str(p)) * Decimal(str(I(p))) for p in P if p != 0)
    )



def K(C):
    """
    Return the Kraft sum of the code C: SUM_i(2^(-|ci|)).

    Each term is converted to Decimal before being added, and the
    total is converted to float on return.
    """
    total = Decimal(0)
    for codeword in C:
        total += Decimal(str(2 ** (-len(codeword))))
    return float(total)

In [12]:
# Example from PDF 1 slide 21 and 23
# Per-symbol information content and the entropy of the distribution.

ALPHABET = ['a', 'b', 'c', 'd', 'e', 'f']
Probability = [0.67, 0.11, 0.07, 0.06, 0.05, 0.04]
# I(si) rounded to two decimals for display.
Information = [
    float("{:0.2f}".format(I(p)))
    for p in Probability
]

print(
    tabulate(
        {'si': ALPHABET, 'pi': Probability, 'I(si)': Information},
        headers="keys",
        tablefmt="pretty",
    )
)
print("H(P)={:0.2f}".format(H(Probability)))

+----+------+-------+
| si |  pi  | I(si) |
+----+------+-------+
| a  | 0.67 | 0.58  |
| b  | 0.11 | 3.18  |
| c  | 0.07 | 3.84  |
| d  | 0.06 | 4.06  |
| e  | 0.05 | 4.32  |
| f  | 0.04 | 4.64  |
+----+------+-------+
H(P)=1.65


In [13]:
# Example from PDF 1 slide 27
# Expected length and Kraft sum of two codes over one distribution.

headers = ['si', 'p1', 'Code 1', 'Code 2']
table = [
    ['a', 0.67, '000', '00'],
    ['b', 0.11, '001', '01'],
    ['c', 0.07, '010', '100'],
    ['d', 0.06, '011', '101'],
    ['e', 0.05, '100', '110'],
    ['f', 0.04, '101', '111'],
]
n = len(table)

# Split the table into its probability column and one column per code.
P = [row[1] for row in table]
C1 = [row[2] for row in table]
C2 = [row[3] for row in table]

# Summary row: the expected length of each code.
table.append(['Expected length', ''] + [ECL(P, code) for code in (C1, C2)])

print(tabulate(table, headers=headers, tablefmt="pretty"))
print("Code 1: K(C)={0}".format(K(C1)))
print("Code 2: K(C)={0}".format(K(C2)))

+-----------------+------+--------+--------+
|       si        |  p1  | Code 1 | Code 2 |
+-----------------+------+--------+--------+
|        a        | 0.67 |  000   |   00   |
|        b        | 0.11 |  001   |   01   |
|        c        | 0.07 |  010   |  100   |
|        d        | 0.06 |  011   |  101   |
|        e        | 0.05 |  100   |  110   |
|        f        | 0.04 |  101   |  111   |
| Expected length |      |  3.0   |  2.22  |
+-----------------+------+--------+--------+
Code 1: K(C)=0.75
Code 2: K(C)=1.0
