Arithmetic Coding:

* Replace the entire input with a single floating-point number
* Does not need the probability distribution in advance (the adaptive variant estimates it from the symbols seen so far)
* Adaptive coding is very easy
* No need to keep and send codewords tables (in pre-load)
* Fractional codeword length

In [1]:
from decimal import Decimal
from tabulate import tabulate

In [2]:
def interval_bounds(S_P):
    """
    Divide the interval starting at 0 into consecutive sub-intervals,
    one per symbol, whose widths are the symbol probabilities.
    Input: dict {symbol: probability} (insertion order defines the layout)
    Output: {symbol: (low, high)}
    """
    # Cumulative edges; additions go through Decimal(str(...)) so that
    # e.g. 0.67 + 0.11 yields 0.78 rather than a binary-float artifact.
    edges = [0]
    for prob in S_P.values():
        running = Decimal(str(edges[-1])) + Decimal(str(prob))
        edges.append(float(running))

    # Symbol number idx owns the span between edge idx and edge idx + 1.
    return {
        symbol: (edges[idx], edges[idx + 1])
        for idx, symbol in enumerate(S_P)
    }

In [3]:
def print_table(S_P):
    """
    Print a table with one row per symbol: the symbol, its probability,
    and the low/high bounds of its interval from interval_bounds(S_P).
    Input: dict {symbol: probability}
    """
    bounds = interval_bounds(S_P)
    columns = {
        'Si': list(bounds),
        'Pi': list(S_P.values()),
        'low_bound': [low for low, _ in bounds.values()],
        'high_bound': [high for _, high in bounds.values()],
    }
    print(tabulate(columns, headers="keys", tablefmt="pretty"))

In [4]:
# Example for PDF 6 slide 6:
# Six-symbol alphabet with its probabilities (they sum to 1).
# print_table shows, for every symbol, its probability and the
# low/high bounds of the interval assigned to it.
S_P = {'A': 0.67,
        'B': 0.11,
        'C': 0.07,
        'D': 0.06,
        'E': 0.05,
        'F': 0.04}
print_table(S_P)

+----+------+-----------+------------+
| Si |  Pi  | low_bound | high_bound |
+----+------+-----------+------------+
| A  | 0.67 |     0     |    0.67    |
| B  | 0.11 |   0.67    |    0.78    |
| C  | 0.07 |   0.78    |    0.85    |
| D  | 0.06 |   0.85    |    0.91    |
| E  | 0.05 |   0.91    |    0.96    |
| F  | 0.04 |   0.96    |    1.0     |
+----+------+-----------+------------+


In [5]:
def encode(M, S_P):
    """
    Arithmetic encoding of a message with a fixed probability model.
    Input: msg M (string) to encode,
           S_P dict {symbol: probability}
    Output: 1) the midpoint of the final [Low, High] range,
            2) a formatted table (built by tabulate) tracing every step,
               with the columns M[i], Low, High and Range,
            3) msg length (for the decoder)
    """
    bounds = interval_bounds(S_P)
    k = len(M)

    # Per-step history; row 0 is the initial full range before any symbol.
    lows, highs, widths = [0.0], [1.0], [1.0]

    low = 0.0
    for symbol in M:
        width = Decimal(str(widths[-1]))
        base = Decimal(str(low))
        sym_low, sym_high = bounds[symbol]

        # Both new bounds are measured from the previous low, scaled by
        # the previous range width.
        high = float(base + Decimal(str(sym_high)) * width)
        low = float(base + Decimal(str(sym_low)) * width)

        highs.append(high)
        lows.append(low)
        widths.append(float(Decimal(str(high)) - Decimal(str(low))))

    # The leading '-' labels the initial row, which has no symbol.
    trace = {'M[i]': '-' + M, 'Low': lows, 'High': highs, 'Range': widths}
    T = tabulate(trace, headers="keys", tablefmt="pretty")

    final_low = Decimal(str(lows[-1]))
    final_high = Decimal(str(highs[-1]))
    e = float(final_low + (final_high - final_low) / 2)

    return e, T, k

In [6]:
def decode(e, S_P, k):
    """
    Arithmetic decoding.
    Input: encoded number e (must fall inside one of the intervals
           produced by interval_bounds(S_P)),
           Symbols with Probabilities S_P,
           k is number of characters in M.
    Output: msg M
    Raises: KeyError when, at some step, e lies outside every symbol
            interval (for example e < 0 or e >= the top bound).
    """
    bounds = interval_bounds(S_P)
    decoded = []

    for _ in range(k):
        # Find the symbol whose half-open interval [low, high) holds e.
        for symbol, (low, high) in bounds.items():
            if low <= e < high:
                break
        else:
            # No interval contains e: report it explicitly rather than
            # continuing with an undefined symbol.
            raise KeyError(
                "code value {} is outside every symbol interval".format(e)
            )

        decoded.append(symbol)

        # Rescale e relative to the chosen interval so the next symbol
        # can be read against the same table. width > 0 is guaranteed
        # because low <= e < high held above.
        width = Decimal(str(high)) - Decimal(str(low))
        e = float((Decimal(str(e)) - Decimal(str(low))) / width)

    return "".join(decoded)

In [7]:
# Example for PDF 6 slide 8:
# Round trip of a ten-symbol message over the six-symbol alphabet.
M       = "ABAAAEAABA"
S_P     = {'A': 0.67,
            'B': 0.11,
            'C': 0.07,
            'D': 0.06,
            'E': 0.05,
            'F': 0.04}
# encode returns the code value e, the step-by-step table T and the
# message length k that decode needs to know when to stop.
e, T, k = encode(M, S_P)
print(T)
print("encode({})={}".format(M, e))
# Decoding e with the same model and length is expected to give back M.
print("decode({})={}".format(e, decode(e, S_P, k)))

+------+--------------------+---------------------+--------------------+
| M[i] |        Low         |        High         |       Range        |
+------+--------------------+---------------------+--------------------+
|  -   |        0.0         |         1.0         |        1.0         |
|  A   |        0.0         |        0.67         |        0.67        |
|  B   |       0.4489       |       0.5226        |       0.0737       |
|  A   |       0.4489       |      0.498279       |      0.049379      |
|  A   |       0.4489       |     0.48198393      |     0.03308393     |
|  A   |       0.4489       |    0.4710662331     |    0.0221662331    |
|  E   |   0.469071272121   |   0.470179583776    |   0.001108311655   |
|  A   |   0.469071272121   |  0.46981384092985   |  0.00074256880885  |
|  A   |   0.469071272121   | 0.4695687932229295  | 0.0004975211019295 |
|  B   | 0.4694046112592928 | 0.46945933858050504 | 5.472732121224e-05 |
|  A   | 0.4694046112592928 |  0.469441278564505  |

In [8]:
# Example for PDF 6 slide 11:
# Two-symbol alphabet with a heavily skewed distribution.
# NOTE(review): '$' presumably acts as an end-of-message marker in the
# slides; the code here treats it as an ordinary symbol.
M       = "AAAAAAA$"
S_P     = {'A':0.9, '$':0.1}
# e is the code value, T the step-by-step table, k the message length.
e, T, k = encode(M, S_P)
print(T)
print("encode({})={}".format(M, e))
# Decoding e with the same model and length is expected to give back M.
print("decode({})={}".format(e, decode(e, S_P, k)))

+------+------------+-----------+------------+
| M[i] |    Low     |   High    |   Range    |
+------+------------+-----------+------------+
|  -   |    0.0     |    1.0    |    1.0     |
|  A   |    0.0     |    0.9    |    0.9     |
|  A   |    0.0     |   0.81    |    0.81    |
|  A   |    0.0     |   0.729   |   0.729    |
|  A   |    0.0     |  0.6561   |   0.6561   |
|  A   |    0.0     |  0.59049  |  0.59049   |
|  A   |    0.0     | 0.531441  |  0.531441  |
|  A   |    0.0     | 0.4782969 | 0.4782969  |
|  $   | 0.43046721 | 0.4782969 | 0.04782969 |
+------+------------+-----------+------------+
encode(AAAAAAA$)=0.454382055
decode(0.454382055)=AAAAAAA$


In [9]:
# Example for PDF 6 slide 15:
# Encode and decode "BILL"; the interval table is printed first.
M       = "BILL"
S_P     = {'B':0.25, 'I':0.25, 'L':0.5}
print_table(S_P)

# e is the code value, T the step-by-step table, k the message length.
e, T, k = encode(M, S_P)
print(T)
print("encode({})={}".format(M, e))
# Decoding e with the same model and length is expected to give back M.
print("decode({})={}".format(e, decode(e, S_P, k)))

+----+------+-----------+------------+
| Si |  Pi  | low_bound | high_bound |
+----+------+-----------+------------+
| B  | 0.25 |     0     |    0.25    |
| I  | 0.25 |   0.25    |    0.5     |
| L  | 0.5  |    0.5    |    1.0     |
+----+------+-----------+------------+
+------+----------+-------+----------+
| M[i] |   Low    | High  |  Range   |
+------+----------+-------+----------+
|  -   |   0.0    |  1.0  |   1.0    |
|  B   |   0.0    | 0.25  |   0.25   |
|  I   |  0.0625  | 0.125 |  0.0625  |
|  L   | 0.09375  | 0.125 | 0.03125  |
|  L   | 0.109375 | 0.125 | 0.015625 |
+------+----------+-------+----------+
encode(BILL)=0.1171875
decode(0.1171875)=BILL


Adaptive Arithmetic Coding

In [10]:
def adaptive_p(s, N):
    """
    Estimate the probability of symbol s from the counts seen so far,
    adding one to every count so unseen symbols keep a non-zero share.
    Input: symbol s, dict N {symbol: count}
    Output: (N[s] + 1) / (sum of all counts + number of symbols)
    """
    hits = N[s] + 1
    total = sum(N.values()) + len(N)
    return hits / total


def adaptive_print_level(Il, Ih, N, S_P):
    """
    Print one state of the adaptive coder on three lines: the current
    interval [Il, Ih), the count of every symbol, and the probability
    of every symbol. Symbols are listed in sorted order of N's keys.
    """
    symbols = sorted(N.keys())
    header = "Interval [{}, {})".format(Il, Ih)
    # Each entry keeps its trailing space, so the lines end with one too.
    counts_line = "    " + "".join(
        "N({})={} ".format(sym, N[sym]) for sym in symbols
    )
    probs_line = "    " + "".join(
        "P({})={} ".format(sym, S_P[sym]) for sym in symbols
    )
    print(header)
    print(counts_line)
    print(probs_line)


def adaptive_encode(T):
    """
    Adaptive arithmetic encoding trace.
    Input: text T; its alphabet is the sorted set of characters in T.
    Output: nothing is returned. The initial state and the state after
            every encoded symbol (interval, counts, probabilities) are
            printed via adaptive_print_level.
    """
    symbols = sorted(set(T))
    counts = dict.fromkeys(symbols, 0)

    def current_model():
        # Probabilities re-estimated from the counts gathered so far.
        return {sym: float(adaptive_p(sym, counts)) for sym in symbols}

    probs = current_model()
    bounds = interval_bounds(probs)

    lo, hi = 0.0, 1.0
    adaptive_print_level(lo, hi, counts, probs)

    for t in T:
        print("Encode {}:".format(t))

        # Narrow the interval using the model in force BEFORE t is counted.
        span = Decimal(str(hi)) - Decimal(str(lo))
        sym_low, sym_high = bounds[t]
        hi = float(Decimal(str(lo)) + Decimal(str(sym_high)) * span)
        lo = float(Decimal(str(lo)) + Decimal(str(sym_low)) * span)

        # Then update the counts and rebuild the model for the next symbol.
        counts[t] += 1
        probs = current_model()
        bounds = interval_bounds(probs)
        adaptive_print_level(lo, hi, counts, probs)

In [11]:
# Example for PDF 6 slide 19
# Prints the initial state and then, after each encoded character, the
# narrowed interval with the updated counts and probabilities.
adaptive_encode("bccba")

Interval [0.0, 1.0)
    N(a)=0 N(b)=0 N(c)=0 
    P(a)=0.3333333333333333 P(b)=0.3333333333333333 P(c)=0.3333333333333333 
Encode b:
Interval [0.3333333333333333, 0.6666666666666666)
    N(a)=0 N(b)=1 N(c)=0 
    P(a)=0.25 P(b)=0.5 P(c)=0.25 
Encode c:
Interval [0.5833333333333333, 0.6666666666666666)
    N(a)=0 N(b)=1 N(c)=1 
    P(a)=0.2 P(b)=0.4 P(c)=0.4 
Encode c:
Interval [0.6333333333333333, 0.6666666666666666)
    N(a)=0 N(b)=1 N(c)=2 
    P(a)=0.16666666666666666 P(b)=0.3333333333333333 P(c)=0.5 
Encode b:
Interval [0.6388888888888888, 0.6499999999999999)
    N(a)=0 N(b)=2 N(c)=2 
    P(a)=0.14285714285714285 P(b)=0.42857142857142855 P(c)=0.42857142857142855 
Encode a:
Interval [0.6388888888888888, 0.6404761904761904)
    N(a)=1 N(b)=2 N(c)=2 
    P(a)=0.25 P(b)=0.375 P(c)=0.375 
