In [55]:
def create_prob_model(data):
    prob = defaultdict(int) # Every new key is given a default value of 0
    for char in data:
        prob[char] += 1 / len(data)  
    return prob

In [32]:
from heapq import heappush, heappop, heapify
from collections import defaultdict

In [56]:
def huff_encode(data):
    
    prob = create_prob_model(data)
    
    hp = [[freq, [sym, ""]] for sym, freq in prob.items()]
    heapify(hp)
    while len(hp) > 1:
        qu = heappop(hp)
        hi = heappop(hp)
        for pair in qu[1:]:
            pair[1] = '0' + pair[1]
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heappush(hp, [qu[0] + hi[0]] + qu[1:] + hi[1:])
    symbol_code_pairs = sorted(heappop(hp)[1:], key=lambda p: (len(p[-1]), p))
    
    source_code = {}
    for pair in symbol_code_pairs:
        symbol = pair[0]
        codeword = pair[1]
        freq = prob[symbol]
        source_code[symbol] = (freq, codeword)
    
    encoded = ''
    for symbol in data:
        encoded += source_code[symbol][1]
    
    return source_code, encoded

## Task 1

In [57]:
filename=input("Filename: ")
with open (filename) as myfile:
    data=myfile.read()

Filename: Words.txt


## Task 3

In [58]:
source_code, encoded = huff_encode(data)

print("TXT file:{}".format(data))

print("Encoded txt: {}\n".format(encoded))

TXT file:Real Madrid moved top of La Liga with victory over Barcelona in an El Clasico played in torrential rain at the Alfredo Di Stefano Stadium.
Karim Benzema broke the deadlock with a sublime near-post flick - his ninth goal in his past seven La Liga games.
Real doubled their lead before half-time through Toni Kroos' deflected free-kick.
Oscar Mingueza pulled one back for Barca before Madrid's Casemiro was shown a late red card.
Barca midfielder Ilaix Moriba's well-struck shot came back off the crossbar in stoppage time as the hosts held on.The result takes Real above city rivals Atletico Madrid because of their superior head-to-head record.
Atletico, who led La Liga by 10 points as recently as 31 January, can reclaim top spot with victory at Real Betis on Sunday.
Encoded txt: 1110010100011111110111000111111111101100100011110110110100101011010101110001011011001010110100010110011010111111010001111111110100011101111110011111111011100000111010110000110101011101111001101010110010000110

## Task 3

In [46]:
print("Symbol Frequency Codeword")
for symbol, pair in sorted(source_code.items()):
    print("{0:>6} {1:>9,.2f} {2:>8}".format(symbol, pair[0], pair[1]))

Symbol Frequency Codeword
     
      0.01  0011011
            0.16      110
     '      0.00 10001100
     ,      0.00 101110010
     -      0.01  1011101
     .      0.01  1010110
     0      0.00 1011100110
     1      0.00 111000100
     3      0.00 1011100111
     A      0.00 10001101
     B      0.01  0011110
     C      0.00 111000101
     D      0.00 1110001100
     E      0.00 1110001101
     I      0.00 1110001110
     J      0.00 1110001111
     K      0.00 111001000
     L      0.01  1000111
     M      0.01  0011111
     O      0.00 1110010010
     R      0.01 11100101
     S      0.00 10111000
     T      0.00 00110100
     a      0.09     1111
     b      0.02   100100
     c      0.03    10011
     d      0.04    10110
     e      0.09      000
     f      0.02   101111
     g      0.01  1110011
     h      0.03    10000
     i      0.06     0111
     k      0.01   001110
     l      0.04    11101
     m      0.02   100101
     n      0.03    10100
     o      0.06    

In [79]:
import lzma
def lzma_compression_ratio(test_string):
    c = lzma.LZMACompressor()
    bytes_in = bytes(test_string,'utf-8')
    bytes_out = c.compress(bytes_in)
    return len(bytes_out)/len(bytes_in)
compression_ratio = lzma_compression_ratio(data)
bytes_in = bytes(data,'utf-8')
bytes_com=bytes(encoded,'utf-8')
print('Number of bits in the original text:',len(bytes_in))
print('Compression ratio =',compression_ratio)

Number of bits in the original text: 769
Compression ratio = 0.031209362808842653
