In [59]:
import heapq
import collections


def get_rate(compressed_binary, uncompressed_bits):
    return (uncompressed_bits-len(compressed_binary)) * 100 / uncompressed_bits

class HuffmanCompression:
    class Trie:
        def __init__(self, val, char=''):
            self.val = val
            self.char = char
            self.coding = ''
            self.left = self.right = None

        def __eq__(self, other):
            return self.val == other.val

        def __lt__(self, other):
            return self.val < other.val

        def __gt__(self, other):
            return self.val > other.val

    def __init__(self, string):
        self.string = string
        counter = collections.Counter(string)
        heap = []
        for char, cnt in counter.items():
            heapq.heappush(heap, HuffmanCompression.Trie(cnt, char))

        while len(heap) != 1:
            left = heapq.heappop(heap)
            right = heapq.heappop(heap)
            trie = HuffmanCompression.Trie(left.val + right.val)
            trie.left, trie.right = left, right
            heapq.heappush(heap, trie)

        self.root = heap[0]
        self.s2b = {}
        self.bfs_encode(self.root, self.s2b)

    def bfs_encode(self, root, s2b):
        queue = collections.deque()
        queue.append(root)
        while queue:
            node = queue.popleft()
            if node.char:
                s2b[node.char] = node.coding
                continue
            if node.left:
                node.left.coding = node.coding + '0'
                queue.append(node.left)
            if node.right:
                node.right.coding = node.coding + '1'
                queue.append(node.right)

    def compress(self):
        bits = ''
        for char in self.string:
            bits += self.s2b[char]
        return bits

    def uncompress(self, bits):
        string = ''
        root = self.root
        for bit in bits:
            if bit == '0':
                root = root.left
            else:
                root = root.right
            if root.char:
                string += root.char
                root = self.root
        return string


def huff_compress(string):
    s = string
    # ASCII
    bits = len(s) * 8
    print('Total bits: %d' % bits)
    binary=''.join('{0:08b}'.format(ord(x), 'b') for x in s)
    print('orignal size: ',len(binary))
    #print('orignal_binary: ',binary,'\n\n')
    # huffman compression
    hc = HuffmanCompression(s)
    h_compressed = hc.compress()
    print('compressed size: ',len(h_compressed))
    #print('Compressed binary: ' + h_compressed +'\n\n')
    #print('Uncompressed: ' + hc.uncompress(h_compressed))
    #print(hc.s2b)
    #print('Huffman Compression-compress rate: %d%%' % get_rate(h_compressed, bits))
    return h_compressed

In [60]:
#data = open('compression_data_3.txt',encoding="ISO-8859-1").read()
data = open('compression_data_6.txt',encoding='ISO-8859-1').read()
#data=data.replace(' ', '_')
#data=data.replace('',' # ')
data.rstrip()
print(len(data))
print(data[:1000])

3082579
One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections. The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked.

"What's happened to me?" he thought. It wasn't a dream. His room, a proper human room although a little too small, lay peacefully between its four familiar walls. A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame. It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm 

In [61]:
h_compressed=huff_compress(data)
file=open('huffman_compress.txt','w')
file.write(h_compressed)
file.close()

Total bits: 24660632
orignal size:  24660632
compressed size:  13273945


In [62]:
def compress(uncompressed):
    """Compress a string to a list of output symbols."""
 
    # Build the dictionary.
    dict_size = 256
    #dictionary = dict((chr(i), i) for i in range(dict_size))
    # in Python 3:
    dictionary = {chr(i): i for i in range(dict_size)}
 
    w = ""
    result = []
    for c in uncompressed:
        wc = w + c
        if wc in dictionary:
            w = wc
        else:
            result.append(dictionary[w])
            # Add wc to the dictionary.
            dictionary[wc] = dict_size
            dict_size += 1
            w = c
 
    # Output the code for w.
    if w:
        result.append(dictionary[w])
    return result
 
 
def decompress(compressed):
    """Decompress a list of output ks to a string."""
    try:
        from cStringIO import StringIO
    except:
        from io import StringIO
    # Build the dictionary.
    dict_size = 256
    #dictionary = dict((i, chr(i)) for i in xrange(dict_size))
    # in Python 3: 
    dictionary = {i: chr(i) for i in range(dict_size)}
 
    # use StringIO, otherwise this becomes O(N^2)
    # due to string concatenation in a loop
    result = StringIO()
    w = chr(compressed.pop(0))
    result.write(w)
    for k in compressed:
        if k in dictionary:
            entry = dictionary[k]
        elif k == dict_size:
            entry = w + w[0]
        else:
            raise ValueError('Bad compressed k: %s' % k)
        result.write(entry)
 
        # Add w+entry[0] to the dictionary.
        dictionary[dict_size] = w + entry[0]
        dict_size += 1
 
        w = entry
    return result.getvalue()
 
 
 # How to use:
compressed = compress(data)
#compressed_directly=compress(data)
#print (compressed)
#decompressed = decompress(compressed)
#print (decompressed)

In [63]:
kii=' '.join((str(key) for key in compressed))

In [64]:
listkii=kii.split(' ')
listkii=[int(code) for code in listkii]
listkii==compressed

True

In [65]:
print(len(kii))
print(len(data))
#kii

2462001
3082579


In [66]:
file=open('lzw_compression.txt','w')
file.write(kii)
file.close()

In [67]:
dkii=decompress(listkii)
len(dkii)

3082579

In [23]:
mapping={}
for i in range(len(compressed)):
    if compressed[i] not in mapping:
        mapping[compressed[i]]=[i]
    else:
        mapping[compressed[i]].append(i)
#mapping

In [24]:
h_c=list(h_compressed)
#h_c=list('111001110')
#h_c

In [25]:
new=[]
i=0
while i in range(len(h_c)):
    #print(i)
    count=1
    j=i
    while(i<len(h_c)-1 and h_c[i]==h_c[i+1]):
        #print(i)
        count+=1
        i+=1
    if count>2:
        new.append(h_c[i])
        new.append(str(count))
    elif count==2:
        new.append(h_c[i-1])
        new.append(h_c[i])
    else:
        new.append(h_c[i])
    i+=1
#print(new)

In [26]:
old=[]
for i in range(len(new)):
    if int(new[i])!=0 and int(new[i])!=1:
        for j in range(int(new[i])-1):
            old.append(new[i-1])
    else:
        old.append(new[i])
#print(old)

In [224]:
print(len(h_c),len(new),len(old))

510530 365507 510530


In [225]:
#print(''.join(new))

In [226]:
#print(''.join(h_c))

In [227]:
hc.uncompress(h_compressed)

't_ieg_tinele_hh___oyn_r;whinko_y,n,boa.sdei_i_\neufetwk_yeoe.T__so_hlre_kra.wows_een,,rmtwkle_rae_ct__Wlbe_k;eLweshtn,nihl_gso_inh__Hrae_\nehay\nwhgei.sh!ips_\nT__wh_so_iwh_et__yiv_oes_n_tefwesysttygw_\ntorae_ceutu_hetanrr_\n.ohbwer_sutwk_eT__sLtoyt__her_neoi_ivikHn_eohrkettcti\nf_t_h_oreokyiyl_t\ne_sneo_ih__ehweslboidoeTou\nT__etotwk\nwesh__ake.oysuhrkn,r_pTnn_nl_gsm_pe,r_twhey___ywnoh_htoer_\n_W_ttyuynewk_ta\n__\n_o\nat_tok,r_n_o\nml_,rraps__onhtt__nhHhlrtth_oenihnr_ne_so_iuwtwylguouolotr\nlo_n_snrdlegk_ak.tdoolrt_\n_udhbel_g_,kt\neut__u\n_sewe.l_gsm_eilps_\n_rnHyie_kdtie_\nilp_ethvygn__n_tefwesl,ke_into,__peh__swetadrower_hefoin_lhekduckh\n.oiweekt\ntoewu_t_o_tthantt\noH_,r_ntd,etnse\npsoenkti,_\n_yewes_udulb\n_o_ae\ntw_od_o_y,n,bone_\n_\nenhttaniyl\n\ntmpebtrkeT__nhIgolhkn_ieor_hTl_e_rnttlcw_n_nlrt_kes_ewoWlih__ohrvyrvhr_ebwith\nyeWet__peUokia,snT__kT\noo_le__e_,oows_reoisok_hnihnubt_uTw_Ttt__n_tiorkrifet__udu_own__n_dreT_nl_eT__dreT\nue_gkv,,kdseeohb,__nienin_hke_ytegh__h_hekthgd

In [228]:
file=open('huffman_compress_comp.txt','w')
file.write(''.join(new))
file.close()