# Homework: Decipherment

In [26]:
from collections import defaultdict, Counter
import ngram
from ngram import *
import collections
import pprint
import math
import bz2
import numpy
import time
import pandas as pd
import numpy as np
pp = pprint.PrettyPrinter(width=45, compact=True)


In [2]:
def read_file(filename):
    if filename[-4:] == ".bz2":
        with bz2.open(filename, 'rt') as f:
            content = f.read()
            f.close()
    else:
        with open(filename, 'r') as f:
            content = f.read()
            f.close()
    return content

def get_statistics(content, cipher=True):
    stats = {}
    content = list(content)
    split_content = [x for x in content if x != '\n' and x!=' ']
    length = len(split_content)
    symbols = set(split_content)
    uniq_sym = len(list(symbols))
    freq = collections.Counter(split_content)
    rel_freq = {}
    for sym, frequency in freq.items():
        rel_freq[sym] = (frequency/length)*100
        
    if cipher:
        stats = {'content':split_content, 'length':length, 'vocab':list(symbols), 'vocab_length':uniq_sym, 'frequencies':freq, 'relative_freq':rel_freq}
    else:
        stats = {'length':length, 'vocab':list(symbols), 'vocab_length':uniq_sym, 'frequencies':freq, 'relative_freq':rel_freq}
    return stats

def find_mappings(ciphertext, plaintext):
    mappings = defaultdict(dict)
    hypotheses = defaultdict(dict)
    
    for symbol in ciphertext['vocab']:
        for letter in plaintext['vocab']:
            hypotheses[symbol][letter] = abs(math.log((ciphertext['relative_freq'][symbol]/plaintext['relative_freq'][letter])))
    
    for sym in hypotheses.keys():
        winner = sorted(hypotheses[sym].items(), key=lambda kv: kv[1])
        mappings[sym] = winner[1][0]
    
    return mappings

In [3]:
cipher = read_file("data/cipher.txt")
plaintxt = read_file("data/default.wiki.txt.bz2")


cipher_desc = get_statistics(cipher, cipher=True)
plaintxt_desc = get_statistics(plaintxt, cipher=False)

mapping = find_mappings(cipher_desc, plaintxt_desc)

english_text = []
for symbol in cipher_desc['content']:
    english_text.append(mapping[symbol])
decipherment = ('').join(english_text)
print(decipherment)

dmmbgbuumgbubbgbugggububgdgmbyyluugbubumgvlbbbyubggbkbduumbugumvuylggbgggbbbybbgggugubglbbdymgglgggkbkubbmugybglbubybuugbbmuubglgggubuugbgylbmgglyggggbduumbugxbgybkuguggbgbbbggmggbggybuggmdbugbubybubbgbygmggguubmggbggygbbbmybggdgggybgggkmkubggduuggyggbbbmbbbbyvuugvbkbmmmgbggbbgbdmmgvgmuugbuglglgbugbgbdgdumbbguubggbulgbblgggubuyggbugdmugbggybugdkggbbyvgyblgubuugugmbugybmbbgbbbblggbmgbumygggggbdgmglggggbumg


In [4]:
symbol_list = []; 
symbol_relFreq = []; 
for x, y in cipher_desc["relative_freq"].items():
    symbol_list.append(x)
    symbol_relFreq.append(y)
    
index_names = {}
for i in range(54):
    index_names[i] = symbol_list[i]
    
test_data = numpy.ones((54,26))
df = pd.DataFrame(test_data, columns = plaintxt_desc['vocab'])
df=df.rename(index = index_names )


In [120]:
def satisfy_ext_limits(phi_obj,nkeep) : 
    
    l = dict([(i[0],0) for i in phi_obj])
    for elem in phi_obj : 
        l[str(elem[0])]+=1
   
    n_lengths=list(filter(lambda x:x>nkeep,list(l.values())))
   
    if n_lengths == [] : 
        return True 
    else : 
        return False
    
def score_partial_hypothesis(cipher, phi,lm) :
    
    reverse_phi= dict([(i[1],i[0]) for i in phi ])
    f_phi_list = [i[1] for i in phi]
    
    deciphered_tokens=[]
    overall_score=0
    
    for f in cipher : 
       
        if f in f_phi_list : 
            deciphered_tokens.append(reverse_phi[str(f)])
        else : 
            deciphered_tokens.append("_")  
        
    
    arg1 = "".join(deciphered_tokens)
    #print('arg1 = ',arg1)
    score = lm.score_seq(arg1)     
    return score

def hist_prune(H,top_n) : 
    
    scores = [float(i[1]) for i in H]
    scores_s = sorted(H,key=lambda x:x[1])
    return scores_s[-top_n:]



    
#SAMPLE_PHI=[('', ''), ('e', '—'), ('e', 'º'), ('u', 'B'), ('v', 'R')]
#satisfy_ext_limits(SAMPLE_PHI,3)


#PHI=[('b','B')]
#score_partial_hypothesis("BURGER",PHI,lm)

#SAMPLE=[([('', ''), ('e', 'O'), ('h', 'T')], -32.71637948), ([('', ''), ('e', 'O'), ('s', 'T')], -21.396480099999998), ([('', ''), ('e', 'O'), ('o', 'T')], -39.44501403), ([('', ''), ('e', 'O'), ('j', 'T')], -33.47798432999999), ([('', ''), ('e', 'O'), ('d', 'T')], -20.987173), ([('', ''), ('e', 'O'), ('v', 'T')], -30.673667710000004), ([('', ''), ('e', 'O'), ('g', 'T')], -29.781602661000004), ([('', ''), ('e', 'O'), ('f', 'T')], -28.604963550999997), ([('', ''), ('e', 'O'), ('a', 'T')], -29.75088907), ([('', ''), ('e', 'O'), ('r', 'T')], -24.553558436), ([('', ''), ('e', 'O'), ('x', 'T')], -32.67641992), ([('', ''), ('e', 'O'), ('m', 'T')], -30.1644232), ([('', ''), ('e', 'O'), ('t', 'T')], -29.95256448), ([('', ''), ('e', 'O'), ('b', 'T')], -26.972342800000003), ([('', ''), ('e', 'O'), ('u', 'T')], -34.10374856), ([('', ''), ('e', 'O'), ('c', 'T')], -33.261454670000006), ([('', ''), ('e', 'O'), ('p', 'T')], -29.9686529), ([('', ''), ('e', 'O'), ('q', 'T')], -38.0978153), ([('', ''), ('e', 'O'), ('w', 'T')], -31.314459720000002), ([('', ''), ('e', 'O'), ('k', 'T')], -29.489249459999996), ([('', ''), ('e', 'O'), ('n', 'T')], -27.4332253), ([('', ''), ('e', 'O'), ('z', 'T')], -31.70670717), ([('', ''), ('e', 'O'), ('i', 'T')], -30.30622924), ([('', ''), ('e', 'O'), ('y', 'T')], -32.38038479), ([('', ''), ('e', 'O'), ('l', 'T')], -26.465809439999997)]
#hist_prune(SAMPLE,3)

In [6]:
freq_dict=[ (k,v) for k,v in zip(cipher_desc['frequencies'].keys(),cipher_desc['frequencies'].values())]
sorted_freq_dict=sorted(freq_dict, key=lambda x:max([v[1] for v in freq_dict])-x[1])
sorted_symbols=[s[0] for s in sorted_freq_dict]

lm = ngram.LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...
Done.


In [130]:
def beam_search(ext_order, ext_limits,Vf,nkeep):
    Hs = []
    Ht = []
    cardinality = 0
    Hs.append(([('','')],0))
    Ve = plaintxt_desc['vocab']
    new_phi=[]
    while cardinality < len(Vf) - 1:
        f = ext_order[cardinality]
        print("Hs = ",Hs)
        print("Ht = ",Ht)
        for h in Hs:
            phi=h[0]
            for e in Ve:
                for p in phi : 
                    new_phi.append(p)
                new_phi.append((e,f))
                print('newphi={}'.format(new_phi))
                if satisfy_ext_limits(new_phi,ext_limits):
                    SCORE=score_partial_hypothesis("".join(cipher_desc['content']),new_phi,lm)
                    Ht.append((new_phi,SCORE))
                new_phi=[]
            
        Ht = hist_prune(Ht,nkeep)
        Hs=Ht
        Ht=[]
        cardinality = cardinality + 1
    return Hs

In [131]:
## TESTING BEAM SEARCH ON SIMPLE 1:1 SUBSITUTION CIPHER

sample_text="thescoreestimationfunctionneedstopredicthowgoodorbadapartialhypothesis"
cipher_text = sample_text.upper()

s1 = get_statistics(sample_text,cipher=False)
s2 = get_statistics(cipher_text,cipher=True)

def get_sorted_syms(x1,x2) : 
    freq_dict=[ (k,v) for k,v in zip(x2['frequencies'].keys(),x2['frequencies'].values())]
    sorted_freq_dict=sorted(freq_dict, key=lambda x:max([v[1] for v in freq_dict])-x[1])
    sorted_symbols=[s[0] for s in sorted_freq_dict]
    return sorted_symbols

#beam_search(sorted_symbols, 1,cipher_desc['vocab'])
ss = get_sorted_syms(s1,s2)

beam_search(ss,1,s2['vocab'],1)


Hs =  [([('', '')], 0)]
Ht =  []
Hs =  [([('', ''), ('h', 'O'), ('s', 'O'), ('o', 'O'), ('j', 'O'), ('d', 'O'), ('v', 'O'), ('g', 'O'), ('f', 'O'), ('a', 'O'), ('r', 'O'), ('x', 'O'), ('m', 'O'), ('t', 'O'), ('b', 'O'), ('e', 'O'), ('u', 'O'), ('c', 'O'), ('p', 'O'), ('q', 'O'), ('w', 'O'), ('k', 'O'), ('n', 'O'), ('z', 'O'), ('i', 'O'), ('y', 'O'), ('l', 'O')], -8.53472662)]
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []
Hs =  []
Ht =  []


[]

In [33]:
scores=[float(i[1]) for i in HT]
scores

[-5412.5280160000175,
 -4964.34780800002,
 -4858.411599999999,
 -9907.329568000006,
 -5650.728688000006,
 -7652.8057119999785,
 -6658.504223999993,
 -6543.461887999999,
 -4584.423407999982,
 -4960.800608000018,
 -10063.048272,
 -6322.216527999989,
 -4557.870319999997,
 -7034.856336000015,
 -4084.861689599994,
 -6332.424063999991,
 -5977.461696000011,
 -6671.305472000022,
 -11104.128079999973,
 -6887.577728000008,
 -8201.991504000032,
 -4812.038864,
 -10781.95116799998,
 -4826.4549760000045,
 -7083.239311999987,
 -5648.251311999994]