# Homework: Decipherment

In [3]:
from collections import defaultdict, Counter
import ngram
from ngram import *
import collections
import pprint
import math
import bz2
import numpy
import time
import pandas as pd
import numpy as np
pp = pprint.PrettyPrinter(width=45, compact=True)


In [4]:
def read_file(filename):
    if filename[-4:] == ".bz2":
        with bz2.open(filename, 'rt') as f:
            content = f.read()
            f.close()
    else:
        with open(filename, 'r') as f:
            content = f.read()
            f.close()
    return content

def get_statistics(content, cipher=True):
    stats = {}
    content = list(content)
    split_content = [x for x in content if x != '\n' and x!=' ']
    length = len(split_content)
    symbols = set(split_content)
    uniq_sym = len(list(symbols))
    freq = collections.Counter(split_content)
    rel_freq = {}
    for sym, frequency in freq.items():
        rel_freq[sym] = (frequency/length)*100
        
    if cipher:
        stats = {'content':split_content, 'length':length, 'vocab':list(symbols), 'vocab_length':uniq_sym, 'frequencies':freq, 'relative_freq':rel_freq}
    else:
        stats = {'length':length, 'vocab':list(symbols), 'vocab_length':uniq_sym, 'frequencies':freq, 'relative_freq':rel_freq}
    return stats

def find_mappings(ciphertext, plaintext):
    mappings = defaultdict(dict)
    hypotheses = defaultdict(dict)
    
    for symbol in ciphertext['vocab']:
        for letter in plaintext['vocab']:
            hypotheses[symbol][letter] = abs(math.log((ciphertext['relative_freq'][symbol]/plaintext['relative_freq'][letter])))
    
    for sym in hypotheses.keys():
        winner = sorted(hypotheses[sym].items(), key=lambda kv: kv[1])
        mappings[sym] = winner[1][0]
    
    return mappings

In [5]:
cipher = read_file("data/cipher.txt")
plaintxt = read_file("data/default.wiki.txt.bz2")


cipher_desc = get_statistics(cipher, cipher=True)
plaintxt_desc = get_statistics(plaintxt, cipher=False)

mapping = find_mappings(cipher_desc, plaintxt_desc)

english_text = []
for symbol in cipher_desc['content']:
    english_text.append(mapping[symbol])
decipherment = ('').join(english_text)
#print(decipherment)
print(cipher_desc['vocab'])

['¢', 'J', '∏', '^', 'H', '—', '∞', 'O', '§', '≈', 'Q', 'M', 'π', 'I', 'S', 'V', 'G', '‘', '£', 'X', 'L', 'F', '√', '∫', '+', 'æ', 'D', 'µ', '∆', 'P', '\\', 'T', 'Ã', 'E', 'À', 'B', '–', 'Z', 'R', 'y', '/', 'W', '∑', 'u', 'A', 'j', 'Ω', 'ƒ', 'K', 'Ç', 'N', 'º', '“', '•']


In [6]:
symbol_list = []; 
symbol_relFreq = []; 
for x, y in cipher_desc["relative_freq"].items():
    symbol_list.append(x)
    symbol_relFreq.append(y)
    
index_names = {}
for i in range(54):
    index_names[i] = symbol_list[i]
    
test_data = numpy.ones((54,26))
df = pd.DataFrame(test_data, columns = plaintxt_desc['vocab'])
df=df.rename(index = index_names )


In [7]:
freq_dict=[ (k,v) for k,v in zip(cipher_desc['frequencies'].keys(),cipher_desc['frequencies'].values())]
sorted_freq_dict=sorted(freq_dict, key=lambda x:max([v[1] for v in freq_dict])-x[1])
sorted_symbols=[s[0] for s in sorted_freq_dict]

lm = ngram.LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...
Done.


In [8]:
def convert_to_bitstring(F,cipher_text) : 
    f=F[0]
    INITIAL_BS=['o' if f == t else '.' for t in cipher_text]
    for f in F[1:] :
        print(f)
        
    FINAL_BS=INITIAL_BS
    
    return "".join(FINAL_BS)
convert_to_bitstring(cipher_desc['vocab'][:3],"".join(cipher_desc['content']))

J
∏


'....................................................o................................................o.............................................................o........................................................................o...........................................................................................................................................................................'

In [9]:
def satisfy_ext_limits(phi_obj,nkeep) : 
    
   # print(phi_obj)
    l = dict([(i[0],0) for i in phi_obj])
    for elem in phi_obj : 
        l[str(elem[0])]+=1
   
    n_lengths=list(filter(lambda x:x>nkeep,list(l.values())))
   
    if n_lengths == [] : 
        return True 
    else : 
        return False
    
def score_partial_hypothesis(cipher, phi,lm) :
   
    reverse_phi= dict([(i[1],i[0]) for i in phi ])
    f_phi_list = [i[1] for i in phi]
    
    deciphered_tokens=[]
    overall_score=0
    for f in cipher : 
       
        if f in f_phi_list : 
            deciphered_tokens.append(reverse_phi[str(f)])
        else : 
            deciphered_tokens.append("_")  
        
    
    arg1 = "".join(deciphered_tokens)
    #print('arg1 = ',arg1)
    score = lm.score_seq(arg1)     
    return score

def hist_prune(H,nkeep) : 

    scores = [float(i[1]) for i in H]
    scores_s = sorted(H,key=lambda x:x[1])
    #print(scores_s)
    return scores_s[-1]


def score_beam(hs,real_phi) : 
    phi = hs[0][0]
    score=0
    for p in phi : 
        if p in real_phi : 
            score=score+1
    
    correct_ratio = score/len(phi)
    return correct_ratio

def convert_to_bitstring(f,cipher_text) : 
    return "".join(['o' if f == t else '.' for t in cipher_text])

def calculate_maximum_context_score(cipher_text,f,W) : 
    bitstring = convert_to_bitstring(f,cipher_text)
  #  print(bitstring)
    contagious_o = re.findall(r'[o]+',bitstring)
    contagious_lenghts = [len(i) for i in contagious_o]
   # print(contagious_lenghts)
    N=6
    max_context=[float(len(list(filter(lambda x:x==i,contagious_lenghts)))) for i in range(N)]
    term=np.multiply(W,max_context)
    return sum(term)
        
    
## hs=[( (A,A), (A,A) ... )]
## W=[a,a,a,a,a,a]
## cipher_text = "aaaaaa"
def sort_by_new_extension_order(cipher_text,F,W) : 
    MC_SCORE=[]
    
    for f in F : 
        
        score=calculate_maximum_context_score(cipher_text,f,W)
        MC_SCORE.append((f,score))
    
    
    return list(reversed(sorted(MC_SCORE,key=lambda x:x[1])))
    
    
def get_sorted_syms(x1,x2) : 
    freq_dict=[ (k,v) for k,v in zip(x2['frequencies'].keys(),x2['frequencies'].values())]
    sorted_freq_dict=sorted(freq_dict, key=lambda x:max([v[1] for v in freq_dict])-x[1])
    sorted_symbols=[s[0] for s in sorted_freq_dict]
    return sorted_symbols


#SAMPLE_PHI=[('', ''), ('e', '—'), ('e', 'º'), ('u', 'B'), ('v', 'R')]
#satisfy_ext_limits(SAMPLE_PHI,3)


#PHI=[('b','B')]
#score_partial_hypothesis("BURGER",PHI,lm)

#SAMPLE=[([('', ''), ('e', 'O'), ('h', 'T')], -32.71637948), ([('', ''), ('e', 'O'), ('s', 'T')], -21.396480099999998), ([('', ''), ('e', 'O'), ('o', 'T')], -39.44501403), ([('', ''), ('e', 'O'), ('j', 'T')], -33.47798432999999), ([('', ''), ('e', 'O'), ('d', 'T')], -20.987173), ([('', ''), ('e', 'O'), ('v', 'T')], -30.673667710000004), ([('', ''), ('e', 'O'), ('g', 'T')], -29.781602661000004), ([('', ''), ('e', 'O'), ('f', 'T')], -28.604963550999997), ([('', ''), ('e', 'O'), ('a', 'T')], -29.75088907), ([('', ''), ('e', 'O'), ('r', 'T')], -24.553558436), ([('', ''), ('e', 'O'), ('x', 'T')], -32.67641992), ([('', ''), ('e', 'O'), ('m', 'T')], -30.1644232), ([('', ''), ('e', 'O'), ('t', 'T')], -29.95256448), ([('', ''), ('e', 'O'), ('b', 'T')], -26.972342800000003), ([('', ''), ('e', 'O'), ('u', 'T')], -34.10374856), ([('', ''), ('e', 'O'), ('c', 'T')], -33.261454670000006), ([('', ''), ('e', 'O'), ('p', 'T')], -29.9686529), ([('', ''), ('e', 'O'), ('q', 'T')], -38.0978153), ([('', ''), ('e', 'O'), ('w', 'T')], -31.314459720000002), ([('', ''), ('e', 'O'), ('k', 'T')], -29.489249459999996), ([('', ''), ('e', 'O'), ('n', 'T')], -27.4332253), ([('', ''), ('e', 'O'), ('z', 'T')], -31.70670717), ([('', ''), ('e', 'O'), ('i', 'T')], -30.30622924), ([('', ''), ('e', 'O'), ('y', 'T')], -32.38038479), ([('', ''), ('e', 'O'), ('l', 'T')], -26.465809439999997)]
#hist_prune(SAMPLE,1)

#SAMPLE= [([('', ''), ('x', 'E'), ('i', 'T'), ('a', 'N'), ('s', 'A'), ('m', 'O'), ('e', 'R'), ('j', 'U'), ('n', 'B'), ('t', 'D'), ('o', 'P'), ('r', 'I'), ('u', 'H'), ('g', 'L'), ('f', 'S'), ('q', 'Y'), ('h', 'X'), ('y', 'G'), ('b', 'W'), ('p', 'V'), ('d', 'M')], -291.5569235510001)]

#CIPHER="".join(cipher_desc['content'])
#W=[1,1,1,1,2,3]
#print(sort_by_new_extension_order(CIPHER,cipher_desc['vocab'],W))

In [111]:
# Helper func for debugging
# Change the 'isverbose' to True to print
def printifverbose(text, isverbose=False):
    if isverbose:
        print(text)

In [112]:
def beam_search(ext_order, ext_limits,Vf,nkeep,cipher_text):
    # FOR 'BURGER' EXAMPLE:
    # ext_order: ['R', 'B', 'U', 'G', 'E']
    # Vf: ['U', 'E', 'B', 'G', 'R']
    # cipher_text: "burger"
    
    printifverbose(str("==startbeamsearch==").upper())
    
    Hs = []
    Ht = []
    # Hs and Ht will be of format '[phi, score]' 
    # which is '[list, float]'
    # [([('', '')], 0)]
    cardinality = 0
    Hs.append(([('','')],0))
    #print(Hs[0])
    Ve = plaintxt_desc['vocab']  # Ve: ['k', 'z', 'n', 's', 'o', 'q', 'd', 'c', 'i', 't', 'w', 'p', 'h', 'g', 'm', 'l', 'v', 'a', 'e', 'y', 'b', 'x', 'f', 'r', 'j', 'u']
    new_phi=[]
    
    printifverbose("len(Vf): " + str(len(Vf)))
    
    while cardinality < len(Vf):  #line5
        printifverbose("\t--beginwhile--")

        f = ext_order[cardinality]  #line6
        printifverbose("\tCurrent cipher character (f): " + f + "\n")

        # Hs is in:
        # [([('', '')], 0)]
        for h in Hs:  #line7a
            printifverbose("\t\t--beginOuterloop--")
            
            phi=h[0]  #line7b

            printifverbose("\t\tlen(Ve):" + str(len(Ve)) + "\n")
            for e in Ve:  #line8
                printifverbose("\t\t\t--beginInnerloop--")
                
                printifverbose("\t\t\tcurrent (e) --> '" + e + "'")
                
                new_eandf=(e,f)  #line9a
                printifverbose("\t\t\tcurrent (e,f) --> ('" + e + "','" + f + "')")
                
                new_phi = phi + [new_eandf] #line9b
                printifverbose("\t\t\tϕ' = ϕ ∪ {(e,f)}")
                printifverbose("\t\t\t--> " + str(new_phi))
                
                # SCORE
                if satisfy_ext_limits(new_phi,ext_limits):  #line10
                    SCORE=score_partial_hypothesis(cipher_text,new_phi,lm)  #line11a

                ht_entry=(new_phi,SCORE)  #line11b
                printifverbose("\t\t\t(ϕ', SCORE(ϕ'))")
                printifverbose("\t\t\t--> " + str(ht_entry) + "   ##Add to Ht")
        
                Ht.append((ht_entry))  #line 11c

                printifverbose("\t\t\t--endInnerloop--\n")

                
            printifverbose("\t\tHt --> " + str(Ht)) # + "\n")
    
            printifverbose("\t\t--endOuterloop--\n")
        
                
        Ht = [hist_prune(Ht,nkeep)]  #line12  ##MAKE IT A LIST
        printifverbose("\tHt after prunning --> " + str(Ht)) # + "\n")
        
        cardinality = cardinality + 1  #line13
        
        Hs=Ht  #line14
        printifverbose("\n\tHs = Ht\n\tHs --> " + str(Ht)) # + "\n")
        
        Ht=[]  #line15
    
    
        printifverbose("\t--endwhile--" + "\n")
    printifverbose("==endbeamsearch==" + "\n")
    return Hs  #WINNER(Hs)

In [116]:
## TESTING BEAM SEARCH ON SIMPLE 1:1 SUBSITUTION CIPHER

sample_text="burger"
cipher_text = sample_text.upper()

s1 = get_statistics(sample_text,cipher=False)
s2 = get_statistics(cipher_text,cipher=True)


ss = get_sorted_syms(s1,s2)
W=[1.0,1.0,1.0,1.0,2,3]
new_ss=sort_by_new_extension_order(sample_text,s2['vocab'],W)

KEEPS=1
ALPHA1="abcdefghijklmnopqrstuvwxyz"
ALPHA2="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
REAL_PHI=[(a,b) for a,b in zip(ALPHA1,ALPHA2)]

EXT_LIMIT=1
KEEPS=1

# print('ss: ')
# print(ss)
# print("s2['vocab']:")
# print(s2['vocab'])
# print("\n")

final_hs=beam_search(ss,EXT_LIMIT,s2['vocab'],KEEPS,sample_text)


In [117]:

EXT_ORDER=get_sorted_syms(plaintxt_desc,cipher_desc)
EXT_LIMITS=1
NKEEP=1
#beam_search(EXT_ORDER,EXT_LIMITS,cipher_desc['vocab'],NKEEP)

In [118]:
final_hs

[([('', ''), ('x', 'R'), ('x', 'B'), ('x', 'U'), ('x', 'G'), ('x', 'E')],
  -4.394614)]