In [15]:
from google.colab import files
def getLocalFiles():
    """Prompt for a Colab file upload and write each uploaded file to disk.

    Calls ``files.upload()`` and, for every ``name -> content`` entry it
    returns, writes ``content`` in binary mode to a file called ``name``
    (relative to the current working directory). Nothing is returned.
    """
    uploaded = files.upload()
    for name, content in uploaded.items():
        # A context manager guarantees the handle is flushed and closed,
        # instead of leaving that to garbage collection.
        with open(name, 'wb') as out:
            out.write(content)
# Prompt for the input files and write them to disk.
getLocalFiles()

# NOTE(review): this triggers a second upload prompt and the returned mapping
# is not stored anywhere — confirm whether this call is still needed.
files.upload()

{}

In [0]:
import numpy as np
import nltk
import graphviz as gv

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict
from backwards_HMM import unsupervised_HMM

In [0]:
def tokenize(filename):
  """Read a text file and return its lines as reversed lists of lowercase tokens.

  Each line is stripped; lines that consist only of digits or that are at
  most one character long are skipped. Remaining lines are lowercased and
  tokenized. Only lines that yield more than one token are kept, and each
  kept line is stored with its tokens in reverse order (last word first).

  Args:
    filename: Path of the text file to read.

  Returns:
    A list of token lists, one per kept line, each in reversed word order.
  """
  # A token is either a single word character, or a run that starts and ends
  # with a word character and may contain apostrophes and hyphens in between.
  # The pattern is a raw string so `\w` is not treated as a (invalid) string
  # escape, and the character class no longer contains `|`, which inside
  # brackets is a literal pipe rather than alternation.
  tokenizer = RegexpTokenizer(r"\w[\w'-]*\w|\w")

  tokens = []
  with open(filename) as f:
    for raw_line in f:
      line = raw_line.strip()
      if line.isdigit() or len(line) <= 1:
        continue
      words = tokenizer.tokenize(line.lower())
      if len(words) > 1:
        tokens.append(words[::-1])
  return tokens

def parse_rhyme(tokens):
  """Group the words in ``tokens`` by the tail of their CMUdict pronunciation.

  For every word found in ``cmudict.dict()``, the last listed pronunciation
  is taken and its final two phonemes are joined with ``','`` to form a key.
  Words missing from the dictionary are ignored.

  Args:
    tokens: Iterable of lines, each an iterable of word strings.

  Returns:
    A dict mapping each key to a list of the distinct words that share it.
  """
  pronunciations = cmudict.dict()
  groups = {}

  for line in tokens:
    for word in line:
      if word not in pronunciations:
        continue
      ending = ','.join(pronunciations[word][-1][-2:])
      if not ending:
        continue
      groups.setdefault(ending, set()).add(word)

  # Callers get plain lists rather than sets.
  return {ending: list(words) for ending, words in groups.items()}

def assign_ids(tokens):
    """Map every distinct word to a consecutive integer id.

    Ids start at 0 and are handed out in order of first appearance while
    scanning the lines of ``tokens`` from first to last.

    Returns:
        A dict of ``word -> id``.
    """
    ids_map = {}
    for line in tokens:
        for word in line:
            # The next free id is always the number of words seen so far.
            ids_map.setdefault(word, len(ids_map))
    return ids_map

def generate_ids(tokens, ids_map):
    """Translate each line of words into the matching line of ids.

    Args:
        tokens: Iterable of lines, each an iterable of words.
        ids_map: Dict of ``word -> id``; every word in ``tokens`` must be a
            key, otherwise ``KeyError`` is raised.

    Returns:
        A list of lists of ids with the same layout as ``tokens``.
    """
    return [[ids_map[word] for word in line] for line in tokens]
        
def reverse_map(m, value):
  """Invert a mapping, in one of two modes selected by ``value``.

  Args:
    m: The dict to invert.
    value: ``'id'`` to treat ``m`` as a one-to-one ``key -> item`` dict and
      return ``item -> key``. Any other value treats ``m`` as
      ``key -> iterable of members`` and returns ``member -> [keys]``.

  Returns:
    The inverted dict as described above.
  """
  if value != 'id':
    inverted = {}
    for key, members in m.items():
      for member in members:
        if member not in inverted:
          inverted[member] = []
        inverted[member].append(key)
    return inverted

  return {item: key for key, item in m.items()}
  
def write_poem(hmm, ids_map, ids_map_r, rhyme_map, rhyme_map_r, nwords, nlines=14):
    """Generate a rhymed poem, one HMM emission per line.

    A seed word is first chosen for every line: lines listed in the rhyme
    pattern (ABAB CDCD EFEF GG) draw a random word from the rhyme group of
    the line they are paired with, all other lines draw a random word that
    has a known rhyme key. Each line is then produced by
    ``hmm.generate_emission(nwords, seed_id)``.

    Args:
        hmm: Object exposing ``generate_emission(nwords, word_id)`` that
            returns ``(ids, states)``. NOTE(review): where the seed word
            lands within the emitted line depends on that method, which is
            not visible here.
        ids_map: Dict of ``word -> id``.
        ids_map_r: Dict of ``id -> word``.
        rhyme_map: Dict of ``rhyme key -> list of words``.
        rhyme_map_r: Dict of ``word -> list of rhyme keys``.
        nwords: Number of words requested per line.
        nlines: Number of lines to generate (default 14).

    Returns:
        The poem as a single string. Every line ends with ``',\\n'`` except
        every fourth line and the final line, which end with ``'.\\n'``.
    """
    # line index -> index of the earlier line it has to rhyme with
    pattern = {2:0, 3:1, 6:4, 7:5, 10:8, 11:9, 13:12}

    end_words = [''] * nlines
    for n in range(nlines):
        if n in pattern:
            # Reuse the first rhyme key of the partner line's seed word.
            end = rhyme_map_r[end_words[pattern[n]]][0]
            end_words[n] = np.random.choice(rhyme_map[end])
        else:
            end_words[n] = np.random.choice(list(rhyme_map_r.keys()))

    pieces = []
    for n in range(nlines):
        line_ids, _states = hmm.generate_emission(nwords, ids_map[end_words[n]])
        text = ' '.join(ids_map_r[i] for i in line_ids).capitalize()

        # The last line is derived from nlines instead of a hard-coded 13,
        # so poems of any length end with a full stop.
        ends_sentence = ((n + 1) % 4 == 0) or (n == nlines - 1)
        pieces.append(text + ('.\n' if ends_sentence else ',\n'))

    return ''.join(pieces)

def make_graph(matrix, threshold=0.01):
    """Build a Graphviz digraph of the entries of ``matrix`` at or above a threshold.

    One node is created per row index. For every entry ``matrix[i, j]`` that
    is ``>= threshold`` an edge ``i -> j`` is added, labelled with the value
    formatted to two decimals.

    Args:
        matrix: 2-D array-like of weights.
        threshold: Minimum value for an entry to be drawn (default 0.01).

    Returns:
        A ``graphviz.Digraph`` configured for PNG output.
    """
    matrix = np.asarray(matrix)
    graph = gv.Digraph(format='png')
    n, m = matrix.shape

    for i in range(n):
        graph.node(str(i))

    # Keep the integer indices for array lookups and convert to strings only
    # when naming nodes; indexing the array with map() iterators fails.
    rows, cols = np.where(matrix >= threshold)
    for r, c in zip(rows.tolist(), cols.tolist()):
        graph.edge(str(r), str(c), label="%0.2f" % matrix[r, c])

    return graph

In [0]:
# Tokenize the corpus into per-line word lists.
tokens = tokenize('shakespeare.txt')

# Rhyme tables: rhyme key -> words, and word -> rhyme keys.
rhyme_map = parse_rhyme(tokens)
rhyme_map_r = reverse_map(rhyme_map, 'rhyme')

# Integer encoding of the vocabulary, its inverse, and the encoded lines.
ids_map = assign_ids(tokens)
ids_map_r = reverse_map(ids_map, 'id')
ids = generate_ids(tokens, ids_map)

In [0]:
# Train the HMM on the id-encoded lines.
# NOTE(review): 25 and 1000 are presumably the number of hidden states and the
# iteration cap — confirm against backwards_HMM.unsupervised_HMM.
model = unsupervised_HMM(ids, 25, 1000)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200
Iteration: 210
Iteration: 220
Iteration: 230
Iteration: 240
Iteration: 250
Iteration: 260
Iteration: 270
Iteration: 280
Iteration: 290
Iteration: 300
Iteration: 310
Iteration: 320
Iteration: 330
Iteration: 340
Iteration: 350
Iteration: 360
Iteration: 370
Iteration: 380
Iteration: 390
Iteration: 400
Iteration: 410
Iteration: 420
Iteration: 430
Iteration: 440
Iteration: 450
Iteration: 460


In [0]:
# Generate a poem with 8 words requested per line and show it.
poem = write_poem(
    model, ids_map, ids_map_r, rhyme_map, rhyme_map_r, nwords=8
)
print(poem)

In [0]:
# The trained model is bound to `model` in the training cell; `hmm` is never
# defined at notebook level, so read the matrix from `model`.
O = np.array(model.O)

# For each row ("state"), print the ten column indices with the largest
# values, highest first, translated back to words through ids_map_r.
for i in range(len(O)):
    top10 = O[i].argsort()[-10:][::-1]
    print("State " + str(i))
    for j in top10:
        print(ids_map_r[j] + ", ", end="")
    print("\n")

In [0]:
# Use the names that actually exist in this notebook: the trained model is
# `model` and the graph builder is `make_graph`.
A = np.array(model.A)
make_graph(A).render('naive_hmm')