Generate Poetry using Robert Frost poems

Utilities: https://github.com/lazyprogrammer/machine_learning_examples/blob/master/rnn_class/util.py
    
Code: https://github.com/lazyprogrammer/machine_learning_examples/blob/master/rnn_class/srn_language.py    
    

In [1]:
import numpy as np
import string
from sklearn.utils import shuffle

import theano
import theano.tensor as T

import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
#Some utilities

def init_weights(Mi, Mo):
    """Return an (Mi, Mo) array of random initial weights.

    Entries are drawn from a standard normal distribution and divided by
    sqrt(Mi + Mo), which shrinks their spread. The values are not bounded
    to any fixed interval.
    """
    fan_total = Mi + Mo
    gaussian = np.random.randn(Mi, Mo)
    return gaussian / np.sqrt(fan_total)

def remove_punctuation(s):
    """Return `s` with every character found in string.punctuation removed.

    The two-argument form `s.translate(None, string.punctuation)` is only
    accepted by Python 2 byte strings; it raises TypeError for unicode
    strings and on Python 3. Filtering the characters explicitly gives the
    same result and works for all of them.
    """
    return ''.join(ch for ch in s if ch not in string.punctuation)

def get_robert_frost():
    """Load 'RFrost.txt' and encode every non-empty line as word indices.

    Each line is stripped, lower-cased, cleaned with remove_punctuation and
    split on whitespace. Words are numbered in order of first appearance.

    Returns:
        sentences: list of lists, one inner list per non-empty line, holding
            the integer index of each word on that line.
        word2idx: dict mapping each word to its index. 'START' and 'END' are
            reserved as 0 and 1, so corpus words are numbered from 2.
    """
    word2idx = {'START': 0, 'END': 1}
    current_idx = 2  # next free index, right after the two reserved tokens
    sentences = []
    # `with` closes the file as soon as reading is done instead of leaking
    # the handle until garbage collection.
    with open('RFrost.txt') as f:
        for line in f:
            line = line.strip()  # drop leading/trailing whitespace, incl. the newline
            if not line:
                continue  # skip blank lines
            tokens = remove_punctuation(line.lower()).split()
            sentence = []
            for t in tokens:
                if t not in word2idx:
                    word2idx[t] = current_idx  # first time we see this word
                    current_idx += 1
                sentence.append(word2idx[t])
            sentences.append(sentence)
    return sentences, word2idx

#sentences is a list (the corpus) of lists (one per non-empty line); each inner list holds the integer index of every word in that line
#word2idx is a dictionary mapping every word to its integer index


In [None]:
class SimpleRNN:
    """Simple recurrent network language model (work in progress).

    Only the parameter set-up and the per-step recurrence are defined so far:
    `fit` does not yet build a cost function or run any training.
    """

    def __init__(self, D, M, V):
        self.D = D  # dimensionality of word embedding
        self.M = M  # number of hidden units
        self.V = V  # vocabulary size

    # fit only takes X (no separate targets argument): unsupervised learning
    def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
        """Create the shared parameters and define the recurrence step.

        Args:
            X: the training sequences; only len(X) is read so far.
            learning_rate: not used yet. Note that the default 10e-1 is 1.0.
            mu, reg, epochs, show_fig: not used yet.
            activation: hidden-layer nonlinearity, stored as self.f.

        Returns:
            None. The method currently stops after defining `recurrence`.
        """
        N = len(X)  # not used yet
        D = self.D  # local aliases so the sizes are easier to access
        M = self.M
        V = self.V
        self.f = activation

        # initialize the weights
        We = init_weights(V, D)  # word embedding, one D-sized row per vocabulary entry
        Wx = init_weights(D, M)  # input -> hidden
        Wh = init_weights(M, M)  # hidden -> hidden

        bh = np.zeros(M)  # hidden bias
        h0 = np.zeros(M)  # initial hidden state

        Wo = init_weights(M, V)  # hidden -> output
        bo = np.zeros(V)  # output bias

        # make all weights into theano shared variables
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        # collect all parameters to make it easy to do gradient descent
        self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.ivector('X')  # the input: a sequence of word indices
        # The 'real' X: the embedding rows selected by those indices, a TxD
        # matrix (D is the embedding size, T the sequence length).
        # Fixed: this previously indexed with the undefined name `thx`.
        Ei = self.We[thX]
        thY = T.ivector('Y')  # targets

        def recurrence(x_t, h_t1):  # x(t) & h(t-1)
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

In [39]:
init_weights(3, 4)

array([[-0.67008123, -0.07274935, -0.27521247,  0.76064143],
       [ 0.29566258, -0.07360387,  0.01879118, -0.43411675],
       [ 0.03693915,  0.05450141, -0.3737843 , -0.18517446]])

In [22]:
# Print the raw corpus line by line. Each line keeps its trailing newline and
# print adds another one, hence the blank line after every row of output.
with open('RFrost.txt') as f:  # close the file instead of leaking the handle
    for line in f:
        print(line)  # single-argument call form is valid on Python 2 and 3

Two roads diverged in a yellow wood,

And sorry I could not travel both

And be one traveler, long I stood

And looked down one as far as I could

To where it bent in the undergrowth; 



Then took the other, as just as fair,

And having perhaps the better claim

Because it was grassy and wanted wear,

Though as for that the passing there

Had worn them really about the same,



And both that morning equally lay

In leaves no step had trodden black.

Oh, I kept the first for another day! 

Yet knowing how way leads on to way

I doubted if I should ever come back.



I shall be telling this with a sigh

Somewhere ages and ages hence:

Two roads diverged in a wood, and I,

I took the one less traveled by,

And that has made all the difference.



Whose woods these are I think I know.

His house is in the village, though; 

He will not see me stopping here

To watch his woods fill up with snow.



My little horse must think it queer

To stop without a farmhouse near

Between the woods and

In [4]:
# Show the token list produced for every line of the corpus. Blank lines are
# not filtered out here, so they show up as empty lists.
with open('RFrost.txt') as f:  # close the file instead of leaking the handle
    for line in f:
        line = line.strip()
        tokens = remove_punctuation(line.lower()).split()
        print(tokens)  # single-argument call form is valid on Python 2 and 3

['two', 'roads', 'diverged', 'in', 'a', 'yellow', 'wood']
['and', 'sorry', 'i', 'could', 'not', 'travel', 'both']
['and', 'be', 'one', 'traveler', 'long', 'i', 'stood']
['and', 'looked', 'down', 'one', 'as', 'far', 'as', 'i', 'could']
['to', 'where', 'it', 'bent', 'in', 'the', 'undergrowth']
[]
['then', 'took', 'the', 'other', 'as', 'just', 'as', 'fair']
['and', 'having', 'perhaps', 'the', 'better', 'claim']
['because', 'it', 'was', 'grassy', 'and', 'wanted', 'wear']
['though', 'as', 'for', 'that', 'the', 'passing', 'there']
['had', 'worn', 'them', 'really', 'about', 'the', 'same']
[]
['and', 'both', 'that', 'morning', 'equally', 'lay']
['in', 'leaves', 'no', 'step', 'had', 'trodden', 'black']
['oh', 'i', 'kept', 'the', 'first', 'for', 'another', 'day']
['yet', 'knowing', 'how', 'way', 'leads', 'on', 'to', 'way']
['i', 'doubted', 'if', 'i', 'should', 'ever', 'come', 'back']
[]
['i', 'shall', 'be', 'telling', 'this', 'with', 'a', 'sigh']
['somewhere', 'ages', 'and', 'ages', 'hence']
['t

In [46]:
line

''

In [23]:
sentance = "What, did you say?"

In [24]:
sentance

'What, did you say?'

In [26]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
# Python 2's str.translate(table, deletechars) needs `table` to be None or a
# 256-character string, so the 3-character '257' is rejected (see the error below).
sentance.translate('257', string.punctuation)

ValueError: translation table must be 256 characters long

In [13]:
url = 'https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt'

In [8]:
def remove_punctuation(s):
    """Return `s` with every character found in string.punctuation removed.

    The two-argument form `s.translate(None, string.punctuation)` is only
    accepted by Python 2 byte strings; it raises TypeError for unicode
    strings and on Python 3. Filtering the characters explicitly gives the
    same result and works for all of them.
    """
    return ''.join(ch for ch in s if ch not in string.punctuation)

In [5]:
def get_robert_frost():
    """Load 'RFrost.txt' and encode every non-empty line as word indices.

    Each line is stripped, lower-cased, cleaned with remove_punctuation and
    split on whitespace. Words are numbered in order of first appearance.

    Returns:
        sentences: list of lists, one inner list per non-empty line, holding
            the integer index of each word on that line.
        word2idx: dict mapping each word to its index. 'START' and 'END' are
            reserved as 0 and 1, so corpus words are numbered from 2.
    """
    word2idx = {'START': 0, 'END': 1}
    current_idx = 2  # next free index, right after the two reserved tokens
    sentences = []
    # `with` closes the file as soon as reading is done instead of leaking
    # the handle until garbage collection.
    with open('RFrost.txt') as f:
        for line in f:
            line = line.strip()  # drop leading/trailing whitespace, incl. the newline
            if not line:
                continue  # skip blank lines
            tokens = remove_punctuation(line.lower()).split()
            sentence = []
            for t in tokens:
                if t not in word2idx:
                    word2idx[t] = current_idx  # first time we see this word
                    current_idx += 1
                sentence.append(word2idx[t])
            sentences.append(sentence)
    return sentences, word2idx

In [6]:
get_robert_frost()

([[2, 3, 4, 5, 6, 7, 8],
  [9, 10, 11, 12, 13, 14, 15],
  [9, 16, 17, 18, 19, 11, 20],
  [9, 21, 22, 17, 23, 24, 23, 11, 12],
  [25, 26, 27, 28, 5, 29, 30],
  [31, 32, 29, 33, 23, 34, 23, 35],
  [9, 36, 37, 29, 38, 39],
  [40, 27, 41, 42, 9, 43, 44],
  [45, 23, 46, 47, 29, 48, 49],
  [50, 51, 52, 53, 54, 29, 55],
  [9, 15, 47, 56, 57, 58],
  [5, 59, 60, 61, 50, 62, 63],
  [64, 11, 65, 29, 66, 46, 67, 68],
  [69, 70, 71, 72, 73, 74, 25, 72],
  [11, 75, 76, 11, 77, 78, 79, 80],
  [11, 81, 16, 82, 83, 84, 6, 85],
  [86, 87, 9, 87, 88],
  [2, 3, 4, 5, 6, 8, 9, 11],
  [11, 32, 29, 17, 89, 90, 91],
  [9, 47, 92, 93, 94, 29, 95],
  [96, 97, 98, 99, 11, 100, 11, 101],
  [102, 103, 104, 5, 29, 105, 45],
  [106, 107, 13, 108, 109, 110, 111],
  [25, 112, 102, 97, 113, 114, 84, 115],
  [116, 117, 118, 119, 100, 27, 120],
  [25, 121, 122, 6, 123, 124],
  [125, 29, 97, 9, 126, 127],
  [29, 128, 129, 130, 29, 131],
  [106, 132, 102, 133, 134, 6, 135],
  [25, 136, 76, 49, 104, 137, 138],
  [29, 139, 3

In [55]:
word2idx = {'START': 0, 'END': 1}

In [58]:
word2idx['somthing'] = 5

In [59]:
word2idx

{'END': 1, 'START': 0, 'somthing': 5}

In [67]:
some = word2idx['somthing']
some

5

In [68]:
word2idx['START']

0

In [7]:
# NOTE: get_robert_frost() returns (sentences, word2idx), so Y here is the
# word -> index dictionary, not a set of targets.
X, Y = get_robert_frost()

In [63]:
print X

[[2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15], [9, 16, 17, 18, 19, 11, 20], [9, 21, 22, 17, 23, 24, 23, 11, 12], [25, 26, 27, 28, 5, 29, 30], [31, 32, 29, 33, 23, 34, 23, 35], [9, 36, 37, 29, 38, 39], [40, 27, 41, 42, 9, 43, 44], [45, 23, 46, 47, 29, 48, 49], [50, 51, 52, 53, 54, 29, 55], [9, 15, 47, 56, 57, 58], [5, 59, 60, 61, 50, 62, 63], [64, 11, 65, 29, 66, 46, 67, 68], [69, 70, 71, 72, 73, 74, 25, 72], [11, 75, 76, 11, 77, 78, 79, 80], [11, 81, 16, 82, 83, 84, 6, 85], [86, 87, 9, 87, 88], [2, 3, 4, 5, 6, 8, 9, 11], [11, 32, 29, 17, 89, 90, 91], [9, 47, 92, 93, 94, 29, 95], [96, 97, 98, 99, 11, 100, 11, 101], [102, 103, 104, 5, 29, 105, 45], [106, 107, 13, 108, 109, 110, 111], [25, 112, 102, 97, 113, 114, 84, 115], [116, 117, 118, 119, 100, 27, 120], [25, 121, 122, 6, 123, 124], [125, 29, 97, 9, 126, 127], [29, 128, 129, 130, 29, 131], [106, 132, 102, 133, 134, 6, 135], [25, 136, 76, 49, 104, 137, 138], [29, 139, 33, 140, 29, 141], [130, 142, 143, 9, 144, 145], [29, 97, 99, 