## Initializing stuff

Changes: adding word tokenize, making the "chars" vector contain words rather than actual chars. Yes, this is confusing, but hopefully if I say it right up front it won't be so bad. 

In [93]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import cmudict
import sys
import re
from sklearn.preprocessing import normalize


pronouncing_dict = cmudict.dict()
def nsyl(word):
    if word not in pronouncing_dict:
        if re.search('\W', word):
            # if the word has non-word characters
            return 0
        else:
            # take a guess: number of vowels
            return len(re.findall("[aeiou]", x))
    return [len(list(y for y in x if y[-1].isdigit())) for x in pronouncing_dict[word.lower()]][0]


# I'm making this from Trung Tran's LSTM tutorial. 
# I'm going to try to annotate it in my own words so that I can understand what's happening
# And then change it to do what I want. 

data = []
avgsyllables = 0
linecount = 0

with open("lyrics.txt", 'r') as f:
    for line in f:
        
        for word in word_tokenize(line):
            data.append(word.lower())
            avgsyllables += nsyl(word)
        linecount += 1
        data.append("\n") # I want the machine to learn newlines
        # since they're part of the lyrics
        # and have similar meaning to words. 

avgsyllables = avgsyllables / linecount
data = data[:1000]        

# features. This will eventually be words
# also, this line is what killed my results. I trained a model for a full day
# it was performing great. 
chars = sorted(list(set(data)))





## Preparing data

Only change here is the len_sequence. I made it 700 words, it'll be learning from multiple songs at once. 

In [59]:



# conversion to numbers. This is actually a really clever solution
ix_to_char = {ix:char for ix, char in enumerate(chars)}
char_to_ix = {char:ix for ix, char in enumerate(chars)}

# setting up parameters. 
num_features = len(chars)
# length of the group of words that the lstm will be shown at a time
len_sequence = 70
num_sequences = len(data)//len_sequence


# need input and output tapes for the LSTM
X = np.zeros((len(data)//len_sequence, len_sequence, num_features))
y = np.zeros((len(data)//len_sequence, len_sequence, num_features))

# for each of the sequences
for i in range(0, len(data)//len_sequence):
    # select the characters in the data that correspond to this sequence
    X_sequence = data[i*len_sequence:(i+1)*len_sequence]
    # convert to numeric
    X_sequence_ix = [char_to_ix[value] for value in X_sequence]
    #initialize 
    input_sequence = np.zeros((len_sequence, num_features))
    for j in range(len_sequence):
        #make a 1-hot vector for each of the letters the lstm is being shown
        input_sequence[j][X_sequence_ix[j]] = 1

    X[i] = input_sequence
    
    # select targets: the symbol that follows
    y_sequence = data[i*len_sequence+1:(i+1)*len_sequence+1]
    # convert to numeric
    y_sequence_ix = [char_to_ix[value] for value in y_sequence]
    target_sequence = np.zeros((len_sequence, num_features))
    for j in range(len_sequence):
        target_sequence[j][y_sequence_ix[j]] = 1
        
    y[i] = target_sequence
    
    


## Setting up network

Taking cues from Tran, I set it up to have 700 hidden states, with 0.3 dropout at the first layer and three layers

In [60]:
# initialize a sequential network
model = Sequential()

hidden_dim = 700
layer_num = 3

# add an initial lstm layer
# I don't know why the input shape needs to be a tuple
# the return-sequences parameter makes it give you multiple outputs
model.add(LSTM(hidden_dim, 
               input_shape=(None, num_features), 
               return_sequences=True))
model.add(Dropout(0.3))

# add more layers
# I don't really know what adding a layer to an LSTM means, 
#  they're only ever shown with one.

for i in range(layer_num-1):
    model.add(LSTM(hidden_dim, return_sequences=True))

# I don't see why the dense layer is necessary, but Tran says it is.
# and in order to get the dense layer to work, a time distributed layer
#  needs to go between.

model.add(TimeDistributed(Dense(num_features)))

# pick an activation for this layer
model.add(Activation('softmax'))

# pick a loss function and optimization method. 

model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

## Heuristic

This is the part where I get to be creative. 



In [119]:
def random_pick(model_predictions):
    # Select random character from the best ten
    candidates = np.argpartition(model_predictions[0,0], -10)[-10:]
    return [np.random.choice(candidates)]
    

def softmax_pick(model_predictions):
    return [np.random.choice(model_predictions.shape[2], p=model_predictions[0,0])]



def beam_search(model_predictions, history, i):
    # Select the character most likely to lead down a good path
    best_ix = 0
    candidates = np.argpartition(model_predictions[0,-1], -10)[-10:]
#     print(model_predictions.shape) (1, 1, numfeatures)
    print(np.argmax(model_predictions[0], 1))
    print(candidates)
    candidate_predictions = [model_predictions[0, -1, c] for c in candidates]
    print(candidate_predictions)
#     print(normalize(np.asarray([candidate_predictions])))
    
    h = np.zeros(len(candidates))
    
    for i in range(len(candidates)):
        candidate = candidates[i]
        pred = candidate_predictions[i]
        word = ix_to_char[candidate]
        print("candidate: {} score: {} word:{}".format(candidate, model_predictions[0,0,candidate], word))

        h[i] = heuristic(history, word, pred)
        print("adjucted score: ", h[i])
#         if score < best_score:
#             best_score = score
#             best_candidate = candidate
    print("scores: ", h)
    # softmax preserves ranking but squishes values
    # temperature lets you play with how far the values change
    # if > 1 rank 1 is further from rank 2
    # if < 1 rank 1 is closer to rank 2
    h = softmax(h, 0.4) # 0 is 
    print("softmax scores: ", h)
    return [np.random.choice(candidates, p=h)]
    
#     return [best_candidate]
#     return np.argmax(model_predictions[0], 1)

def heuristic(history, word, prediction):
    # don't give more than two newlines in a row. 
    # maximizing heuristic
    if word == '\n' and len(history) > 1 and history[-2] == '\n':
        return 1000
    
    line = []
    syl_count = 0
    for back_word in history[::-1]:
        if back_word == '\n':
            break
        else:
            syl_count = nsyl(back_word)
            line.insert(0, back_word)
    syl_count += nsyl(word)
    print("syllable count ", syl_count)
    # return square difference between number 
    # of syllables in the sentence and the average
    # this may cause a preference for long words at the beginning
    # we'll see. 
    
    return prediction * (syl_count - avgsyllables)**-2
    

# copied from http://stackoverflow.com/questions/41902047/how-to-calculate-robust-softmax-function-with-temperature?noredirect=1&lq=1
def softmax(x, tau):
    """ Returns softmax probabilities with temperature tau
        Input:  x -- 1-dimensional array
        Output: s -- 1-dimensional array
    """
    e_x = np.exp(x / tau)
    return e_x / e_x.sum()

In [133]:
x = softmax(np.asarray([5., 6., 7.]), 1)
print(x)
for i in range(10):
    x = softmax(x, 1)
    print(x)

[ 0.09003057  0.24472847  0.66524096]
[ 0.25349765  0.29590914  0.4505932 ]
[ 0.30663709  0.31992174  0.37344117]
[ 0.32441629  0.3287548   0.34682891]
[ 0.33035861  0.33179499  0.33784641]
[ 0.33234148  0.33281919  0.33483933]
[ 0.33300268  0.3331618   0.33383551]
[ 0.33322311  0.33327614  0.33350075]
[ 0.33329659  0.33331427  0.33338914]
[ 0.33332109  0.33332698  0.33335194]
[ 0.33332925  0.33333121  0.33333953]


In [135]:
model.fit(X, y, batch_size=batch_size, verbose=1, nb_epoch=1)
generate_text(model, 30)

Epoch 1/1
ix:  [221]
will [164]
[165 109 305 112 176 285 144 164 239 284]
[0.020216838, 0.024085596, 0.029399427, 0.028467134, 0.024950704, 0.021158284, 0.02549744, 0.032174785, 0.021520717, 0.026738945]
candidate: 165 score: 0.020216837525367737 word:,
syllable count  1
adjucted score:  0.00264368911049
candidate: 109 score: 0.024085596203804016 word:save
syllable count  2
adjucted score:  0.0077284365037
candidate: 305 score: 0.02939942665398121 word:thee
syllable count  2
adjucted score:  0.00943350541202
candidate: 112 score: 0.02846713364124298 word:

syllable count  1
adjucted score:  0.0037225531006
candidate: 176 score: 0.024950703606009483 word:spring
syllable count  2
adjucted score:  0.00800602679336
candidate: 285 score: 0.021158283576369286 word:went
syllable count  2
adjucted score:  0.00678913861063
candidate: 144 score: 0.0254974402487278 word:their
syllable count  2
adjucted score:  0.00818146025125
candidate: 164 score: 0.03217478469014168 word:me
syllable count  2
ad



[164 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112 112
 112 112 112 112 112 112 112 112 112]
[192  94 221  24 243 222 126  55 186 112]
[0.011305289, 0.011607205, 0.011809105, 0.015008667, 0.022505382, 0.022529777, 0.049934659, 0.025154717, 0.056874443, 0.15990965]
candidate: 192 score: 0.005790697876363993 word:''
syllable count  0
adjucted score:  0.000797386815437
candidate: 94 score: 0.0022112438455224037 word:but
syllable count  1
adjucted score:  0.00151783582721
candidate: 221 score: 0.002327405381947756 word:will
syllable count  1
adjucted score:  0.0015442376878
candidate: 24 score: 0.01619320921599865 word:of
syllable count  1
adjucted score:  0.00196263390555
candidate: 243 score: 0.006178884766995907 word:and
syllable count  1
adjucted score:  0.00294295452723
candidate: 222 score: 0.01168826874345541 word:it
syllable count  1
adjucted score:  0.00294614458916
candidate: 126 score: 0.01395866647362709 word:i
syllable count  1
adjucted score:  0.006529790550



## Text Generation

In [134]:
# begin with some random characters and predict the next n characters

def generate_text(model, length):
    # generate a number and associated character
    ix = [np.random.randint(num_features)]
    print("ix: ", ix)
    y_char = [ix_to_char[ix[-1]]]
    # annoyingly, the big matrix of character sequences is called X
    X = np.zeros((1, length, num_features))
    
    for i in range(length):
        # for n characters
        # update the big matrix with the last prediction
        X[0, i, :][ix[-1]] = 1
        # print the last prediction to the command line
        print(ix_to_char[ix[-1]], end=" ")
        # get a new prediction
        # I don't know what most of these arguments are. 
        # I don't understand why the prediction needs to be subscripted
        # I guess the 1 means that you only get one output from argmax? 
        # return np.argmax(model.predict(X[:, :i+1, :])[0], 1)

        
        # to use other methods from the heuristic section, change this line.
        ix = beam_search(model.predict(X[:, :i+1, :]), y_char, i)

        
        # convert to character and append to array
        y_char.append(ix_to_char[ix[-1]])
    
    return (' ').join(y_char)

## Training

In [15]:
# I don't know if nb is supposed to mean something. This is just a counter


if len(sys.argv) > 1:
    model.load_weights(sys.argv[1])
    print(re.findall(r'epoch(\d)', sys.argv[1]))
    nb_epoch = int(re.findall(r'epoch(\d+)', sys.argv[1])[0])
    print("Using weights in {}".format(sys.argv[1]))
    print("epoch: {}".format(nb_epoch)) 
else:
    print("Using a new model")
    nb_epoch = 0
batch_size = 40
generate_length = 100
epoch_per_gen = 10

while True:
    print("\n\n")
    # fit the model for one epoch
    model.fit(X, y, batch_size=batch_size, verbose=1, nb_epoch=epoch_per_gen)
    # increment counter
    nb_epoch += epoch_per_gen
    # every epoch, show some text examples.
    # this is a function defined below.
    generate_text(model, generate_length)
    
    if nb_epoch % epoch_per_gen*10 == 0:
        # save every tenth epoch group
        print("epoch # {}".format(nb_epoch))
        generate_text(model, generate_length)
        model.save_weights('checkpoints/checkpoint_{}_epoch{}.hdf5'.format(hidden_dim, nb_epoch))





Epoch 1/1
whispers 

NameError: name 'selct_best' is not defined