# Let's train a LSTM to write Songs!

We're going to use [Keras](https://keras.io)  to write songs. At least 20 epochs are required before the generated text starts sounding coherent.

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.


### Let's check out the corpus

In [28]:
from __future__ import print_function
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
import numpy as np
import pandas as pd
import random
import sys

# Read the entire file containing the lyrics using pandas
path = './data/songdata.csv'

df = pd.read_csv(path)
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [32]:
# I'm gonna make this lowercase to decrease the "search space" - i.e. LESS DIMENSIONS
text = df['text'].str.cat(sep='\n').lower()

# Output the length of the corpus
print('corpus length:', len(text))


# Create a sorted list of the characters
chars = sorted(list(set(text)))
print('total chars:', len(chars))

corpus length: 68056106
total chars: 50


In [33]:
df = df[df['artist']=='The Beatles']
df.head()

Unnamed: 0,artist,song,link,text
1198,The Beatles,A Shot Of Rhythm And Blues,/b/beatles/a+shot+of+rhythm+blues_20014867.html,"Well, if your hands start a-clappin' \nAnd yo..."
1199,The Beatles,Across The Universe,/b/beatles/across+the+universe_10026507.html,Words are flowing out like \nEndless rain int...
1200,The Beatles,All I've Got To Do,/b/beatles/all+ive+got+to+do_10026646.html,"Whenever I want you around, yeah \nAll I gott..."
1201,The Beatles,And I Love Her,/b/beatles/and+i+love+her_10026463.html,I give her all my love \nThat's all I do \nA...
1202,The Beatles,And Your Bird Can Sing,/b/beatles/and+your+bird+can+sing_10026364.html,You tell me that you've got everything you wan...


In [34]:
text = df['text'].str.cat(sep='\n').lower()
# Output the length of the corpus
print('corpus length:', len(text))

# Corpus is going to take tooooo long to train, so lets make it shorter
text = text[:1000000]
print('truncated corpus length:', len(text))

# Create a dictionary where given a character, you can look up the index and vice versa
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3

corpus length: 163829
truncated corpus length: 163829


## Creates the overlapping windows with target characters

In [35]:

sentences = []
next_chars = []

# Step through the text via 3 characters at a time, taking a sequence of 40 bytes at a time. 
# There will be lots ofo overlap
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen]) # range from current index i for max length characters 
    next_chars.append(text[i + maxlen]) # the next character after that 
sentences = np.array(sentences)
next_chars = np.array(next_chars)
print('Number of sequences:', len(sentences))

Number of sequences: 54597


## Generates the 1 hot vectors for each character

In [36]:

def generator(sentences, next_chars, batch_size):
    X = np.zeros((batch_size, maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((batch_size, len(chars)), dtype=np.bool)
    length = len(sentences)
    index = 0
    while True:
        if index + batch_size >= length:
            index = 0
        X.fill(0)
        y.fill(0)
        for i in range(batch_size):
            sentence = sentences[index]
            for t, char in enumerate(sentence):
                X[i, t, char_indices[char]] = 1
            y[i, char_indices[next_chars[i]]] = 1
            index = index + 1
        yield X, y

        
def getdata(sentences, next_chars):
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    length = len(sentences)
    index = 0
    for i in range(len(sentences)):
        sentence = sentences[i]
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    return X, y

## Build the LSTM model

In [37]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

print("Compiling model complete...")


Build model...
Compiling model complete...


### Helper function to sample an index from a probability array
 The purpose of this function is to add some randomness so that the most likely character is not always chosen, and sometiems the 2nd or 3rd most likely cahracter is chosen

In [67]:

def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

### And now the actual training...

In [59]:
# Get data
#X, y = getdata(sentences, next_chars)

# The training
print('Training...')
batch_size = 128
history = model.fit_generator(generator(sentences, next_chars, batch_size),steps_per_epoch=12800, epochs=30)
#history = model.fit(X, y,batch_size=128, epochs=30)


# Save the model
model.save('johnlennon.h5')

Training...
Epoch 1/30
  122/12800 [..............................] - ETA: 10:30 - loss: 6.1915

KeyboardInterrupt: 

In [71]:
# Save the model
model = load_model('johnlennon.h5')



In [74]:
# Check out what our model predicts
sentence = 'behold my field of cares\nalas but there is nothing in my'
sentence = 'there is nothing like the power of a woman when shes '
#sentence = 'why is my code note'

sentence = sentence[:maxlen]
x = np.zeros((1, maxlen, len(chars)))
for t, char in enumerate(sentence):
    x[0, t, char_indices[char]] = 1.
    
print(model.predict(x, verbose=0)[0])
print(sum(model.predict(x, verbose=0)[0]))

[1.4884165e-06 7.6850206e-03 3.5848150e-06 1.4582033e-04 1.7687315e-03
 3.5089717e-05 1.9100025e-04 3.8064439e-03 1.2561942e-04 1.6119558e-03
 1.1294377e-09 1.8107810e-05 8.8854955e-07 1.5116841e-08 8.5839256e-09
 9.4603367e-11 6.3125931e-08 9.8586341e-13 6.5130507e-10 4.9237553e-08
 1.9084300e-05 1.8855568e-03 5.0276304e-07 1.1195103e-06 5.2212930e-01
 5.1812157e-03 2.6993980e-06 1.8246179e-03 1.4549249e-01 2.8826424e-04
 6.7669855e-05 3.8750095e-05 2.2228459e-02 1.7167984e-05 1.5315404e-03
 2.9484904e-03 3.5788594e-03 1.4450290e-03 3.9287549e-03 2.5030723e-01
 1.6004498e-04 4.6284986e-03 4.6266429e-03 2.4903372e-03 1.5720079e-04
 2.6668311e-04 1.7646021e-03 1.1079511e-06 7.5879502e-03 6.1874275e-06]
0.9999999495452352


In [76]:
generated = ''
variance = 0.25
original = sentence
# Predict the next 400 characters based on the seed
for i in range(800):
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        x[0, t, char_indices[char]] = 1.

    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, variance)
    next_char = indices_char[next_index]

    generated += next_char
    sentence = sentence[1:] + next_char

print(original + generated)


 want to see  
you think you want to see  
you think it time to be  
when i think it's free  
i want you mome to me  
i'm gonna be a chance to go  
i got the time is the store  
don't you always start the story  
why you can should be never make  
that you hear me to the time  
i got the time is stronger  
it's the time when i can't let me sil  
it's the only stopy stone  
and i can't see me in the same old strain  
i couldn't feel the story  
i am the time is const on  
  
i want you move to crack on my heart  
  
there's a girl with me  
you know the sun  
  
i can read your momes  
i was the time is the night  
it's all my faith and the start i know  
i want to be want to be want to see  
when i was a showerange  
and i want to see and then  
i'm a woman in the store  
don't hold you to stay  
i got the things i love you  
  
