# Beatles Song Lyrics Generator

## This is a script to generate Beatles-style song lyrics using TensorFlow and Keras

##### First we call the libraries we need 

In [None]:
import pylyrics3 # web lyrics scraper
import time 
import json
import pandas as pd
import sys

import tensorflow as T

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

##### In order to get our training set we scrape the web with pylyrics3, searching for all Beatles lyrics:

In [None]:
# Fetch all Beatles songs through the pylyrics3 web scraper.
# The result is consumed below via `.items()` as (song, lyrics) pairs,
# i.e. it is used as a dictionary keyed by song.
btls = pylyrics3.get_artist_lyrics('beatles')

In [None]:
# Turn the scraped (song, lyrics) pairs into a two-column DataFrame and
# keep a CSV copy on disk in case the web site eventually goes down.
song_lyric_pairs = list(btls.items())
btlsDF = pd.DataFrame(song_lyric_pairs, columns=("Song", "Lyrics"))
btlsDF.to_csv('beatles_lyrics.csv')

##### Let's have a look at the data

In [None]:
# Preview the first rows of the lyrics table.
btlsDF.head()

In [None]:
# Preview the last rows of the lyrics table.
btlsDF.tail()

In [None]:
# Show one entire song: raise the column display width so the lyrics are
# not truncated, then display the Lyrics entry of the first row.
pd.set_option('display.max_colwidth', 3000)
btlsDF['Lyrics'].head(1)

##### We see that we have 264 songs: the Song column identifies the song and the Lyrics column contains the full song lyrics, including line breaks.

##### We can now start preparing the data so that we can feed it to our LSTM net. We will concatenate all lyrics into one big string and feed the net chunks of 100 characters; the 101st character following each chunk will be our labelled output.

In [None]:
### Join the lyrics of every song into one long string, in lowercase
all_lyrics = btlsDF['Lyrics'].str.cat()
raw_text = all_lyrics.lower()

# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(raw_text))

# Lookup table from each character to its integer code (its position in `chars`)
char_to_int = {ch: code for code, ch in enumerate(chars)}

In [None]:
# Corpus length (in characters) and vocabulary size (distinct characters)
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

In [None]:
# Length of each input window, in characters
seq_length = 100

# Encode the whole corpus as integers once; every window is then a slice of it.
encoded_text = [char_to_int[ch] for ch in raw_text]

# One training pattern per start offset: 100 encoded characters as input and
# the single character that follows them as the expected output.
window_starts = range(n_chars - seq_length)
dataX = [encoded_text[s:s + seq_length] for s in window_starts]
dataY = [encoded_text[s + seq_length] for s in window_starts]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

In [None]:
# Arrange the inputs as [samples, time steps, features] with one feature per step
X = np.asarray(dataX).reshape(n_patterns, seq_length, 1)
# Scale the integer codes by the vocabulary size
X = X / float(n_vocab)
# One-hot encode the expected output characters
y = np_utils.to_categorical(dataY)

##### The data is ready now, so we start building our net. This process with keras is extremely easy:
* Initialise model
* Add first LSTM layer (including input parameters) + dropout
* Add second LSTM layer + dropout
* Add output layer
* Compile

In [None]:
# Two stacked LSTM layers, each followed by dropout, and a softmax output
# with one unit per one-hot output class.
model = Sequential([
    # First LSTM returns the full sequence so the second LSTM can consume it
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.3),
    LSTM(128),
    Dropout(0.3),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

##### Before starting, as this training can take a lot of time (several days with my poor laptop!), we are going to define checkpoints that will take care of saving after each epoch if the loss has improved. By doing this we can train the model in batches at our own pace and ensure we don't lose everything in case the computer shuts down accidentally.

In [None]:
# Checkpoint callback: after an epoch whose training loss is a new minimum,
# write a checkpoint file whose name records the epoch number and the loss.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Let the game begin...
# Train for 20 epochs with batches of 56 samples; the checkpoint callback
# list is passed in so it is invoked during training.
model.fit(X, y, epochs=20, batch_size=56, callbacks=callbacks_list)

##### Once the training is complete we recover the weights with the lowest loss

In [None]:
# Restore the saved network weights and compile the model again.
# NOTE: the file name follows the checkpoint pattern (epoch 12, loss 1.2382)
# and is specific to one training run; point it at your own best checkpoint.
filename = "weights-improvement-12-1.2382-bigger.hdf5"
model.load_weights(filename)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Reverse lookup table: integer code -> character
int_to_char = dict(enumerate(chars))

### To make our own lyrics we give 100 chars from the set to the net and we
### start asking it to produce one character at a time

# Pick a random training window as the seed. randint's upper bound is
# exclusive, so len(dataX) makes every pattern (including the last) eligible.
start = np.random.randint(0, len(dataX))
# Work on a copy: the loop appends to `pattern`, and appending to the
# original list would add an extra element to dataX[start].
pattern = list(dataX[start])

# Generate characters
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
for _ in range(300):
    # Shape and scale the current window the same way as the training inputs
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: take the most probable next character
    index = int(np.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: add the predicted character and drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

##### If we have trained our model enough, it gives some interesting results. It is usually a bunch of gibberish, but you can see the style straight away. It normally produces really funny results. I love this script :)