# Lyrics Generation
#### An NLP Project for ENSF 519

## Alternative Method Using Reduced Normalization for Comparison

This notebook is an alternative to the main system that applies less text normalization; it is provided for comparison purposes.

#### Dependencies

In [1]:
'''
code based on the keras documentation and examples.
'''

# from __future__ import print_function
import argparse
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import random
import sys
import io
from tensorflow.python.client import device_lib

# data manip
import numpy as np
import pandas as pd

# callbacks
from keras.callbacks import LambdaCallback
from keras.callbacks import CSVLogger
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


#### Constants for Configuration

In [4]:
# Number of words produced by each generation loop (per-epoch previews and seeded generation).
n_words_to_generate = 100

#### Helper Functions

In [16]:
def parse_args():
    """Return the run configuration as a plain dict.

    The values are hard-coded here rather than parsed from a command line.

    Returns:
        dict with the keys:
            'seed' (str): seed text; the main cell generates lyrics when it
                is non-empty and trains a model otherwise.
            'n_epochs' (int): number of training epochs.
            'maxlen' (int): number of words in each input window.
            'use_lowercase' (bool): whether the corpus is lowercased.
            'step' (int): stride, in words, between consecutive windows.
    """
    return dict(
        seed='',
        n_epochs=5,
        maxlen=3,
        use_lowercase=False,
        step=2,
    )

def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    Adapted from the Keras LSTM text-generation example.

    Args:
        preds: array-like of probabilities.
        temperature: divisor applied to the log-probabilities before they are
            re-normalised; values below 1 sharpen the distribution and values
            above 1 flatten it.

    Returns:
        The index of the sampled entry.
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)
    # A single multinomial trial yields a one-hot row; argmax recovers the index.
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

def train_model(args):
    """Build, train and save a single-layer word-level LSTM.

    Relies on module-level names prepared by the main cell: ``text`` (the
    corpus as a list of words), ``words`` (the vocabulary), ``word_indices``
    and ``indices_word`` (lookup tables), the one-hot training arrays ``x``
    and ``y``, and ``n_words_to_generate``.

    Args:
        args: configuration dict; 'maxlen' and 'n_epochs' are read.

    Side effects:
        * prints the local devices and, after every epoch, sample text
          generated at several temperatures;
        * writes per-epoch metrics to 'training.log';
        * writes a checkpoint every 10 epochs to
          'data/nathan_model_{epoch:02d}.hdf5';
        * saves the final model to 'data/nathan_model.hdf5'.
    """

    def on_epoch_end(epoch, _):
        """Print sample text generated by the current model."""
        print('\n\n')
        print('----- Generating text after Epoch: %d' % epoch)

        # randomly obtain seed string from corpus
        start_index = random.randint(0, len(text) - args['maxlen'] - 1)
        for temperature in [0.2, 0.5, 1.0, 1.2]:
            print('----- temperature:', temperature)

            # Slicing copies, so appending generated words never alters `text`.
            word_list = text[start_index: start_index + args['maxlen']]
            seed_text = ' '.join(word_list)

            print('----- Generating with seed: "' + seed_text + '"')
            sys.stdout.write(seed_text)

            for _step in range(n_words_to_generate):
                # One-hot encode the most recent `maxlen` words.
                x_pred = np.zeros((1, args['maxlen'], len(words)))
                for t, word in enumerate(word_list[-args['maxlen']:]):
                    x_pred[0, t, word_indices[word]] = 1.

                # predict() returns one row per input sample; take the only row
                preds = model.predict(x_pred, verbose=0)[0]
                next_word = indices_word[sample(preds, temperature)]
                word_list.append(next_word)

                sys.stdout.write(' ' + next_word)
                sys.stdout.flush()
            print()

    print('keras is using the gpu if a gpu is listed below')
    print(device_lib.list_local_devices())

    # build the model: a single LSTM
    print('Build model...')
    model = Sequential()
    model.add(LSTM(128, input_shape=(args['maxlen'], len(words))))
    model.add(Dense(len(words), activation='softmax'))

    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
    checkpointer = ModelCheckpoint(filepath='data/nathan_model_{epoch:02d}.hdf5', verbose=1, period=10)
    csv_logger = CSVLogger('training.log')

    model.fit(x, y, batch_size=128, epochs=args['n_epochs'], callbacks=[print_callback, csv_logger, checkpointer])

    # The checkpoint callback only fires every 10 epochs and uses a different
    # file name, so persist the final weights explicitly at the path that
    # generation mode loads.
    model.save('data/nathan_model.hdf5')

def generate_lyrics(args, model_path='data/nathan_model.hdf5', temperature=0.2):
    """Generate lyrics from a saved model, starting from ``args['seed']``.

    Relies on the module-level ``words``, ``word_indices``, ``indices_word``
    and ``n_words_to_generate`` prepared by the earlier cells.

    Args:
        args: configuration dict; 'seed', 'maxlen' and 'use_lowercase' are read.
        model_path: path of the saved Keras model to load.
        temperature: sampling temperature passed to ``sample``.

    Raises:
        KeyError: if a seed word that would be fed to the model is not in
            the vocabulary.

    Side effects:
        Writes the seed followed by the generated words to stdout.
    """
    seed = args['seed']
    # The vocabulary is only lowercased when 'use_lowercase' is set; the seed
    # must follow the same rule or its words would miss the lookup table.
    if args['use_lowercase']:
        seed = seed.lower()
    word_list = seed.split()

    # Only the last `maxlen` seed words are ever looked up; fail early and
    # clearly on those instead of with a bare KeyError mid-generation.
    unknown = [w for w in word_list[-args['maxlen']:] if w not in word_indices]
    if unknown:
        raise KeyError('seed contains words not in the vocabulary: ' + ', '.join(unknown))

    model = load_model(model_path)

    for _step in range(n_words_to_generate):
        # One-hot encode the most recent `maxlen` words.
        x_pred = np.zeros((1, args['maxlen'], len(words)))
        for t, word in enumerate(word_list[-args['maxlen']:]):
            x_pred[0, t, word_indices[word]] = 1.

        # predict() returns one row per input sample; take the only row
        preds = model.predict(x_pred, verbose=0)[0]
        next_word = indices_word[sample(preds, temperature)]
        word_list.append(next_word)

    sys.stdout.write(' '.join(word_list))
    sys.stdout.flush()

#### Main

In [17]:
args = parse_args()

songs_csv = pd.read_csv('data/songs_all.csv', encoding='iso-8859-1')

# Join every song into one corpus string; rows without lyrics are skipped
# because a missing value cannot be joined as text.
text = '\n'.join(songs_csv.lyrics.dropna().values)
if args['use_lowercase']:
    text = text.lower()

# Tokenise on whitespace once, then derive the sorted vocabulary from it.
text = text.split()
words = sorted(set(text))
print('total words:', len(words))

word_indices = dict((w, i) for i, w in enumerate(words))
indices_word = dict((i, w) for i, w in enumerate(words))

# cut the text into semi-overlapping sequences of maxlen words each
sentences = []
next_words = []
for i in range(0, len(text) - args['maxlen'], args['step']):
    sentences.append(text[i: i + args['maxlen']])
    next_words.append(text[i + args['maxlen']])
print('num sequences:', len(sentences))

print('Vectorization...')
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy releases
# no longer provide.
x = np.zeros((len(sentences), args['maxlen'], len(words)), dtype=bool)
y = np.zeros((len(sentences), len(words)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        x[i, t, word_indices[word]] = 1
    y[i, word_indices[next_words[i]]] = 1

# A non-empty seed switches the notebook from training to generation.
if args['seed'] != '':
    generate_lyrics(args)
else:
    train_model(args)

total words: 15485
num sequences: 105794
Vectorization...
keras is using the gpu if a gpu is listed below
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10061207584016202678
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 1872371655555129886
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 7682143661766980832
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6031110964
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6195714978289159646
physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1"
]
Build model...
Epoch 1/5



----- Generating text after Epoch: 0
----- temperature: 0.2
----- Generating with seed: "me to my"
me to my baby I don't know what I don't want to go home I don't

KeyboardInterrupt: 