# Generating 90s Pop Lyrics at the Character level

## Goal
Generate 1 line of lyrics in the style of 90s Pop.

## Problem Formulation
X: Examples of a line of lyrics for the model to use (n_examples, max_length, n_characters)
Y: A generated sequence of characters that ends with <EOS> (n_examples, n_characters)
    
<EOS> will be a special character in the vocabulary which the model will use to know that it can stop predicting.

## Methodology
To accomplish this, we need:
1. Dataset: A corpus of 90s Pop lyrics
2. Vocabulary: A set of characters which will be used for generating lyrics
3. Model: A model which can encode the probability of the next character given a sequence of characters
4. Generate Lyrics: Use the model and an input to generate new lyrics

In [1]:
from datetime import datetime
from keras import backend as K
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Dense, Input, LSTM
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
import os
import pandas as pd
from random import sample
import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# hyper parameters
activations = 128
batch_size = 50
epochs = 50
learning_rate = 0.01
max_char_n = 40
n_training = 10000
training_ratio = 0.3

# logging name
variant = 'single_output'
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
log_dir = 'logs/{}-{}'.format(variant, timestamp)

In [3]:
# to use GPU
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# verify that a gpu is listed
K.tensorflow_backend._get_available_gpus()

[]

## Extract and Transform Raw Dataset

In [4]:
# load raw data file as a dataframe
raw_data = pd.read_csv('data/raw.csv')

In [5]:
# filter for only lyrics from the 1990s, of the pop genre, and not instrumentals
mask = (raw_data['year'] > 1989) & (raw_data['year'] < 2000) & (raw_data['genre'] == 'Pop') & (raw_data['lyrics'] != '[Instrumental]')
filtered_data = raw_data[mask]

In [6]:
# remove any that have null values
cleaned_data = filtered_data.dropna()

In [7]:
# trim all the extra data. We only want the lyrics
raw_lyrics = cleaned_data['lyrics']

In [8]:
# reindex the lyrics to make it easier to work with
reindexed_lyrics = raw_lyrics.reset_index(drop=True)

In [9]:
# lowercase the lyrics to make it easier to work with
formatted_lyrics = reindexed_lyrics[:].str.lower()
formatted_lyrics.head(10)

0    come they told me, pa rum pum pum pum\na new b...
1    over the ground lies a mantle, white\na heaven...
2    i just came back from a lovely trip along the ...
3    i'm dreaming of a white christmas\njust like t...
4    just hear those sleigh bells jingle-ing, ring-...
5    little rump shaker she can really shake and ba...
6    girl you want to sex me\ngirl, why don't you l...
7    oooh, tonight i want to turn the lights down l...
8    so you say he let you on, you'll never give yo...
9    something about you baby\nthat makes me wanna ...
Name: lyrics, dtype: object

In [10]:
# examine the number of song lyrics we have
n_formatted_lyrics = formatted_lyrics.shape[0]
print(n_formatted_lyrics)

964


## Filter out non-english lyrics

In [11]:
# char_set = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'x', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
char_set = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'x', 'z', '\n']

In [12]:
english_lyrics_lines = []

for line in formatted_lyrics:
    line_split = text_to_word_sequence(line, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
    char_split = list(" ".join(line_split))
    char_check = 0
    for char in char_split:
        if char not in char_set:
            char_check = 1
            
    if char_check == 0:
        english_lyrics_lines.append(char_split)

In [13]:
# examine the resulting number of song lyrics lines we have
print(len(english_lyrics_lines))
print(english_lyrics_lines[0])

782
['c', 'o', 'm', 'e', ' ', 't', 'h', 'e', 'y', ' ', 't', 'o', 'l', 'd', ' ', 'm', 'e', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'a', ' ', 'n', 'e', 'w', ' ', 'b', 'o', 'r', 'n', ' ', 'k', 'i', 'n', 'g', ' ', 't', 'o', ' ', 's', 'e', 'e', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'o', 'u', 'r', ' ', 'f', 'i', 'n', 'e', 's', 't', ' ', 'g', 'i', 'f', 't', 's', ' ', 'w', 'e', ' ', 'b', 'r', 'i', 'n', 'g', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 't', 'o', ' ', 'l', 'a', 'y', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 't', 'h', 'e', ' ', 'k', 'i', 'n', 'g', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', '\n', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '

In [14]:
# flatten the previous into a list of song lyrics lines
flattened_lyrics = [line for song in english_lyrics_lines for line in song]

In [15]:
# examine the resulting number of song lyrics lines we have
print(len(flattened_lyrics))
print(flattened_lyrics[0:100])

837528
['c', 'o', 'm', 'e', ' ', 't', 'h', 'e', 'y', ' ', 't', 'o', 'l', 'd', ' ', 'm', 'e', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'a', ' ', 'n', 'e', 'w', ' ', 'b', 'o', 'r', 'n', ' ', 'k', 'i', 'n', 'g', ' ', 't', 'o', ' ', 's', 'e', 'e', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'o', 'u', 'r', ' ', 'f', 'i', 'n', 'e', 's', 't', ' ', 'g', 'i', 'f', 't', 's', ' ', 'w', 'e', ' ', 'b']


## Extract the subset we are interested in

In [16]:
# generate n_training example of max_char_n length
examples = []
start_index = 0

for i in range(n_training):
    end_index = start_index + max_char_n
    example = flattened_lyrics[start_index:end_index]
    start_index = end_index
    examples.append(example)

print(len(examples))
print(examples[0])

1000
['c', 'o', 'm', 'e', ' ', 't', 'h', 'e', 'y', ' ', 't', 'o', 'l', 'd', ' ', 'm', 'e', ' ', 'p', 'a']


In [17]:
# determine number of charecters in our set
n_chars = len(char_set)
print(n_chars)

30


In [18]:
# create dictionarys
char_to_ix = { ch:i for i,ch in enumerate(sorted(char_set)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(char_set)) }
print(ix_to_char)
print(char_to_ix)

{0: '\n', 1: ' ', 2: "'", 3: 'a', 4: 'b', 5: 'c', 6: 'd', 7: 'e', 8: 'f', 9: 'g', 10: 'h', 11: 'i', 12: 'j', 13: 'k', 14: 'l', 15: 'm', 16: 'n', 17: 'o', 18: 'p', 19: 'q', 20: 'r', 21: 's', 22: 't', 23: 'u', 24: 'v', 25: 'w', 26: 'x', 27: 'x', 28: 'y', 29: 'z'}
{'\n': 0, ' ': 1, "'": 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 27, 'y': 28, 'z': 29}


## Training and Validation Datasets

In [19]:
# create training input
X_training = np.zeros((n_training, max_char_n-1, n_chars), dtype='float32')
X_training.shape

(1000, 19, 30)

In [20]:
# fill input training set with word sequences, where words are one-hot encoded
for li, line in enumerate(examples):
    for ci, char in enumerate(line[:-1]):
        index = char_to_ix[char]
        X_training[li][ci][index] = 1

In [21]:
# create training input
Y_training = np.zeros((n_training, n_chars), dtype='float32')
Y_training.shape

(1000, 30)

In [22]:
# create training output
for li, line in enumerate(examples):
    char = line[-1]
    index = char_to_ix[char]
    Y_training[li][index] = 1

## Validate Dataset

In [23]:
x_training_string = []
x_example = X_training[0] 
for woh in x_example:
    max_idx = np.argmax(woh)
    x_training_string.append(ix_to_char[max_idx])
x_training_string_formatted = "".join(x_training_string)
print("X_training shape: {}".format(x_example.shape))
print("X_training example one-hot: {}".format(x_example))
print("X_training example: {}".format(x_training_string_formatted))

X_training shape: (19, 30)
X_training example one-hot: [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.

In [24]:
y_example = Y_training[0]
index = np.argmax(y_example)
char = ix_to_char[index]
print("Y_training shape: {}".format(y_example.shape))
print("Y_training example one-hot: {}".format(y_example))
print("Y_training example: {}".format(char))

Y_training shape: (30,)
Y_training example one-hot: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
Y_training example: a


## Model

In [25]:
model_input = Input(shape=(None, n_chars))
x = LSTM(activations)(model_input)
x = Dense(n_chars, activation='softmax')(x)

In [26]:
model = Model(inputs=model_input, outputs=x)

optimizer = RMSprop(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [27]:
# set up callbacks
early = EarlyStopping(monitor='val_acc',
                      min_delta=0,
                      patience=10,
                      verbose=1,
                      mode='auto')

In [28]:
model.fit(X_training, 
          Y_training, 
          batch_size=batch_size, 
          epochs=epochs, 
          shuffle=True,
          validation_split=training_ratio,
          callbacks=[early, TensorBoard(log_dir=log_dir)])

Train on 700 samples, validate on 300 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 00019: early stopping


<keras.callbacks.History at 0x7f5227561160>

## Make a prediction

In [29]:
new_sample = 'sweet dreams are made of thes'

In [30]:
# convert new_sample to a sequence of one-hot encoded chars
line_split = text_to_word_sequence(new_sample, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
char_split = list(" ".join(line_split))
n_sample_chars = len(char_split)

sample = np.zeros((1, n_sample_chars, n_chars), dtype='float32')

for ci, char in enumerate(char_split):
    index = char_to_ix[char]
    sample[0][ci][index] = 1

In [31]:
prediction = model.predict(sample)

In [32]:
# take the max of each...
index = np.argmax(prediction[0])
char = ix_to_char[index]

In [33]:
print(char)

 


## Generate a sequence from a sequence

In [34]:
x = sample
x.shape

(1, 29, 30)

In [35]:
# take the max of each...
string_prediction = []
for p in x[0]:
    max_p = np.argmax(p)
    string_prediction.append(ix_to_char[max_p])
formatted_prediction = "".join(string_prediction)
print(formatted_prediction)

sweet dreams are made of thes


In [36]:
def sample_predictions(preds, temperature=0.5):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [37]:
sys.stdout.write(formatted_prediction)

for i in range(100):
    prediction = model.predict(x, verbose=0)[0]
    # sample the predictions
    next_index = sample_predictions(prediction)
    next_char = ix_to_char[next_index]
    string_prediction.append(next_char)
    x[0][:-1] = x[0][1:]
    x[0][-1] = prediction
    sys.stdout.write(next_char)
    sys.stdout.flush()

sweet dreams are made of thes se lint no t ne hode nown we
t nt i wl woni te towe leriole aa cow  ne ttto
b ne oonn lo iowt tose 