# Generating 90s Pop Lyrics at the Character level

## Goal
Generate 1 line of lyrics in the style of 90s Pop.

## Problem Formulation
X: Examples of a line of lyrics for the model to use (n_examples, max_length, n_characters)
Y: A generated sequence of characters that ends with <EOS> (n_examples, n_characters)
    
<EOS> will be a special character in the vocabulary which the model will use to know that it can stop predicting.

## Methodology
To accomplish this, we need:
1. Dataset: A corpus of 90s Pop lyrics
2. Vocabulary: A set of characters which will be used for generating lyrics
3. Model: A model which can encode the probability of the next character given a sequence of characters
4. Generate Lyrics: Use the model and an input to generate new lyrics

In [1]:
from datetime import datetime
from keras import backend as K
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Dense, Input, LSTM
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
import os
import pandas as pd
from random import sample
import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# hyper parameters
batch_size = 100
epochs = 50
learning_rate = 0.01
n_training = 30000
training_ratio = 0.3

# logging name
variant = '0.7t-30000-100b'
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
log_dir = 'logs/{}-{}'.format(variant, timestamp)

In [3]:
# to use GPU
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# verify that a gpu is listed
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

## Extract and Transform Raw Dataset

In [4]:
# load raw data file as a dataframe
raw_data = pd.read_csv('data/raw.csv')

In [5]:
# filter for only lyrics from the 1990s, of the pop genre, and not instrumentals
mask = (raw_data['year'] > 1989) & (raw_data['year'] < 2000) & (raw_data['genre'] == 'Pop') & (raw_data['lyrics'] != '[Instrumental]')
filtered_data = raw_data[mask]

In [6]:
# remove any that have null values
cleaned_data = filtered_data.dropna()

In [7]:
# trim all the extra data. We only want the lyrics
raw_lyrics = cleaned_data['lyrics']

In [8]:
# reindex the lyrics to make it easier to work with
reindexed_lyrics = raw_lyrics.reset_index(drop=True)

In [9]:
# lowercase the lyrics to make it easier to work with
formatted_lyrics = reindexed_lyrics[:].str.lower()
formatted_lyrics.head(10)

0    come they told me, pa rum pum pum pum\na new b...
1    over the ground lies a mantle, white\na heaven...
2    i just came back from a lovely trip along the ...
3    i'm dreaming of a white christmas\njust like t...
4    just hear those sleigh bells jingle-ing, ring-...
5    little rump shaker she can really shake and ba...
6    girl you want to sex me\ngirl, why don't you l...
7    oooh, tonight i want to turn the lights down l...
8    so you say he let you on, you'll never give yo...
9    something about you baby\nthat makes me wanna ...
Name: lyrics, dtype: object

In [10]:
# examine the number of song lyrics we have
n_formatted_lyrics = formatted_lyrics.shape[0]
print(n_formatted_lyrics)

964


In [11]:
# split each lyric on \n
# store song lyrics as a list of lines
# store those in lyrics
lyrics_lines = []

for i in range(n_formatted_lyrics):
    lyrics = formatted_lyrics[i].split('\n')
    lyrics_lines.append(lyrics)

In [12]:
# flatten the previous into a list of song lyrics lines
flattened_lyrics_lines = [line for song in lyrics_lines for line in song]

In [13]:
# examine the resulting number of song lyrics lines we have
print(len(flattened_lyrics_lines))
print(flattened_lyrics_lines[0])

35188
come they told me, pa rum pum pum pum


## Filter out non-english lyrics

In [14]:
# char_set = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'x', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
char_set = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'x', 'z']

In [15]:
english_lyrics_lines = []

for line in flattened_lyrics_lines:
    line_split = text_to_word_sequence(line, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
    char_split = list(" ".join(line_split))
    char_check = 0
    for char in char_split:
        if char not in char_set:
            char_check = 1
            
    if char_check == 0:
        english_lyrics_lines.append(char_split)

In [16]:
# examine the resulting number of song lyrics lines we have
print(len(english_lyrics_lines))
print(english_lyrics_lines[0])

33460
['c', 'o', 'm', 'e', ' ', 't', 'h', 'e', 'y', ' ', 't', 'o', 'l', 'd', ' ', 'm', 'e', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm']


## Extract the subset we are interested in

In [17]:
# grab a random amount of them for our examples
examples = sample(english_lyrics_lines, n_training)

print(len(examples))
print(examples[0])

1000
['t', 'o', ' ', 't', 'h', 'e', ' ', 'd', 'a', 'r', 'k', ' ', 'a', 'n', 'd', ' ', 'e', 'm', 'p', 't', 'y', ' ', 's', 'k', 'i', 'e', 's']


In [18]:
# process lyrics into lists of word indices
# also determine line with the greatest length
max_char_n = 0

for line in examples:
    char_n = len(line)
    if char_n > max_char_n:
        max_char_n = char_n

In [19]:
print(max_char_n)

69


In [20]:
# flatten chars
flat_chars = [item for sublist in examples for item in sublist]

# dedup list
chars = list(set(flat_chars))

# append our terminator
print(chars)

['l', 'y', 'v', 'r', 's', 'm', "'", 'e', 'c', 'b', 'f', 't', 'i', 'j', 'z', 'g', 'h', 'p', ' ', 'd', 'w', 'q', 'n', 'u', 'k', 'a', 'x', 'o']


In [21]:
# determine number of charecters in our set
n_chars = len(chars)
print(n_chars)

28


In [22]:
# create dictionarys
char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)
print(char_to_ix)

{0: ' ', 1: "'", 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'v', 24: 'w', 25: 'x', 26: 'y', 27: 'z'}
{' ': 0, "'": 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27}


## Training and Validation Datasets

In [23]:
# create training input
X_training = np.zeros((n_training, max_char_n, n_chars), dtype='float32')
X_training.shape

(1000, 69, 28)

In [24]:
# fill input training set with word sequences, where words are one-hot encoded
for li, line in enumerate(examples[:n_training]):
    indices = []
    for ci, char in enumerate(line):
        index = char_to_ix[char]
        X_training[li][ci][index] = 1

In [25]:
# create training output
Y_training = X_training

## Validate Dataset

In [26]:
x_training_string = []
for woh in X_training[0]:
    max_idx = np.argmax(woh)
    x_training_string.append(ix_to_char[max_idx])
x_training_string_formatted = "".join(x_training_string)
print(x_training_string_formatted)

to the dark and empty skies                                          


In [27]:
y_training_string = []
for woh in Y_training[0]:
    max_idx = np.argmax(woh)
    y_training_string.append(ix_to_char[max_idx])
y_training_string_formatted = "".join(y_training_string)
print(y_training_string_formatted)

to the dark and empty skies                                          


## Model

In [28]:
model_input = Input(shape=(None, n_chars))
x = LSTM(128, return_sequences=True)(model_input)
x = Dense(n_chars, activation='softmax')(x)

In [29]:
model = Model(inputs=model_input, outputs=x)

optimizer = RMSprop(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [30]:
# set up callbacks
early = EarlyStopping(monitor='val_acc',
                      min_delta=0,
                      patience=10,
                      verbose=1,
                      mode='auto')

In [31]:
model.fit(X_training, 
          Y_training, 
          batch_size=batch_size, 
          epochs=epochs, 
          shuffle=True,
          validation_split=training_ratio,
          callbacks=[early, TensorBoard(log_dir=log_dir)])

Train on 700 samples, validate on 300 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 00014: early stopping


<keras.callbacks.History at 0x7fbc6c106518>

## Make a prediction

In [32]:
new_sample = 'sweet dreams are made of these'

In [33]:
# convert new_sample to a sequence of one-hot encoded chars
line_split = text_to_word_sequence(new_sample, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
char_split = list(" ".join(line_split))
n_sample_chars = len(char_split)

sample = np.zeros((1, n_sample_chars, n_chars), dtype='float32')

for ci, char in enumerate(char_split):
    index = char_to_ix[char]
    sample[0][ci][index] = 1

In [34]:
prediction = model.predict(sample)

In [35]:
# take the max of each...
string_prediction = []
for p in prediction[0]:
    max_p = np.argmax(p)
    string_prediction.append(ix_to_char[max_p])

In [36]:
formatted_prediction = "".join(string_prediction)

In [37]:
print(formatted_prediction)

sweet dreams are made of these


## Generate a sequence from a sequence

In [38]:
x = sample

In [39]:
# take the max of each...
string_prediction = []
for p in x[0]:
    max_p = np.argmax(p)
    string_prediction.append(ix_to_char[max_p])
formatted_prediction = "".join(string_prediction)
print(formatted_prediction)

sweet dreams are made of these


In [40]:
for i in range(100):
    prediction = model.predict(x, verbose=0)
    x = np.zeros((1, prediction.shape[1] + 1, n_chars), dtype='float32')
    x[0][:prediction.shape[1]] = prediction[0]

In [41]:
# take the max of each...
for p in x[0]:
    max_p = np.argmax(p)
    next_char = ix_to_char[max_p]
    sys.stdout.write(next_char)
    sys.stdout.flush()

sweet dreams are made of these   o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o   