In [1]:
import numpy as np
import pandas as pd
from string import ascii_lowercase
import re
from sklearn.utils import shuffle

from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

# from __future__ import print_function
# from keras.callbacks import ModelCheckpoint
# from keras.models import Sequential
# from keras.layers import Dense, Activation, Dropout
# from keras.layers import LSTM
# from keras.optimizers import RMSprop, Adam
# from keras.utils.data_utils import get_file

# import random
# import sys
# import io
# 
import matplotlib.pyplot as plt

%matplotlib inline

np.random.seed(0)

Using TensorFlow backend.


In [6]:
artist = 'Katy Perry'
songs = pd.read_csv('Data/' + artist.lower().replace(' ', '_') + '_dataset.csv')

In [7]:
good_chars = list(ascii_lowercase)
good_chars.append(" ")
good_chars.append(".")
good_chars.append(",")
good_chars.append("'")

char_to_int = dict((c, i) for i, c in enumerate(good_chars))
int_to_char = dict((i, c) for i, c in enumerate(good_chars))

len(char_to_int)

30

### Extract relevant data

Uses the sliding door technique to create new dataframe. All lyrics are split into multiple segments of seed_len (40) characters each. These segments are saved into the new dataframe as a seed. The output is the next character in the lyrics.

For example, assume seed_len is 20 and lyric is "look at her face, its a wonderful face":
* "look at her face, it", 115 (unicode of "s" is 115)
* "ook at her face, its", 32 (unicode of " " is 32)
* "ok at her face, its ", 97 (unicode of "a" is 97)
* "k at her face, its a", 32 (unicode of " " is 32)
* " at her face, its a", 119 (unicode of "w" is 119)
* etc

In [8]:
text = ''

for i, row in songs['lyrics'].iteritems():
    clean = str(row).lower().replace(' ', '\n')
    text = text + " ".join(re.findall(r"[a-z']+", clean))

len(text)

245242

In [9]:
seed_len = 100
step = 1
seeds = []
outputs = []

for i in range(0, len(text) - seed_len, step):
    seeds.append(text[i: i + seed_len])
    outputs.append(text[i + seed_len])
    
len(seeds)

245142

In [10]:
# from sklearn.utils import shuffle
seeds, outputs = shuffle(seeds, outputs, random_state=0)

print('Seed: ' + seeds[0])
print('Output: ' + outputs[0])

Seed:  you come for me no not today you're calculated i got your number 'cause you're a joker and i'm a co
Output: u


In [11]:
x = np.zeros((len(seeds), seed_len, len(good_chars)), dtype=np.bool)
y = np.zeros((len(seeds), len(good_chars)), dtype=np.bool)

for i, seed in enumerate(seeds):
    for t, char in enumerate(seed):
        x[i, t, char_to_int[char]] = 1
    y[i, char_to_int[outputs[i]]] = 1

## Building the AI

### Creating the model
Uses a Long-Short-Term-Memory (LSTM) network to predict the output from the seed.

In [17]:
# build the model: a single LSTM
# model = Sequential()
# model.add(LSTM(128, input_shape=(seed_len, len(good_chars))))
# model.add(Dense(len(good_chars)))
# model.add(Activation('softmax'))

# model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01), metrics=['accuracy'])

model = Sequential()
model.add(LSTM(256, input_shape=(seed_len, len(good_chars))))
model.add(Dropout(0.2))
model.add(Dense(len(good_chars), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# model = Sequential()
# model.add(LSTM(256, input_shape=(seed_len, len(good_chars)), return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(256))
# model.add(Dropout(0.2))
# model.add(Dense(len(good_chars), activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 256)               293888    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                7710      
Total params: 301,598
Trainable params: 301,598
Non-trainable params: 0
_________________________________________________________________


In [19]:
def read_prediction(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [20]:
train_len = int(x.shape[0] * 0.8)
test_len = int(x.shape[0] * 0.1)

x_train = x[:train_len, :, :]
y_train = y[:train_len, :]

x_val = x[train_len:train_len + test_len, :, :]
y_val = y[train_len:train_len + test_len, :]

x_test = x[train_len + test_len:, :, :]
y_test = y[train_len + test_len:, :]

In [21]:
# history = model.fit(x_train, y_train, batch_size=128, epochs=5, validation_data = (x_val, y_val))
history = model.fit(x_train, y_train, epochs=20, batch_size=128, validation_data = (x_val, y_val))
# history = model.fit(x_train, y_train, epochs=50, batch_size=64, validation_data = (x_val, y_va;))

Train on 196113 samples, validate on 24514 samples
Epoch 1/20

KeyboardInterrupt: 

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
def generate_output(size=400):
    generated = ''
    usr_input = input("Input a phrase to use for generation: ")

    seed = ('{0:0>' + str(seed_len) + '}').format(usr_input).lower()
    generated += usr_input 

    print("\n\nHere is your song: \n\n") 
    print(usr_input, end='')
    
    for i in range(size):
        x_pred = np.zeros((1, seed_len, len(good_chars)))

        for t, char in enumerate(seed):
            if char != '0':
                x_pred[0, t, char_to_int[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        output_index = read_prediction(preds, temperature = 0.2)
        output = int_to_char[output_index]

        generated += output
        seed = seed[1:] + output

        print(output, end='')

        if output == '\n':
            continue

In [None]:
generate_output()

In [15]:
filename = 'Models/model_' + artist.lower().replace(' ', '_')

model_json = model.to_json()
with open(filename + '.json', 'w') as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(filename + '.h5')

In [20]:
results = model.evaluate(x_test, y_test)



In [21]:
results

[0.9754309318237145, 0.7393024563789368]