In [82]:
import numpy as np
import pandas as pd
import random
import sys

import tensorflow
import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, RNN
from keras.utils import np_utils

In [6]:
df = pd.read_csv('/Users/mattmastin/Desktop/the-billie-project-ds/data/scraped_lyrics.csv')

In [32]:
df

Unnamed: 0,artist,title,song_id,lyrics
0,Billie Eilish,COPYCAT,Billie-eilish-copycat,"Don't be cautious, don't be kind, You committ..."
1,Billie Eilish,​idontwannabeyouanymore,Billie-eilish-idontwannabeyouanymore,"Don't be that way, 'Fall apart twice a day', ..."
2,Billie Eilish,​my boy,Billie-eilish-my-boy,"'(Three and four and)', 'Ba-ba-da, ba-ba-da-b..."
3,Billie Eilish,​watch,Billie-eilish-watch,"'Lips meet teeth and tongue', 'My heart skips..."
4,Billie Eilish,​​party favor,Billie-eilish-party-favor,"'Hey, leave a message', 'Hey, call me back ..."
...,...,...,...,...
122,Billy Joel,Movin' Out (Anthony's Song),Billy-joel-movin-out-anthonys-song,"'Anthony works in the grocery store', Savin' ..."
123,Billy Joel,She's Always a Woman,Billy-joel-shes-always-a-woman,"'She can kill with a smile, she can wound with..."
124,Billy Joel,Scenes from an Italian Restaurant,Billy-joel-scenes-from-an-italian-restaurant,"'A bottle of white, a bottle of red', 'Perhaps..."
125,Billy Joel,Just the Way You Are,Billy-joel-just-the-way-you-are,"Don't go changing to try and please me, 'You ..."


In [41]:
with open('only_lyrics.txt', 'w') as f:
    f.write(str(lyrics))

In [80]:
text = (open('only_lyrics.txt').read().lower())
print(f'Corpus length: {len(text)}')

Corpus length: 140756


In [71]:
%%time
# Vectorizing sequences of characters

maxlen = 60
step = 3
sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
    
print(f'Number of sequences: {len(sentences)}')

chars = sorted(list(set(text)))
print(f'Unique chracters: {len(chars)}')
char_indices = dict((char, chars.index(char)) for char in chars)

print('Vectorization...')

x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 46899
Unique chracters: 55
Vectorization...
CPU times: user 942 ms, sys: 62.4 ms, total: 1 s
Wall time: 1.02 s


In [81]:
# print(text)

In [88]:
# Building up LSTM model

model = Sequential()
model.add(LSTM(500, input_shape=(maxlen, len(chars)), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(500, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(500, return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(len(chars), activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [89]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 60, 500)           1112000   
_________________________________________________________________
dropout_8 (Dropout)          (None, 60, 500)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 60, 500)           2002000   
_________________________________________________________________
dropout_9 (Dropout)          (None, 60, 500)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 60, 500)           2002000   
_________________________________________________________________
dropout_10 (Dropout)         (None, 60, 500)           0         
_________________________________________________________________
dense_3 (Dense)              (None, 60, 55)            27555     
Total para

In [None]:
"""
DON'T RUN THIS LOCALLY!!!!
"""

# model.fit(x,y, epochs=100, batch_size=32)
model.save_weights('lyric_generator_Feb1_2020.h5')

In [90]:
# Reweighting probability distribution to introduce extra randomness into text generation
# Prior iterations performed well with a 'temperature' ~0.6 to balance creativity and coherence

def sample(preds, temperature=0.6):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# Generatic sample lyrics to check model efficacy

for song in range(1, 60):
    print(f'Grammy Award Winning Song #{song}')
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print('-------- Generating with seed lyrics"' + generated_text + '"')
    
    # Number of characters assuming an average song length of 200-300 words
    for i in range(800):
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1
            
        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]
        
        generated_text += next_char
        generated_text = generated_text[1:]
        
        sys.std.out.write(next_char)