In [1]:
import pandas as pd
import numpy as np
import json
import os
import requests
import sys

from gensim.utils import simple_preprocess

import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [5]:
# Directory containing the separate JSON lyric files, relative to the
# current working directory.
path = './Lyrics_JSON'

In [6]:
# Aggregating separate JSON lyric files

def gather_data(path_to_data):
    """Collect the raw text of every ``.json`` file directly inside a directory.

    Parameters
    ----------
    path_to_data : str
        Directory to scan. Sub-directories are skipped, not descended into.

    Returns
    -------
    list of str
        One entry per JSON file: the file's full contents with leading and
        trailing newlines stripped. Entries follow sorted filename order.

    Raises
    ------
    OSError
        If ``path_to_data`` cannot be listed or a file cannot be read.
    """
    data = []

    # Sort so the result does not depend on the filesystem's listing order.
    for name in sorted(os.listdir(path_to_data)):
        full_path = os.path.join(path_to_data, name)

        # Test the joined path: a bare name would be resolved against the
        # current working directory instead of `path_to_data`.
        if os.path.isdir(full_path) or not name.endswith('.json'):
            continue

        with open(full_path) as handle:
            data.append(handle.read().strip('\n'))

    return data

In [7]:
# Raw contents of each JSON lyric file found under `path`, one string per file.
lyrics = gather_data(path)

In [13]:
# def tokenize(text):
#     return [token for token in simple_preprocess(text)]

In [3]:
# with open('lyrics_aggregated.txt', 'w') as filehandle:
#     filehandle.writelines('%s\n' % line for line in lyrics)

In [12]:
# Read the whole aggregated lyrics file into one string. The context manager
# closes the file handle as soon as the contents have been read.
with open('lyrics_aggregated.txt') as lyrics_file:
    text = lyrics_file.read()

In [14]:
# Lower-case the whole corpus so upper- and lower-case forms of a letter
# are the same character when the vocabulary is built from `text`.
text = text.lower()

In [15]:
# Simple pass at cleaning text: delete braces, brackets, double quotes and
# the words 'lyrics', 'title' and 'genius'. Substrings are removed one at a
# time, in the order listed.
# NOTE(review): the words are removed wherever they occur, including inside
# longer words. Newline and backslash removal remain disabled.
for unwanted in ('{', '}', '[', ']', 'lyrics', 'title', '"', 'genius'):
    text = text.replace(unwanted, '')

In [17]:
# Build the character vocabulary.
# Every distinct character in `text` gets an integer id equal to its
# position in sorted order.

characters = sorted(set(text))
chars_to_int = {char: idx for idx, char in enumerate(characters)}

In [18]:
# Corpus length in characters, and number of distinct characters.
n_chars, n_vocab = len(text), len(characters)
print(f'total characters: {n_chars}, total vocab: {n_vocab}')

total characters: 46088, total vocab: 52


In [19]:
# Training inputs and targets for the LSTM model.
# Each input is a window of `seq_length` consecutive character ids; its
# target is the id of the character that immediately follows the window.

seq_length = 100

# Encode the corpus once, then slice windows out of the encoded list.
encoded_text = [chars_to_int[char] for char in text]
window_starts = range(n_chars - seq_length)

X = [encoded_text[begin:begin + seq_length] for begin in window_starts]
y = [encoded_text[begin + seq_length] for begin in window_starts]

n_patterns = len(X)
print(f'total patterns: {n_patterns}')

total patterns: 45988


In [20]:
# Modifying array shapes for LSTM, transform y into one-hot encoded

# Shape (n_patterns, seq_length, 1): one feature (the character id) per step.
X_modified = np.reshape(X, (n_patterns, seq_length, 1))

# Normalize: divide the integer ids by the vocabulary size so inputs lie in [0, 1).
X_modified = X_modified / float(n_vocab)

# Pin the one-hot width to the vocabulary size. Without `num_classes` the
# width is inferred as max(y) + 1, which is too small whenever the
# highest-numbered character never occurs as a target.
y_modified = np_utils.to_categorical(y, num_classes=n_vocab)

In [22]:
# Sequential model: a single LSTM layer with 256 units, a Dropout layer
# (rate 0.2) as regularisation against over-fitting, and a softmax Dense
# layer with one output per one-hot class in `y_modified`.

model = Sequential([
    LSTM(256, input_shape=X_modified.shape[1:]),
    Dropout(0.2),
    Dense(y_modified.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [24]:
%%time

# Baseline model: train for 3 epochs with mini-batches of 128 patterns.
# NOTE(review): the fitted weights are not persisted (the save call below is
# commented out), so a kernel restart requires retraining.

model.fit(X_modified, y_modified, epochs=3, batch_size=128)

# NOTE(review): the file name below mentions 400/0.2 twice, which does not
# match the single 256-unit LSTM defined for `model` -- confirm before use.
# model.save_weights('text_generator_400_0.2_400_0.2_baseline.h5')

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 1h 9min 50s, sys: 6min 54s, total: 1h 16min 45s
Wall time: 22min 9s


<keras.callbacks.History at 0x1a45311a50>

In [25]:
# Reverse lookup: integer id -> character, for decoding model output.
int_to_char = dict(enumerate(characters))

In [47]:
# Pick a random seed from text data

# randint's upper bound is exclusive, so len(X) makes every pattern eligible.
start = np.random.randint(0, len(X))
# Copy the window: it is modified below and must not alter the data in X.
pattern = list(X[start])
seed_text = ''.join(int_to_char[value] for value in pattern)
print('Seed:')
# Both quotes are inside the print call so the closing one is emitted too.
print(f'"{seed_text}"')

# Generate characters
for i in range(500):
    # Same shape and scaling as the training inputs: (1, window length, 1).
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)

    prediction = model.predict(x, verbose=0)

    # The float32 softmax output can miss np.random.choice's sum-to-1
    # tolerance; renormalise in float64 before sampling.
    probabilities = np.asarray(prediction[0], dtype=np.float64)
    probabilities /= probabilities.sum()
    index = int(np.random.choice(len(probabilities), p=probabilities))

    result = int_to_char[index]
    sys.stdout.write(result)

    # Slide the window: drop the oldest id, append the sampled one.
    pattern = pattern[1:] + [index]

Seed:
"          nobody's stopping you, baby, from making it too,
            one glimpse'll show you now, b
 s wevb 
e             so
sele'g niptelnnslt,
donisomah ,ofgk


           sdtlyndwer t sf tn nout  mueai 
            hfe ideann a knvo,w uh,shd,mb da'fanentpta
c  
           a lfsabut wyn serle    t         ad atur liiobnyu,o akot  
          i,e              ,oi bspiihtrf tt,u,mll1iaaoo jo               m a givtewt  hedn ynk i lyrn  w wdloy              ahuwwht loe ct,ia1ait,llahi c              i r sev bftelthhpihl

            thatoethwk  let,kooeqss, t r ldi l yeue
wnih              y,i'u

In [33]:
# Display the most recent `model.predict` output held in `prediction`.
prediction

array([[4.3925453e-02, 5.6918347e-01, 1.7135640e-03, 5.6058483e-04,
        8.2081935e-04, 3.3069386e-03, 1.6618368e-03, 6.8595693e-03,
        6.4842305e-03, 1.1540813e-03, 5.4565761e-03, 6.3753757e-03,
        2.6482113e-03, 1.6103856e-03, 9.3915942e-04, 1.0843370e-03,
        1.8812113e-03, 1.2088580e-02, 1.1178488e-03, 1.9463705e-03,
        5.9105419e-03, 1.2871384e-02, 4.2035472e-02, 4.2130803e-03,
        1.9000366e-02, 2.1447646e-03, 1.0639472e-03, 3.9623775e-02,
        1.1519723e-02, 2.4152214e-03, 1.0786365e-03, 1.0191180e-02,
        4.3848651e-03, 3.3273061e-03, 2.6339002e-02, 6.8344916e-03,
        7.9470209e-04, 3.4503844e-02, 1.5191199e-02, 3.2295494e-03,
        6.8122791e-03, 5.5340368e-02, 2.7372271e-03, 9.4096269e-04,
        3.7023611e-03, 6.6317222e-04, 5.1501021e-03, 2.0916974e-03,
        2.3952436e-03, 1.0559686e-03, 6.6319597e-04, 9.5597759e-04]],
      dtype=float32)