In [15]:
import re
import os
import random

import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop

In [1]:
path = 'data'
data = ''

# Alternative: load every file found in `path` instead of a fixed artist list.
# for file in os.listdir(path):
#     file_path = os.path.join(path, file)
#     if os.path.isfile(file_path):
#         with open(file_path, 'r', encoding='utf-8') as f:
#             data += f.read()

# Build the corpus by concatenating the lyrics of the selected artists,
# each stored as '<artist>.txt' inside `path`.
files = ['bieber', 'bruno-mars', 'drake', 'rihanna', 'adele']
for file in files:
    file_path = os.path.join(path, file + '.txt')
    if not os.path.isfile(file_path):
        # Loading stays best-effort, but a silently empty corpus makes the
        # later cells fail in confusing ways, so make the gap visible.
        print('Skipping missing file:', file_path)
        continue
    # The context manager closes the handle as soon as the file is read.
    with open(file_path, 'r', encoding='utf-8') as f:
        file_content = f.read()
    data += file_content

In [3]:
# Sanity check: total corpus size followed by its first 200 characters.
preview = data[:200]
print('Length of corpus:', len(data))
print(preview)

Length of corpus: 711225
What do you mean?
Oh, oh, oh
When you sometimes say yes
But you sometimes say no
What do you mean?
Hey, yeah
When you don't want me to move
But you tell me to go
What do you mean?
Oh
What do you mean?


In [4]:
# Report the alphabet size before and after dropping every character that
# falls outside the 7-bit ASCII range.
print('Number of unique characters before:', len(set(data)))

non_ascii = re.compile(r'[^\x00-\x7F]')
data = non_ascii.sub(r'', data)

print('Number of unique characters after:', len(set(data)))

Number of unique characters before: 90
Number of unique characters after: 82


In [5]:
# Sorted list of the distinct characters in the corpus; a character's position
# in this list is its integer id.
chars = sorted(set(data))
print('Total chars:', len(chars))

# Lookup tables in both directions: character -> id and id -> character.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

Total chars: 82


In [6]:
# Slice the corpus into overlapping windows of `maxlen` characters, taking a
# new window every `step` characters; the character that follows each window
# is the prediction target.
maxlen = 40
step = 3
sentences = []
next_chars = []

for i in range(0, len(data) - maxlen, step):
    sentences.append(data[i: i + maxlen])
    next_chars.append(data[i + maxlen])

print('Number of sequences:', len(sentences))

# One-hot encode the windows (x) and their targets (y).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype = bool)
y = np.zeros((len(sentences), len(chars)), dtype = bool)
for i, sent in enumerate(sentences):
    for t, char in enumerate(sent):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

Number of sequences: 236991


In [17]:
# Single-layer LSTM over one-hot encoded character windows, followed by a
# softmax over the vocabulary for the next character.
model = Sequential()
model.add(InputLayer(input_shape = (maxlen, len(chars))))
model.add(LSTM(units = 128, activation = 'tanh'))
model.add(Dense(units = len(chars), activation = 'softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 128)               108032    
_________________________________________________________________
dense_2 (Dense)              (None, 82)                10578     
Total params: 118,610
Trainable params: 118,610
Non-trainable params: 0
_________________________________________________________________


In [18]:
# RMSprop with a small learning-rate decay; the targets are one-hot vectors,
# hence categorical cross-entropy as the loss.
optimizer = RMSprop(
    learning_rate = 1e-3,
    decay = 1e-5,
)
model.compile(
    optimizer = optimizer,
    loss = "categorical_crossentropy",
)

In [19]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh Restart & Run All shows all dependencies.
import tensorflow as tf
# Number of physical GPU devices TensorFlow reports for this machine.
len(tf.config.list_physical_devices('GPU'))

1

In [48]:
# Training hyperparameters: passes over the data and samples per update.
epochs, batch_size = 10, 128

# Fit on the one-hot encoded windows; the returned History is the cell output.
model.fit(
    x,
    y,
    epochs = epochs,
    batch_size = batch_size,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x17674963ee0>

In [21]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, with optional temperature.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the resulting distribution using NumPy's
    global random state.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example the
            softmax output of the model for one input).
        temperature: Positive scaling factor. ``1.0`` (the default) samples
            from ``preds`` as given; values below 1 sharpen the distribution
            towards its most likely entries, values above 1 flatten it.

    Returns:
        The sampled index as a NumPy integer.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    preds = np.asarray(preds, dtype="float64")
    # Zero-probability entries legitimately map to log(0) = -inf (and back to
    # a probability of 0 after exp), so silence the divide-by-zero warning.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature
    # Shift by the maximum so exp() cannot overflow at low temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    # Renormalise in float64 so multinomial's "probabilities sum to 1" check
    # is not tripped by float32 rounding in the model output.
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [49]:
# Use a random `maxlen`-character window of the corpus as the seed text.
start_index = random.randint(0, len(data) - maxlen - 1)
sentence = data[start_index : start_index + maxlen]
generated = ""
print('Generating with seed: "' + sentence + '"')

# Generate 400 characters, one at a time, feeding each prediction back in.
for _ in range(400):
    # One-hot encode the current window in a single indexed assignment.
    x_pred = np.zeros((1, maxlen, len(chars)))
    char_ids = [char_indices[ch] for ch in sentence]
    x_pred[0, np.arange(len(char_ids)), char_ids] = 1.0

    preds = model.predict(x_pred)[0]

    # Draw the next character, then slide the window forward by one.
    next_index = sample(preds)
    next_char = indices_char[next_index]
    generated += next_char
    sentence = sentence[1:] + next_char

print(generated)

Generating with seed: "me by the hand while we do what lovers d"
o
Sad it who styOn and searss in my own that it
 reah shot of how Amorias I'm a guing hon it time, fuck up a yJ Lete ginl, I'm me inger Best with you're right in the sad out ot ever, staring say, ham's much wonna have to fect
Let me and werknwas in ferentless
Trong usout
This to tol me
I know where you astan
Cay you keep lostoping up and piace, yno thay myself she loselive me
I sun the stude, has 


In [50]:
# Persist the trained model under the 'model_100' path.
# NOTE(review): the name suggests 100 epochs while the training cell runs 10
# per execution — presumably the cell was re-run; confirm before relying on it.
model.save('model_100')



INFO:tensorflow:Assets written to: model_100\assets


INFO:tensorflow:Assets written to: model_100\assets
