In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import Callback
import matplotlib.pyplot as plt
from IPython.core.display_functions import clear_output

# Intros

# Couplets

In [None]:
# Read every verse ("couplet") lyric file under data/Verse and collect its
# non-blank lines, with runs of whitespace collapsed to single spaces.
couplets_dir = 'data/Verse'

# os.listdir() returns entries in arbitrary order, so sort them to get the
# same corpus order on every run. Hidden entries (e.g. macOS ".DS_Store") and
# sub-directories are dropped here so they are never opened as lyrics.
couplets_songs = sorted(
    song
    for song in os.listdir(couplets_dir)
    if not song.startswith('.') and os.path.isfile(os.path.join(couplets_dir, song))
)
couplets_files = [os.path.join(couplets_dir, song) for song in couplets_songs]

couplets_lyrics = []
for file in couplets_files:
    # Explicit encoding: open()'s default depends on the platform locale.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            normalized = " ".join(line.split())
            if normalized:
                couplets_lyrics.append(normalized)

In [None]:
# Preview the corpus instead of dumping every lyric line into the cell output.
print(f"{len(couplets_lyrics)} lines loaded; first 10:")
print(couplets_lyrics[:10])

In [None]:
# Per-line word counts in ascending order; the two ends of the sorted list are
# the shortest and the longest line.
couplets_lens = sorted(map(len, map(str.split, couplets_lyrics)))
print(couplets_lens[0], couplets_lens[-1])

In [None]:
# Drop empty strings in a single pass. Slice assignment keeps the same list
# object, so every existing reference to couplets_lyrics sees the cleaned
# data; calling list.remove('') in a loop rescans the list for each empty entry.
couplets_lyrics[:] = [lyric for lyric in couplets_lyrics if lyric != '']

In [None]:
# Word count of every lyric line, ascending, so index 0 is the minimum and
# index -1 the maximum.
couplets_lens = sorted(len(lyric.split()) for lyric in couplets_lyrics)
print(couplets_lens[0], couplets_lens[-1])

In [None]:
# List the lyric lines that consist of exactly one word.
single_word_lines = [lyric for lyric in couplets_lyrics if len(lyric.split()) == 1]
for lyric in single_word_lines:
    print(f"Line: {lyric}")

In [None]:
# Largest per-line word count in couplets_lens.
# NOTE(review): this is a different variable from `couplet_maxlen` (no "s"),
# which is derived later from the tokenized n-gram sequences.
couplets_maxlen = max(couplets_lens)
# Padding side. The pad_sequences calls visible in this section pass 'pre' as
# a literal instead of reading this variable.
padding_type = "pre"

In [None]:
# Fit a word-level tokenizer on the verse lines. The custom filter string
# leaves out '!', '-', '.' and the apostrophe, so those characters stay part
# of the tokens.
couplet_tokenizer = Tokenizer(
    oov_token="<OOV>",
    filters='"#$%&()*+,/:;<=>?@[\\]^_`{|}~\t\n',
)
couplet_tokenizer.fit_on_texts(couplets_lyrics)
couplet_word_index = couplet_tokenizer.word_index
print(couplet_word_index)

# Expand every tokenized line into its prefixes of length 2..len(line); each
# prefix later becomes one training example (all tokens but the last -> last).
couplet_input_sequences = [
    token_ids[:end]
    for token_ids in couplet_tokenizer.texts_to_sequences(couplets_lyrics)
    for end in range(2, len(token_ids) + 1)
]

couplet_maxlen = max(len(sequence) for sequence in couplet_input_sequences)

# Left-pad with zeros so that the label is always the final column.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    couplet_input_sequences,
    maxlen=couplet_maxlen,
    padding='pre',
)

# Inputs are all columns but the last; labels are the last column, one-hot
# encoded over the vocabulary plus one extra class for index 0.
couplet_vocab_size = len(couplet_word_index) + 1
couplet_xs = padded_sequences[:, :-1]
couplet_ys = tf.keras.utils.to_categorical(
    padded_sequences[:, -1],
    num_classes=couplet_vocab_size,
)

In [None]:
# Next-word classifier: embedding -> three stacked bidirectional LSTMs ->
# softmax over len(couplet_word_index) + 1 classes.
couplet_vocab_size = len(couplet_word_index) + 1

couplet_inputs = tf.keras.layers.Input(shape=(couplet_xs.shape[1],))
couplet_x = tf.keras.layers.Embedding(
    input_dim=couplet_vocab_size,
    output_dim=256,
)(couplet_inputs)

# The first two recurrent layers emit the full sequence for the next layer;
# only the last one collapses it to a single vector.
for keep_sequence in (True, True, False):
    recurrent = tf.keras.layers.LSTM(256, return_sequences=keep_sequence)
    couplet_x = tf.keras.layers.Bidirectional(recurrent)(couplet_x)

couplet_outputs = tf.keras.layers.Dense(
    couplet_vocab_size,
    activation='softmax',
)(couplet_x)

couplet_model = tf.keras.models.Model(inputs=couplet_inputs, outputs=couplet_outputs)
couplet_model.summary()

In [None]:
# Training configuration: categorical cross-entropy over the one-hot labels,
# Adam with its default settings, accuracy reported alongside the loss.
epochs = 100
couplet_optimizer = tf.keras.optimizers.Adam()
couplet_loss = tf.keras.losses.CategoricalCrossentropy()

couplet_model.compile(
    optimizer=couplet_optimizer,
    loss=couplet_loss,
    metrics=['accuracy'],
)

In [None]:
# Shape of the model inputs: couplet_xs is padded_sequences with its last
# column removed, i.e. one row per n-gram prefix.
print(couplet_xs.shape)

In [None]:
# Train on the (prefix, next-word) pairs; the returned History object holds
# the per-epoch 'loss' and 'accuracy' values.
couplet_history = couplet_model.fit(
    x=couplet_xs,
    y=couplet_ys,
    epochs=epochs,
)

In [None]:
# Plot the training curves, one figure per metric. The x axis is derived from
# the recorded history instead of a hard-coded 100, so the cell keeps working
# when `epochs` is changed.
for metric in ('loss', 'accuracy'):
    values = couplet_history.history[metric]
    fig, ax = plt.subplots()
    ax.plot(range(len(values)), values)
    ax.set(title=f"Couplet model: training {metric}", xlabel="epoch", ylabel=metric)
    plt.show()

In [None]:
# Total word count of each verse file, in the order of couplets_songs.
if ".DS_Store" in couplets_songs:
    couplets_songs.remove(".DS_Store")

couplet_lengths = []
for file in couplets_songs:
    # Explicit encoding: open()'s default depends on the platform locale.
    with open(os.path.join("data/Verse", file), 'r', encoding='utf-8') as f:
        # Iterate the file lazily; no need to materialize it with readlines().
        couplet_lengths.append(sum(len(line.split()) for line in f))

In [None]:
# Convert the per-song word counts to an array, then show the counts and
# their mean on separate lines.
couplet_lengths = np.array(couplet_lengths)
print(couplet_lengths, couplet_lengths.mean(), sep="\n")

In [None]:
# Greedy text generation: repeatedly feed the text produced so far to the
# model and append the most probable next word.
# NOTE(review): 91 is presumably the rounded mean verse length printed by the
# previous cell — confirm.
COUPLET_WORDS_TO_GENERATE = 91

couplet_seed = ""
for step in range(COUPLET_WORDS_TO_GENERATE):
    tokenized_seed = couplet_tokenizer.texts_to_sequences([couplet_seed])[0]
    # The model input is one token shorter than the longest training n-gram
    # (its last token was split off as the label). pad_sequences left-pads
    # short seeds and, by default, truncates long ones from the front.
    padded_seed = tf.keras.preprocessing.sequence.pad_sequences(
        [tokenized_seed],
        maxlen=couplet_maxlen - 1,
        padding='pre',
    )
    # verbose=0: otherwise every one of these calls prints a progress bar.
    probs = couplet_model.predict(padded_seed, verbose=0)

    # Class 0 is the padding index and has no entry in index_word, so it is
    # excluded from the argmax (the +1 maps back to the real class index).
    pred = int(np.argmax(probs[0, 1:])) + 1

    next_word = couplet_tokenizer.index_word[pred]
    # Avoid a leading space when starting from an empty seed.
    couplet_seed = f"{couplet_seed} {next_word}" if couplet_seed else next_word

print(couplet_seed)

# Refrain