In [1]:
# Training a neural network to generate new Leonard Cohen ❤ lyrics using Recurrent Neural Networks.
# Uses Long Short-Term Memory (LSTM) to predict the next probable word based on previous trends and vice versa.
# We use a bidirectional LSTM on the training corpus to achieve the required results.
# author = Srinjana Pathak

SyntaxError: ignored

In [2]:
#IMPORTING NECESSARY PACKAGES

import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [3]:
#READING THE INPUT DATASET

# Read the whole lyrics file. The `with` block closes the file handle as soon as
# the read finishes, and the explicit encoding keeps decoding independent of the
# platform's default locale.
with open('leonard_cohen.txt', encoding='utf-8') as lyrics_file:
    data = lyrics_file.read()

# Lower-case the text and split it into one corpus entry per line.
corpus = data.lower().split("\n")

In [4]:
#PREPROCESSING

# Build the word -> integer id vocabulary from every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Word ids start at 1, so the extra slot accounts for id 0, which the padding
# step fills in (no OOV token is configured on this Tokenizer).
word_index = tokenizer.word_index
total_words = 1 + len(word_index)

print(word_index)
print(total_words)


{'the': 1, 'i': 2, 'you': 3, 'and': 4, 'to': 5, 'a': 6, 'of': 7, 'me': 8, 'your': 9, 'in': 10, 'my': 11, 'that': 12, 'it': 13, 'for': 14, 'was': 15, 'on': 16, 'is': 17, 'but': 18, 'love': 19, "i'm": 20, "it's": 21, 'like': 22, 'with': 23, 'all': 24, 'they': 25, 'be': 26, 'he': 27, 'knows': 28, 'there': 29, 'this': 30, 'when': 31, 'no': 32, 'lover': 33, 'everybody': 34, 'if': 35, 'who': 36, 'so': 37, 'not': 38, 'we': 39, 'are': 40, "don't": 41, 'by': 42, 'know': 43, 'what': 44, 'just': 45, 'now': 46, 'from': 47, 'her': 48, 'will': 49, 'one': 50, 'come': 51, 'hallelujah': 52, 'time': 53, 'dance': 54, 'need': 55, 'got': 56, 'do': 57, 'said': 58, 'through': 59, 'want': 60, 'have': 61, 'see': 62, 'were': 63, 'oh': 64, 'night': 65, 'go': 66, 'then': 67, 'back': 68, 'never': 69, 'say': 70, 'or': 71, 'she': 72, 'his': 73, 'where': 74, 'heart': 75, "you're": 76, "i've": 77, 'tell': 78, 'take': 79, "there's": 80, "that's": 81, 'again': 82, 'up': 83, 'life': 84, 'been': 85, "can't": 86, 'repent':

In [5]:
#CREATING TRAINING DATA

input_sequences = []

for line in corpus:
    # Encode the current line as a list of integer word ids.
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Collect every prefix of the line that is at least two tokens long:
    # the leading tokens are the context and the final token is what comes next.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )
    

In [6]:
#PADDING SEQUENCES

# Length of the longest n-gram; every sequence is pre-padded up to this length.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

In [7]:
#ADDING FEATURES AND LABELS.

# In each padded row, every column except the last is a feature (x) and the
# last column is the label (y).
# E.g. [0 0 0 0 0 3 4 5]  -> features [0 0 0 0 0 3 4], label 5

xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

# Convert the class vector (labels) into a binary class matrix with
# total_words columns.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [None]:
#TRAINING THE NEURAL NETWORK

# Embedding -> bidirectional LSTM -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(total_words, 240, input_length=max_sequence_len - 1),
    Bidirectional(LSTM(150)),
    Dense(total_words, activation='softmax'),
])

adam = Adam(learning_rate=0.01)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# Keep the object returned by fit(); the accuracy plot reads its `history` attribute.
history = model.fit(xs, ys, epochs=100, verbose=1)

Epoch 1/100

In [None]:
#Checking Accuracy (HOUSE-KEEPING)

# pyplot is the submodule that provides plot/xlabel/ylabel/title/show; the
# top-level matplotlib package does not expose them.
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch number.

    Args:
        history: object whose ``history`` attribute maps metric names to the
            values recorded during training (here, the return value of
            ``model.fit``).
        string: name of the metric to plot, e.g. ``'accuracy'``; also used as
            the y-axis label.
    """
    plt.plot(history.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.title("Accuracy Index")
    plt.show()

plot_graphs(history, 'accuracy')

In [None]:
#GENERATING NEW TEXT

#seed_text = input()
#Ideally the seed_text would come from user input, but hard-coding it speeds things up a bit.

seed_text = 'I had you in my arms'
next_words = 100

for _ in range(next_words):
    # Encode the text generated so far and pre-pad it to the model's input length.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

    # Sequential.predict_classes is not available in current TensorFlow releases;
    # the argmax over the softmax output selects the same most-probable word id.
    # Indexing [0] and int() turn the single-row result into a plain scalar.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # index_word is the tokenizer's id -> word mapping, so no scan over
    # word_index is needed. An id without a word (such as the padding id 0)
    # appends an empty string, matching the previous fallback.
    output_word = tokenizer.index_word.get(predicted, "")

    seed_text += " " + output_word
    print(seed_text)