In [2]:
# Load the input data. The context manager closes the file handle as soon as
# the contents have been read instead of leaving it open for the kernel's lifetime.
with open("lyrics.txt") as lyrics_file:
    data = lyrics_file.read()

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,LSTM,Bidirectional,Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.utils as ku 
import numpy as np 

In [5]:
# Lower-case the whole text, then cut it at newlines so that every line of
# the input becomes one sentence of the corpus.
lowercased_text = data.lower()
corpus = lowercased_text.split("\n")

Tokenize the sentences (convert each word to an integer index) and build the list of training sequences: every sentence is expanded into its growing prefixes (first 2 words, first 3 words, and so on), from which the X and y features are derived below.

In [6]:
# Build the vocabulary from every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size: number of entries in the tokenizer's word index, plus one.
total_words = len(tokenizer.word_index) + 1
print(total_words)

# Expand each tokenized line into its prefixes of length 2..len(line).
# Lines that tokenize to fewer than two ids contribute nothing.
input_lines = [
    token_ids[:end]
    for token_ids in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    for end in range(2, len(token_ids) + 1)
]

7634


Pad the sequences so that they all have the same length.

In [7]:
# Length of the longest prefix sequence; every sequence is padded at the
# front ("pre") up to this length.
max_line_len = max(len(seq) for seq in input_lines)
input_lines = np.array(
    pad_sequences(input_lines, maxlen=max_line_len, padding="pre")
)

# Creating X (predictors) and Y (labels): the last column of each padded row
# is the target word id, everything before it is the model input.
pred = input_lines[:, :-1]
labels = ku.to_categorical(input_lines[:, -1], num_classes=total_words)

Set up the model (feel free to tune the parameters)

In [8]:
# Next-word prediction network: embedding -> bidirectional LSTM -> dropout
# -> LSTM -> dense (ReLU, L2-regularised) -> softmax over the vocabulary.
model = Sequential()
# Inputs are the padded sequences minus their last token, hence max_line_len - 1.
model.add(Embedding(total_words, 100, input_length=max_line_len - 1))
# return_sequences=True so the following LSTM receives the full sequence.
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(128))
# Dense expects an integer unit count; floor division keeps it an int
# (true division would hand the layer a float).
model.add(
    Dense(
        total_words // 2,
        activation="relu",
        kernel_regularizer=regularizers.l2(0.01),
    )
)
# One output probability per vocabulary entry.
model.add(Dense(total_words, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 23, 100)           763400    
_________________________________________________________________
bidirectional (Bidirectional (None, 23, 512)           731136    
_________________________________________________________________
dropout (Dropout)            (None, 23, 512)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               328192    
_________________________________________________________________
dense (Dense)                (None, 3817)              492393    
_________________________________________________________________
dense_1 (Dense)              (None, 7634)              29146612  
Total params: 31,461,733
Trainable params: 31,461,733
Non-trainable params: 0
____________________________________________

In [9]:
 callback = EarlyStopping(monitor='loss', patience=5)  # stop training once the training loss has not improved for 5 consecutive epochs

Train and save the model

In [None]:
# Fit on the (prefix, next-word) pairs. The early-stopping callback may end
# the run before all 250 epochs have been used.
history = model.fit(
    pred,
    labels,
    batch_size=50,
    epochs=250,
    verbose=2,
    callbacks=[callback],
)

# Persist the trained network to disk.
model.save("lyricsgen.h5")

Plot the training accuracy and loss curves

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
acc = history.history['accuracy']
loss = history.history['loss']

epochs = range(len(acc))

# Accuracy curve.
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.title('Training accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()  # without this call the curve's label is never displayed

# Loss curve, drawn on its own figure.
plt.figure()

plt.plot(epochs, loss, 'b', label='Training Loss')
plt.title('Training loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

Save the model to your system

In [None]:
# Write the trained model to 'lyrics.h5' (relative to the current working directory).
saved_model_path = 'lyrics.h5'
model.save(saved_model_path)