first, import all the needed libraries for our project:

In [29]:
# Import the dependencies
import numpy as np
import pandas as pd
import sys 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import LSTM, Activation, Flatten, Dropout, Dense, Embedding, TimeDistributed, CuDNNLSTM
from tensorflow.keras.callbacks import ModelCheckpoint
!pip install np_utils
from tensorflow.keras.utils import to_categorical
from keras.layers.wrappers import Bidirectional
import matplotlib.pyplot as plt



Load the dataset :

In [30]:
#Load the dataset
dataset = pd.read_csv('taylor_swift_lyrics.csv', encoding = "latin1")
dataset.head()

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006


Concatenate the lines of each song to get each song by its own in one string:

In [31]:
def processFirstLine(lyrics, songID, songName, row):
    lyrics.append(row['lyric'] + '\n')
    songID.append( row['year']*100+ row['track_n'])
    songName.append(row['track_title'])
    return lyrics,songID,songName
# define empty lists for the lyrics , songID , songName 
lyrics = []
songID = []
songName = []
# songNumber indicates the song number in the dataset
songNumber = 1
# i indicates the song number
i = 0
isFirstLine = True
# Iterate through every lyrics line and join them together for each song independently 
for index,row in dataset.iterrows():
    if(songNumber == row['track_n']):
        if (isFirstLine):
            lyrics,songID,songName = processFirstLine(lyrics,songID,songName,row)
            isFirstLine = False
        else :
            #if we still in the same song , keep joining the lyrics lines    
            lyrics[i] +=  row['lyric'] + '\n'
    #When it's done joining a song's lyrics lines , go to the next song :    
    else :
        lyrics,songID,songName = processFirstLine(lyrics,songID,songName,row)
        songNumber = row['track_n']
        i+=1

#lyrics

Define a new pandas DataFrame to save songID , songName , Lyric

In [32]:
lyrics_data = pd.DataFrame({'songID':songID, 'songName':songName, 'lyrics':lyrics })

Now save the lyrics in a text file to use it in the LSTM RNN :

In [33]:
# Save Lyrics in .txt file
with open('chainsmokersLyricsText.txt', 'w',encoding="utf-8") as filehandle:  
    for listitem in lyrics:
        filehandle.write('%s\n' % listitem)

Define a new pandas DataFrame to save songID , songName , Lyric

After getting the wanted data from the dataset, we need to preprocess it.

#**Preprocessing The Lyrics**#

Based on the code written above, only the lyrics in "taylor_swift_lyrics.csv" is compiler in "lyricsText.txt", where, every line of a song is written in a new line, and evey new song is seperated by a line gap.

Now at this juncture, we can change the conetnts of the lyricsText.txt, to contain the songs of other artists as well for the lyrics generator to work for specific domains like "electronic music", "jazz", "pop", etc, it is recommended to clump the lyrics of artists of that particular genre only, to avoid generating ambiguous results

1- Convert the lyrics to lowercase :

In [35]:
# Load the dataset and convert it to lowercase :
textFileName = 'chainsmokersLyricsText.txt'
raw_text = open(textFileName, encoding = 'UTF-8').read()
raw_text = raw_text.lower()
#print(raw_text)

2- Mapping characters :
Make two dictionaries , one to convert chars to ints, the other to convert ints back to chars :

In [36]:
# Mapping chars to ints :
chars = sorted(list(set(raw_text)))
int_chars = dict((i, c) for i, c in enumerate(chars))
chars_int = dict((i, c) for c, i in enumerate(chars))

print(int_chars)
print(chars_int)

{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '&', 5: "'", 6: '(', 7: ')', 8: '+', 9: ',', 10: '-', 11: '.', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '7', 19: '8', 20: '9', 21: ':', 22: ';', 23: '<', 24: '>', 25: '?', 26: 'a', 27: 'b', 28: 'c', 29: 'd', 30: 'e', 31: 'f', 32: 'g', 33: 'h', 34: 'i', 35: 'j', 36: 'k', 37: 'l', 38: 'm', 39: 'n', 40: 'o', 41: 'p', 42: 'q', 43: 'r', 44: 's', 45: 't', 46: 'u', 47: 'v', 48: 'w', 49: 'x', 50: 'y', 51: 'z', 52: '\x85', 53: '\x91', 54: '\x93', 55: '\x94', 56: '\x96', 57: '\x97'}
{'\n': 0, ' ': 1, '!': 2, '"': 3, '&': 4, "'": 5, '(': 6, ')': 7, '+': 8, ',': 9, '-': 10, '.': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '7': 18, '8': 19, '9': 20, ':': 21, ';': 22, '<': 23, '>': 24, '?': 25, 'a': 26, 'b': 27, 'c': 28, 'd': 29, 'e': 30, 'f': 31, 'g': 32, 'h': 33, 'i': 34, 'j': 35, 'k': 36, 'l': 37, 'm': 38, 'n': 39, 'o': 40, 'p': 41, 'q': 42, 'r': 43, 's': 44, 't': 45, 'u': 46, 'v': 47, 'w': 48, 'x': 49, 'y': 50, 'z': 51, '\x85': 

Get number of chars and vocab in our text :

In [37]:
n_chars = len(raw_text)
n_vocab = len(chars)
print('Total Characters : ' , n_chars) # number of all the characters in lyricsText.txt
print('Total Vocab : ', n_vocab) # number of unique characters

Total Characters :  173698
Total Vocab :  58


3- Make samples and labels :
Make samples and labels to feed the LSTM RNN

In [38]:
# process the dataset:
seq_len = 100
data_X = []
data_y = []
for i in range(0, n_chars - seq_len, 1):
    # Input Sequeance(will be used as samples)
    seq_in  = raw_text[i:i+seq_len]
    # Output sequence (will be used as target)
    seq_out = raw_text[i + seq_len]
    # Store samples in data_X
    data_X.append([chars_int[char] for char in seq_in])
    # Store targets in data_y
    data_y.append(chars_int[seq_out])
n_patterns = len(data_X)
print( 'Total Patterns : ', n_patterns)

#print(seq_in)
print(seq_out)
print(data_X)
#print(data_y)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)






4- Prepare the samples and labels :
prepare the samples and labels to be ready to go into our model.
1. Reshape the samples
2. Normalize them
3. One hot encode the output targets

In [39]:
# Reshape X to be suitable to go into LSTM RNN :
X = np.reshape(data_X , (n_patterns, seq_len, 1))
# Normalizing input data :
X = X/ float(n_vocab)
# One hot encode the output targets :
y = to_categorical(data_y)

print(X.shape)
print(y.shape)


(173598, 100, 1)
(173598, 58)


After we finished processing the dataset , we will start building our LSTM RNN model .

#**Building The Model :**#

We will start by determining how many layers our model will has , and how many nodes each layer will has :

In [40]:
LSTM_layer_num = 4 # number of LSTM layers
layer_size = [256,256,256,256] # number of nodes in each layer

Define a sequential model :

In [41]:
model = keras.Sequential()

LSTM layer VS CuDNNLSTM layer :
The main difference is that LSTM uses the CPU and CuDNNLSTM uses the GPU , that’s why CuDNNLSTM is much faster than LSTM , it is x15 faster.
This is the reason that made me use CuDNNLTSM instead of LSTM .
Note : make sure to change the run time setting of colab to use its GPU .


Add an input layer :

In [42]:
model.add(CuDNNLSTM(layer_size[0], input_shape =(X.shape[1], X.shape[2]), return_sequences = True))

Add some hidden layers :

In [43]:
for i in range(1,LSTM_layer_num) :
    model.add(Bidirectional(CuDNNLSTM(layer_size[i], return_sequences=True)))
    #model.add(Dropout(0.2))......ex(2)
model.add(Dropout(0.2)) #......ex(3)

Flatten the data that is coming from the last hidden layer to input it to the output layer :

In [44]:
model.add(Flatten())

Add an output layer and define its activation function to be ‘softmax’
and then compile the model with the next params :
1. loss = ‘categorical_crossentropy’
2. optimizer = ‘adam’

In [45]:
model.add(Dense(y.shape[1],activation='softmax'))
#model.add(Dropout(0.2))......ex(1)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

Print a summary of the model to see some details :

In [46]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 cu_dnnlstm_4 (CuDNNLSTM)    (None, 100, 256)          265216    
                                                                 
 bidirectional_3 (Bidirectio  (None, 100, 512)         1052672   
 nal)                                                            
                                                                 
 bidirectional_4 (Bidirectio  (None, 100, 512)         1576960   
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 100, 512)         1576960   
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 100, 512)          0         
                                                      

After we defined the model , we will define the needed callbacks.

A callback is a function that is called after every epoch
in our case we will call the checkpoint callback , what a checkpoint callback does is saving the weights of the model every time the model gets better.

In [47]:
# Configure the checkpoint :
checkpoint_name = 'Weights-LSTM-improvement-{epoch:03d}-{loss:.5f}-bigger.hdf5'
checkpoint = ModelCheckpoint(checkpoint_name, monitor='loss', verbose = 1, save_best_only = True, mode ='min')
callbacks_list = [checkpoint]


#**Training**#

In [25]:
# Fit the model :
model_params = {'epochs':30,
                'batch_size':128,
                'callbacks':callbacks_list,
                'verbose':1,
                'validation_split':0.2,
                'validation_data':None,
                'shuffle': True,
                'initial_epoch':0,
                'steps_per_epoch':None,
                'validation_steps':None}

history = model.fit(X,
          y,
          epochs = model_params['epochs'],
           batch_size = model_params['batch_size'],
           callbacks= model_params['callbacks'],
           verbose = model_params['verbose'],
           validation_split = model_params['validation_split'],
           validation_data = model_params['validation_data'],
           shuffle = model_params['shuffle'],
           initial_epoch = model_params['initial_epoch'],
           steps_per_epoch = model_params['steps_per_epoch'],
           validation_steps = model_params['validation_steps'])

Epoch 1/30


KeyboardInterrupt: ignored

In [50]:
history = model.fit(X,
          y,
          epochs = model_params['epochs'],
           batch_size = model_params['batch_size'],
           callbacks= model_params['callbacks'],
           verbose = model_params['verbose'],
           validation_split = model_params['validation_split'],
           validation_data = model_params['validation_data'],
           shuffle = model_params['shuffle'],
           initial_epoch = model_params['initial_epoch'],
           steps_per_epoch = model_params['steps_per_epoch'],
           validation_steps = model_params['validation_steps'])

Epoch 1/30
  69/1085 [>.............................] - ETA: 2:19 - loss: 2.7756

KeyboardInterrupt: ignored

 **Load the Weights**

In [52]:
# Load wights file :
weights_file = '/content/Weights-LSTM-improvement-029-0.04040-bigger.hdf5' # weights file path
model.load_weights(weights_file)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

#**Generating lyrics**#
We first pick a random seed , then we will use it to generate lyrics character by character .

In [53]:
# set a random seed :
start = np.random.randint(0, len(data_X)-1)
pattern = data_X[start]
print('Seed : ')
print("\"",''.join([int_chars[value] for value in pattern]), "\"\n")
# How many characters you want to generate
generated_characters = 500
# Generate Charachters :
for i in range(generated_characters):
    x = np.reshape(pattern, ( 1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x,verbose = 0)
    index = np.argmax(prediction)
    result = int_chars[index]
    #seq_in = [int_chars[value] for value in pattern]

    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print('\nDone')

print(pattern)

Seed : 
"  swallowing my pride
standing in front of you saying i'm sorry for that night
and i'd go back to dec "

ember...
it turns out freedom ain't nothing but missing you
wishing i'd realizedd halling start this eeel mine
i'l go in your whcte's all reml me mame a paint in the criving you oever i'ml be
bry on uhe only is aack around
i stay i'm not i'l never know
that you tay you had wat the ptt of ne
luch ne
i knew you seml tr me, oh
one noe, to me
maughng oo the swees tear eack and i lnow standing in a night to wous fine is red fere weat i wanna firl, i said
you feel oo meave he's alone,vay that you're g
Done
[33, 45, 1, 45, 40, 1, 48, 40, 46, 44, 1, 31, 34, 39, 30, 1, 34, 44, 1, 43, 30, 29, 1, 31, 30, 43, 30, 1, 48, 30, 26, 45, 1, 34, 1, 48, 26, 39, 39, 26, 1, 31, 34, 43, 37, 9, 1, 34, 1, 44, 26, 34, 29, 0, 50, 40, 46, 1, 31, 30, 30, 37, 1, 40, 40, 1, 38, 30, 26, 47, 30, 1, 33, 30, 5, 44, 1, 26, 37, 40, 39, 30, 9, 47, 26, 50, 1, 45, 33, 26, 45, 1, 50, 40, 46, 5, 43, 30, 1, 32]


In [None]:
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()