In [56]:
import io
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation
import sys
import warnings
warnings.filterwarnings("ignore")

In [112]:
source = 'avicii_lyrics.txt'
# NOTE(review): no `encoding` is passed, so the platform default is used —
# confirm it matches how the lyrics file was saved.
with io.open(source, 'r') as corpus_file:
    # Read the contents of the file and convert the text to lowercase.
    # The handle gets its own name so `corpus` only ever refers to the text.
    corpus = corpus_file.read().lower()

corpus[:250]

"feeling my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to start\n\nthey tell me i'm too young to understand they say i'm caught up in a dream well life will pass me by if i don't open up m"

In [113]:
# Handle the three-character mis-encoded sequence first, while it is still intact,
# then turn every ';' into '?'.
# NOTE(review): the sequence is replaced with ' a' (presumably to expand "i'm"-style
# contractions into "i am") — this applies to every occurrence; verify that is intended.
corpus = corpus.replace('â€™',' a').replace(';','?')

# Stray single characters that are dropped from the corpus entirely
symbols_to_remove = ['¶', '˜','¦', 'â', 'ã', '€','¤','™']
corpus = corpus.translate(str.maketrans('', '', ''.join(symbols_to_remove)))

corpus[:250]

"feeling my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to start\n\nthey tell me i'm too young to understand they say i'm caught up in a dream well life will pass me by if i don't open up m"

In [114]:
# Vocabulary: every distinct character of the corpus, in sorted order.
# (The `chars_lenght` spelling is kept because other cells reference that name.)
chars = sorted(set(corpus))
chars_lenght = len(chars)
chars_lenght

51

In [115]:
# Display the sorted vocabulary
chars

['\n',
 ' ',
 '!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '?',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '”']

<span style="color:red; font-weight:bold; font-size:larger;">**Mapping Characters to Indexes and Vice Versa for Better Training**</span><br><br>
**When we work with models that analyze text, such as an LSTM model, we need a method to change the text into numbers. This helps the model understand and work with the text.**<br><br>
<span style='color:green'>**We use a process called mapping**</span> **, which involves assigning unique numbers to each character in the text. This mapping allows the model to learn patterns and relationships in the text.**<br>
<span style='color:green'>**Mapping characters to numbers also helps with efficient indexing and searching**</span>**. In LSTM models, we often need to find and change specific characters in the text. By creating dictionaries that connect characters with their corresponding numbers, we can quickly find the number for a character or retrieve the character for a given number. This saves time and avoids searching through the entire text repeatedly.**<br>
<span style='color:green'>**The dictionaries we create also help us manage the vocabulary of the model**</span>**. By mapping characters to numbers, we establish a consistent way to represent the vocabulary. This makes it easier to add or change words in the vocabulary when necessary. The dictionaries keep track of which number corresponds to which character, making it simpler to handle the model's vocabulary.**<br><br>
**In summary, by creating dictionaries that map characters to numbers, we provide a useful way to represent, process, and change text data in models like LSTM. These dictionaries are important for converting text into numbers, efficient indexing and searching, and effective management of the model's vocabulary.**

In [116]:
"""
Create dictionaries mapping characters to indices and versa.

Args:
    chars (list): List of unique characters.

Returns:
    dict: Dictionary mapping characters to indexes.
    dict: Dictionary mapping indexes to characters.
"""
def create_dictionaries(chars):

    return dict((c, i) for i, c in enumerate(chars)), dict((i, c) for i, c in enumerate(chars))

# Build both lookup tables from the vocabulary and display the character -> index one
char_to_index, index_to_char =  create_dictionaries(chars)
char_to_index

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '4': 14,
 '5': 15,
 '6': 16,
 '7': 17,
 '8': 18,
 '9': 19,
 ':': 20,
 '?': 21,
 '[': 22,
 ']': 23,
 'a': 24,
 'b': 25,
 'c': 26,
 'd': 27,
 'e': 28,
 'f': 29,
 'g': 30,
 'h': 31,
 'i': 32,
 'j': 33,
 'k': 34,
 'l': 35,
 'm': 36,
 'n': 37,
 'o': 38,
 'p': 39,
 'q': 40,
 'r': 41,
 's': 42,
 't': 43,
 'u': 44,
 'v': 45,
 'w': 46,
 'x': 47,
 'y': 48,
 'z': 49,
 '”': 50}

<span style="color:red; font-weight:bold; font-size:larger;">**Prepare the Data for the LSTM model**</span><br>

In [117]:
"""
Create sequences and labels from the given text.

Args:
    corpus (str): The input text corpus.
    window_size (int): Length of each sequence.
    step (int): Number of steps between each sequence.

Returns:
    list: List of sequences (windows).
    list: List of labels.
"""
def create_sequences(corpus, window_size, step):

    windows = []
    labels = []
    # Generate sequences and labels
    for i in range(0, len(corpus) - window_size, step):
        windows.append(corpus[i: i + window_size])
        labels.append(corpus[i + window_size])

    return windows, labels

# Every training sample is a window of 120 characters; consecutive windows start 1 character apart
window_size = 120
step = 1

# values: the input windows, labels: the character that follows each window
values, labels = create_sequences(corpus, window_size, step)
print('\nCharacters in windows:\n', values[:15], '\n') # First 15 windows
print('\nNext Character foreach window:\n', labels[:15]) # Character following each of those windows


Characters in windows:
 ["feeling my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where t", "eeling my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to", "eling my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to ", "ling my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to s", "ing my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to st", "ng my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to sta", "g my way through the darkness guided by a beating heart i can't tell where the journey will end but i know where to star", " my way through the darkness guided by a beating heart i can't tell where the journey will end but i know

<span style="color:red; font-weight:bold; font-size:larger;">**One Hot Encode: Convert sequences and labels to one-hot arrays**</span><br><br>
**When we convert sequences and labels into one-hot arrays, we create a useful numerical format for the LSTM model. This format helps the model understand and learn from the data we provide, including both the input sequences and the target labels. In one-hot arrays, each character or label is represented as a binary vector. Each element in the vector tells us whether a particular character or label is present or not. This binary representation helps the model process and make sense of the data effectively.**

In [118]:
"""
Convert sequences and labels to one-hot arrays.

Args:
    sequences (list): List of sequences of characters (values).
    window_size (int): Length of each sequence.
    chars_length (int): Length of the array of unique characters.
    char_to_index (list): A dictionary list mapping unique characters to their indices.
    labels (list): List of labels.

Returns:
    np.ndarray: Array of sequences encoded as one-hot arrays.
    np.ndarray: Array of labels encoded as one-hot arrays.

A One-Hot array is represented in the following form:

                      a b c d.....
                     [1 0 0 0.....]
                     [0 1 0 0.....]
                     [0 0 1 0.....]
"""
def convert_to_one_hot_arrays(sequences, window_size, chars_length, char_to_index, labels):
    
    # Create empty arrays of appropriate shape for sequences (x) and labels (y)
    x = np.zeros((len(sequences), window_size, chars_length), dtype=np.bool)
    y = np.zeros((len(sequences), chars_length), dtype=np.bool)

    # Iterate over each sequence and its corresponding label
    for i, sentence in enumerate(sequences):
        # Encode each character in the sequence as a one-hot vector
        for j, char in enumerate(sentence):
            x[i, j, char_to_index[char]] = 1
        # Encode the label as a one-hot vector
        y[i, char_to_index[labels[i]]] = 1

    return x, y

# One-hot encode the windows (X) and their next-character labels (y), then show both array shapes
X, y = convert_to_one_hot_arrays(values, window_size, chars_lenght, char_to_index, labels)
X.shape, y.shape

((71933, 120, 51), (71933, 51))

In [119]:
# One-hot label row for the first window
y[0]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False])

<span style="color:red; font-weight:bold; font-size:larger;">**Create the model**</span><br>

In [65]:
# The layer stack is declared in one list; layers are applied in the order given.
model = Sequential([
    # LSTM input shape: [samples, time steps, features]
    LSTM(1024, input_shape=(window_size, chars_lenght)),
    # Intermediate fully connected layer; no activation is passed, so it is a linear projection
    Dense(64),
    # Output layer with one unit per unique character
    Dense(chars_lenght),
    # Softmax turns the outputs into a probability distribution over the characters,
    # which is the form the categorical cross-entropy loss expects
    Activation('softmax'),
])

# Compile the model using the RMSprop optimizer and categorical cross-entropy loss
model.compile(optimizer='RMSprop', loss='categorical_crossentropy')

# Print the summary of the model architecture
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 1024)              4407296   
                                                                 
 dense_6 (Dense)             (None, 64)                65600     
                                                                 
 dense_7 (Dense)             (None, 51)                3315      
                                                                 
 activation_3 (Activation)   (None, 51)                0         
                                                                 
Total params: 4,476,211
Trainable params: 4,476,211
Non-trainable params: 0
_________________________________________________________________


In [66]:
# Fit the model on the one-hot windows (X) and next-character labels (y)
model.fit(X, y, batch_size=512, epochs=25)

# Save the model weights to a file (file name records the 25-epoch run)
model.save_weights('model_weights_25epochs.h5')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [135]:
"""
Generate text based on a given model and initial text.

Args:
    model (Sequential): The trained LSTM model.
    text (str): The initial text to start generating from.
    window_size (int): The size of the window for generating sequences.
    chars_length (int): The length of the array of unique characters.
    char_to_index (dict): A dictionary mapping unique characters to their indices.
    index_to_char (dict): A dictionary mapping indices to their corresponding characters.

Returns:
    str: The generated text.
"""

def generate_text(model, text, window_size, chars_length, char_to_index, index_to_char):
    
    # Convert the text to lowercase
    text = text.lower()
    # Print the initial text
    sys.stdout.write(text)

    # Generate the next 200 characters based on the text
    for i in range(200):
        X = np.zeros((1, window_size, chars_length))

        # Convert the characters in the text to one-hot arrays
        for j, character in enumerate(text):
            X[0, j, char_to_index[character]] = 1.

        # Predict the next character based on the one-hot encoded seed
        predictions = model.predict(X, verbose=0)[0]
        index = np.argmax(predictions)

        # Convert the predicted index back to the corresponding character
        next_char = index_to_char[index]

        # Update the seed by removing the first character and appending the next character
        text = text[1:] + next_char

        # Print the next character
        sys.stdout.write(next_char)
        sys.stdout.flush()

    return text

**The LSTM model was Trained for 12 Epochs | Taking 9.4 hours to Run**

In [144]:
text=" They tell me I'm too young to understand They say I'm caught up in a dream Well life will pass me by if I don't open up"

generate_text(model, text, window_size, chars_lenght, char_to_index, index_to_char)

 they tell me i'm too young to understand they say i'm caught up in a dream well life will pass me by if i don't open up my eyes
well that's fine by me


st wakh me hemp fightrenter

all the ting to the with you ball rang
these are the days we won't rogher
tell me ahay you see love i wanna love ya,
i wanna love ya, lik

**The LSTM model was Trained for 25 Epochs | Taking 19 hours to Run**

In [136]:
text=" They tell me I'm too young to understand They say I'm caught up in a dream Well life will pass me by if I don't open up"

generate_text(model, text, window_size, chars_lenght, char_to_index, index_to_char)

 they tell me i'm too young to understand they say i'm caught up in a dream well life will pass me by if i don't open up my eyes
well that's fine by me


so wake me up when it's all over
when i'm wiser and i'm older
all this time i was finding myself and i dill on your love and it has it all just fades away
it all just

**When comparing the same model trained for different numbers of epochs, the LSTM model trained for 25 epochs showed better results. However, it still could not reproduce the actual lyrics across all 200 generated characters. In conclusion, even though the LSTM model improved a lot, its output did not exactly match the actual lyrics over the whole sequence of 200 characters.**<br><br>
**The actual lyrics are shown below.**

![](Screenshot_4.jpg)