# Song Lyrics Generator

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Extraction

### Libraries

In [None]:
import pandas as pd
import numpy as np
import string

### Data Extraction

In [None]:
df = pd.read_csv('./data/lyrics.csv', sep = "\t")
df = df.dropna(subset=['lyrics'])
df.head()

Unnamed: 0,song_id,lyrics
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."
5,0bYg9bo50gSsH3LtXe2SQn,"[""[Intro]\nI-I-I don't want a lot for Christma..."
6,5hslUAKq9I9CG2bAulFkHN,['[Chorus]\nIt\'s the most wonderful time of t...


In [None]:
pdf = pd.read_csv('./data/PoetryFoundationData.csv', quotechar='"')
pdf.head()

Unnamed: 0,Title,Poem,Poet,Tags
0,\r\n\r\n Objects Used to Pr...,"\r\n\r\nDog bone, stapler,\r\n\r\ncribbage boa...",Michelle Menting,
1,\r\n\r\n The New Church\r\n...,\r\n\r\nThe old cupola glinted above the cloud...,Lucia Cherciu,
2,\r\n\r\n Look for Me\r\n\r\...,\r\n\r\nLook for me under the hood\r\n\r\nof t...,Ted Kooser,
3,\r\n\r\n Wild Life\r\n\r\n ...,"\r\n\r\nBehind the silo, the Mother Rabbit\r\n...",Grace Cavalieri,
4,\r\n\r\n Umbrella\r\n\r\n ...,\r\n\r\nWhen I push your button\r\n\r\nyou fly...,Connie Wanek,


## Cleansing the data

In [None]:
# This translator is used to remove punctuation from the text
translator = str.maketrans('', '', string.punctuation)

### Cleansing Lyrics

In [None]:
def split_text(x):
    text = x['lyrics'] # get the lyrics
#    print(text)
    sections = text.split('\\n\\n') # split the lyrics into sections
    # for s in sections:
        # print(s)
    keys = {'Verse 1': np.nan,'Verse 2':np.nan,'Verse 3':np.nan,'Verse 4':np.nan, 'Chorus':np.nan, 'Intro': np.nan}
    lyrics = str()
    single_text = []
    res = {}
    for s in sections:
        key = s[s.find('[') + 1:s.find(']')].strip()
        if len(key)>3 and key[0] == '"':
            key = key[2:]
        if ':' in key:
           key = key[:key.find(':')]
        # print(key)
        if key in keys:
           single_text += [x.lower().replace('(','').replace(')','').translate(translator) for x in s[s.find(']')+1:].split('\\n') if len(x) > 1]

    # print(single_text)

        res['single_text'] =  ' \n '.join(single_text)
    # print(res)
    return pd.Series(res)

In [None]:
df = df.join( df.apply(split_text, axis=1))
df.head()

Unnamed: 0,song_id,lyrics,single_text
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro...",found you when your heart was broke \n i fille...
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun...",astro yeah \n sun is down freezin cold \n that...
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t...",high high hopes \n had to have high high hopes...
5,0bYg9bo50gSsH3LtXe2SQn,"[""[Intro]\nI-I-I don't want a lot for Christma...",iii dont want a lot for christmas \n there is ...
6,5hslUAKq9I9CG2bAulFkHN,['[Chorus]\nIt\'s the most wonderful time of t...,its the haphappiest season of all \n with thos...


In [None]:
print(df.shape)
df.dropna(subset=['single_text'], inplace=True)
print(df.shape)

(19662, 3)
(19662, 3)


#### Saving the cleansed data

In [None]:
df.to_csv('./data/lyrics_clean.csv', sep = "\t", index = False)

#### Loading the cleansed data

In [None]:
df = pd.read_csv('./data/lyrics_clean.csv', sep = "\t")

### Cleansing Poems

In [None]:
# remove /r from the text
pdf['Poem'] = pdf['Poem'].apply(lambda x: x.replace('\r', ''))
# replace \n\n with \n
pdf['Poem'] = pdf['Poem'].apply(lambda x: x.replace('\n\n', '\n'))
pdf['Poem'] = pdf['Poem'].apply(lambda x: x.replace('\t', '\n'))
# remove first \n
pdf['Poem'] = pdf['Poem'].apply(lambda x: x[1:] if x[0] == '\n' else x)
# remove lines with less than 2 characters
# pdf['Poem'] = pdf['Poem'].apply(lambda x: '\n'.join([l for l in x.splitlines() if len(l)>1]))
pdf['single_text'] = pdf['Poem'].apply(lambda x: ' \n '.join([l.lower().strip().translate(translator) for l in x.splitlines() if len(l)>0]))
# pdf.head()
print(pdf['Poem'][0])
print(pdf['single_text'][0])

0    Dog bone, stapler,\ncribbage board, garlic pre...
0    At the high school football game, the boys\nst...
0                                               #1 ...
0    The truth is, I’ve never cared for the Nationa...
0    Part of suffering is the useless urge to annou...
                           ...                        
0    They eat beans mostly, this old yellow pair.  ...
0    The accumulation of reefs\npiling up one over ...
0              Philosophic\nin its complex, ovoid e...
0                                                     
0              Philosophic\nin its complex, ovoid e...
Name: Poem, Length: 100, dtype: object
0    dog bone stapler \n cribbage board garlic pres...
0    at the high school football game the boys \n s...
0    1 college \n  \n we packed your satchel with s...
0    the truth is i’ve never cared for the national...
0    part of suffering is the useless urge to annou...
                           ...                        
0    they eat beans mostly

### Combinind the cleansed data

In [None]:
sum_df = pd.DataFrame(df['single_text'])
# sum_df = pd.concat([sum_df, pd.DataFrame(pdf['single_text'])], ignore_index=True)
sum_df.dropna(inplace=True)

#### Saving the combined data

In [None]:
sum_df.to_csv('./data/sum_data.csv', sep = "\t", index = False)

#### Loading the combined data

In [None]:
sum_df = pd.read_csv('./data/sum_data.csv', sep = "\t")

In [None]:
sum_df.head()

Unnamed: 0,single_text
0,found you when your heart was broke \n i fille...
1,astro yeah \n sun is down freezin cold \n that...
2,high high hopes \n had to have high high hopes...
3,iii dont want a lot for christmas \n there is ...
4,its the haphappiest season of all \n with thos...


## Recurrent Neural Network

### Finding the unique words

In [None]:
text_as_list = []

def extract_text(text):
   global text_as_list
   text_as_list += [w for w in text.split(' ') if w.strip() != '' or w == '\n']

sum_df['single_text'].apply(extract_text)

print("Total words " , len(text_as_list))

Total words  2634181


In [None]:
print(text_as_list[1000:2000])

['til', 'i', 'land', '\n', 'had', 'me', 'out', 'like', 'a', 'light', 'like', 'a', 'light', '\n', 'like', 'a', 'light', 'like', 'a', 'light', '\n', 'like', 'a', 'light', 'like', 'a', 'light', '\n', 'like', 'a', 'light', '\n', 'yeah', 'passed', 'the', 'dawgs', 'a', 'celly', '\n', 'sendin', 'texts', 'aint', 'sendin', 'kites', 'yeah', '\n', 'he', 'said', 'keep', 'that', 'on', 'lock', '\n', 'i', 'say', 'you', 'know', 'this', 'shit', 'its', 'stife', 'yeah', '\n', 'its', 'absolute', 'yeah', 'yeah', 'im', 'back', 'reboot', 'its', 'lit', '\n', 'laferrari', 'to', 'jamba', 'juice', 'yeah', 'skrrt', 'skrrt', '\n', 'we', 'back', 'on', 'the', 'road', 'they', 'jumpin', 'off', 'no', 'parachute', 'yeah', '\n', 'shawty', 'in', 'the', 'back', '\n', 'she', 'said', 'she', 'workin', 'on', 'her', 'glutes', 'yeah', 'oh', 'my', 'god', '\n', 'aint', 'by', 'the', 'book', 'yeah', 'this', 'how', 'it', 'look', 'yeah', '\n', 'bout', 'a', 'check', 'yeah', 'check', 'just', 'check', 'the', 'foots', 'yeah', '\n', 'pass'

In [None]:
freq = {}

for w in text_as_list:
    if w in freq:
        freq[w] += 1
    else:
        freq[w] = 1

print("Unique Words " , len(freq))


Unique Words  37659


#### Finding Common Words

In [None]:
uncommon_words = set([key for key in freq.keys() if freq[key] < 7]) # thala for a reason
words = sorted(set([key for key in freq.keys() if freq[key] >= 7])) # thala for a reason

print("Uncommon Words " , len(uncommon_words))
print("Common Words " , len(words))

Uncommon Words  27953
Common Words  9706


In [None]:
word_indices = dict((w, i) for i, w in enumerate(words))
indices_word = dict((i, w) for i, w in enumerate(words))

#### Forming the sequences

In [None]:
MIN_SEQ = 5

valid_seqs = []
end_seq_words = []
for i in range(len(text_as_list) - MIN_SEQ ):
   end_slice = i + MIN_SEQ + 1
   if len( set(text_as_list[i:end_slice]).intersection(uncommon_words) ) == 0:
       valid_seqs.append(text_as_list[i: i + MIN_SEQ])
       end_seq_words.append(text_as_list[i + MIN_SEQ])

In [None]:
print("Valid Sequences " , len(valid_seqs))
print("End Words " , len(end_seq_words))

for i in range(10):
    print(valid_seqs[i], " ", end_seq_words[i])

Valid Sequences  2366419
End Words  2366419
['found', 'you', 'when', 'your', 'heart']   was
['you', 'when', 'your', 'heart', 'was']   broke
['when', 'your', 'heart', 'was', 'broke']   

['your', 'heart', 'was', 'broke', '\n']   i
['heart', 'was', 'broke', '\n', 'i']   filled
['was', 'broke', '\n', 'i', 'filled']   your
['broke', '\n', 'i', 'filled', 'your']   cup
['\n', 'i', 'filled', 'your', 'cup']   until
['i', 'filled', 'your', 'cup', 'until']   it
['\n', 'took', 'it', 'so', 'far']   to


### Splitting the data

In [None]:
# Libraries
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(valid_seqs, end_seq_words, test_size=0.02, random_state=42)

In [None]:
print("Train Size ", len(X_train))
print("Test Size ", len(X_test))

Train Size  2319090
Test Size  47329


#### Save the split data

In [None]:
import pickle

In [None]:
data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test, 'word_indices': word_indices, 'indices_word': indices_word}

with open('./data/data.pkl', 'wb') as f:
    pickle.dump(data, f)

NameError: name 'X_train' is not defined

#### Load the split data

In [None]:
data = pickle.load(open('/content/drive/MyDrive/Colab Notebooks/data.pkl', 'rb'))

X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']
word_indices = data['word_indices']
indices_word = data['indices_word']

### Data generator

In [None]:
def generator(sentence_list, next_word_list, batch_size):
   index = 0
   # This loop will keep generating training data indefinitely
   while True:
       x = np.zeros((batch_size, MIN_SEQ), dtype=np.int32)
       y = np.zeros((batch_size), dtype=np.int32)
       for i in range(batch_size):
           for t, w in enumerate(sentence_list[index % len(sentence_list)]):
               x[i, t] = word_indices[w]
           y[i] = word_indices[next_word_list[index % len(sentence_list)]]
           index = index + 1
       yield x, y

In [None]:
def sample(preds, temperature=1.0):
   # helper function to sample an index from a probability array
   preds = np.asarray(preds).astype('float64')
   preds = np.log(preds) / temperature
   exp_preds = np.exp(preds)
   preds = exp_preds / np.sum(exp_preds)
   probas = np.random.multinomial(1, preds, 1)
   return np.argmax(probas)

In [None]:
def on_epoch_end(epoch, logs):
   # Function invoked at end of each epoch. Prints generated text.
   examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)
   # Randomly pick a seed sequence
   seed_index = np.random.randint(len(X_train+X_test))
   seed = (X_train+X_test)[seed_index]

   for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
       sentence = seed
       examples_file.write('----- Diversity:' + str(diversity) + '\n')
       examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
       examples_file.write(' '.join(sentence))
       for i in range(50):
           x_pred = np.zeros((1, MIN_SEQ))
           for t, word in enumerate(sentence):
               x_pred[0, t] = word_indices[word]
           preds = model.predict(x_pred, verbose=0)[0]
           next_index = sample(preds, diversity)
           next_word = indices_word[next_index]

           sentence = sentence[1:]
           sentence.append(next_word)

           examples_file.write(" "+next_word)
       examples_file.write('\n')
   examples_file.write('='*80 + '\n')
   examples_file.flush()

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Activation, Bidirectional
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from __future__ import print_function


In [None]:
def get_model():
   print('Build model...')
   model = Sequential()
   model.add(Embedding(input_dim=len(word_indices), output_dim=1024))
   model.add(Bidirectional(LSTM(128)))
   model.add(Dense(len(word_indices)))
   model.add(Activation('softmax'))
   return model

In [None]:
BATCH_SIZE = 1024
MIN_FREQUENCY = 7
MIN_SEQ = 5

In [None]:
model = get_model()
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-" \
           "loss{loss:.4f}-acc{accuracy:.4f}-val_loss{val_loss:.4f}-val_acc{val_accuracy:.4f}" % \
           (len(word_indices), MIN_SEQ, MIN_FREQUENCY) + ".keras"
checkpoint = ModelCheckpoint(file_path, monitor='val_accuracy', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_accuracy', patience=20)
callbacks_list = [checkpoint, print_callback, early_stopping]
examples_file = open('examples.txt', "w")
model.fit(generator(X_train, y_train, BATCH_SIZE),
                   steps_per_epoch=int((len(X_train)+len(X_test))/BATCH_SIZE) + 1,
                   epochs=20,
                   callbacks=callbacks_list,
                   validation_data=generator(X_test, y_train, BATCH_SIZE),
                   validation_steps=int(len(y_train)/BATCH_SIZE) + 1)

Build model...
Epoch 1/20
[1m2311/2311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 65ms/step - accuracy: 0.1585 - loss: 5.5323 - val_accuracy: 0.0488 - val_loss: 7.3060
Epoch 2/20
[1m2311/2311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 64ms/step - accuracy: 0.2313 - loss: 4.4559 - val_accuracy: 0.0429 - val_loss: 7.7660
Epoch 3/20
[1m2311/2311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 64ms/step - accuracy: 0.2648 - loss: 4.1029 - val_accuracy: 0.0399 - val_loss: 8.0809
Epoch 4/20
[1m2311/2311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 63ms/step - accuracy: 0.2901 - loss: 3.8750 - val_accuracy: 0.0375 - val_loss: 8.3107
Epoch 5/20
[1m2311/2311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 63ms/step - accuracy: 0.3108 - loss: 3.7039 - val_accuracy: 0.0367 - val_loss: 8.5351
Epoch 6/20
[1m2311/2311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 64ms/step - accuracy: 0.3288 - loss: 3.5648 - val_accuracy: 0.0351 - 

  preds = np.log(preds) / temperature


[1m2311/2311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 64ms/step - accuracy: 0.4646 - loss: 2.6944 - val_accuracy: 0.0294 - val_loss: 10.4757


<keras.src.callbacks.history.History at 0x7faaf02eacb0>

In [None]:
model.save('lyrics_model.keras')

In [None]:
# prompt: write a line to check how the model is working

model.evaluate(generator(X_test, y_test, BATCH_SIZE), steps=int(len(y_test)/BATCH_SIZE) + 1)




[4.013584613800049, 0.3571517765522003]

In [None]:
# Load the saved model
model = tf.keras.models.load_model('lyrics_model.keras')


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Custom input line
custom_line = "I am just a normal guy"

# This translator is used to remove punctuation from the text
translator = str.maketrans('', '', string.punctuation)

# Tokenize the custom line
custom_line_tokens = custom_line.lower().translate(translator).split()

# Convert tokens to indices
seed = [word_indices[word] for word in custom_line_tokens if word in word_indices]

# Pad the seed to match the expected input length
while len(seed) < MIN_SEQ:
    seed.insert(0, 0)  # Pad with zeros

# Generate lyrics
generated_lyrics = ' '.join(custom_line_tokens)
for i in range(50):  # Generate 50 more words
    x_pred = np.zeros((1, MIN_SEQ+1))
    for t, word_index in enumerate(seed):
        x_pred[0, t] = word_index
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, 0.5)  # Adjust diversity as needed
    next_word = indices_word[next_index]
    generated_lyrics += ' ' + next_word
    seed = seed[1:]
    seed.append(next_index)

print(generated_lyrics)


i am just a normal guy 
 i could just dial the phone now 
 just a girl like you 
 and i like the way i want it 
 yeah yeah yeah 
 i got a ticket for a little while she goes 
 a little bit of pain left me 
 and i cant


## Cells to run

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import string

In [None]:
# This translator is used to remove punctuation from the text
translator = str.maketrans('', '', string.punctuation)

In [None]:
import pickle

In [None]:
data = pickle.load(open('/content/drive/MyDrive/Colab Notebooks/data.pkl', 'rb'))

X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']
word_indices = data['word_indices']
indices_word = data['indices_word']

In [None]:
BATCH_SIZE = 1024
MIN_FREQUENCY = 7
MIN_SEQ = 5

In [None]:
import tensorflow as tf

In [None]:
# Load the saved model
model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/lyrics_model.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
def sample(preds, temperature=1.0):
   # helper function to sample an index from a probability array
   preds = np.asarray(preds).astype('float64')
   preds = np.log(preds) / temperature
   exp_preds = np.exp(preds)
   preds = exp_preds / np.sum(exp_preds)
   probas = np.random.multinomial(1, preds, 1)
   return np.argmax(probas)

In [None]:
# Custom input line
custom_line = "I am just a normal guy"

# This translator is used to remove punctuation from the text
translator = str.maketrans('', '', string.punctuation)

# Tokenize the custom line
custom_line_tokens = custom_line.lower().translate(translator).split()

# Convert tokens to indices
seed = [word_indices[word] for word in custom_line_tokens if word in word_indices]

# Pad the seed to match the expected input length
while len(seed) < MIN_SEQ:
    seed.insert(0, 0)  # Pad with zeros

# Generate lyrics
generated_lyrics = ' '.join(custom_line_tokens)
for i in range(50):  # Generate 50 more words
    x_pred = np.zeros((1, MIN_SEQ+1))
    for t, word_index in enumerate(seed):
        x_pred[0, t] = word_index
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, 0.1)  # Adjust diversity as needed
    next_word = indices_word[next_index]
    generated_lyrics += ' ' + next_word
    seed = seed[1:]
    seed.append(next_index)

print(generated_lyrics)

i am just a normal guy 
 i just want to think about you 
 i know you know i love it 
 i dont want to lose your mind 
 but i dont know why i dont even know you 
 but i know that i aint never been 
 i aint the way it
