# Data Preprocessing

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [4]:
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')

train

Unnamed: 0,Artist,Song,Genre,Language,Lyrics
0,12 stones,world so cold,Rock,en,"It starts with pain, followed by hate\nFueled ..."
1,12 stones,broken,Rock,en,Freedom!\nAlone again again alone\nPatiently w...
2,12 stones,3 leaf loser,Rock,en,"Biting the hand that feeds you, lying to the v..."
3,12 stones,anthem for the underdog,Rock,en,You say you know just who I am\nBut you can't ...
4,12 stones,adrenaline,Rock,en,My heart is beating faster can't control these...
...,...,...,...,...,...
290178,bobby womack,i wish he didn t trust me so much,R&B,en,I'm the best friend he's got I'd give him the ...
290179,bad boys blue,i totally miss you,Pop,en,"Bad Boys Blue ""I Totally Miss You"" I did you w..."
290180,celine dion,sorry for love,Pop,en,Forgive me for the things That I never said to...
290181,dan bern,cure for aids,Indie,en,The day they found a cure for AIDS The day the...


In [None]:
train['Genre'].value_counts()

Rock          121404
Pop           108714
Metal          20291
Jazz           13545
Folk            8644
Indie           8449
R&B             2793
Hip-Hop         2240
Electronic      2213
Country         1890
Name: Genre, dtype: int64

In [5]:
en_data = train[train['Language']=='en']
rock = en_data[en_data['Genre']=='Rock']

rock.shape

(107145, 5)

In [6]:
print(rock.isnull().sum())
rock = rock[rock['Lyrics'].notna()]


Artist      0
Song        1
Genre       0
Language    0
Lyrics      0
dtype: int64


In [7]:
rock_sub = rock.sample(n=500)

rock_sub.shape

(500, 5)

In [14]:
# convert all the lyrics into one string

lyrics_str = ' '.join(rock_sub['Lyrics'].values)
lyrics_str,len(lyrics_str)

 502268)

In [29]:
# Splitting the string into sentences, while converting whole data into lowercase.

corpus = lyrics_str.lower().split("\n")
len(corpus)

16859

In [31]:
corpus[:10]

["i'm dreaming of a white christmas",
 'just like the ones i used to know',
 'where the treetops glisten',
 'and children listen',
 'to hear sleigh bells in the snow',
 '',
 "i'm dreaming of a white christmas",
 'with every christmas card i write',
 'may your days be merry and bright',
 'and may all your christmases be white']

In [32]:
# To make sure no sentence appears twice in our corpus, we use set. Otherwise, it will make the model biased.

corpus = list(set(corpus))
len(corpus)

11282

In [33]:
corpus[:10]

['',
 'now did you think that i was somebody?',
 'join in any reindeer games.',
 "on my own i'll make do with none",
 'then waits for the echo...',
 'you lust for gold with your sharpened knives',
 "it's one on one from las palmas to the golden gate",
 'she starts to go nuts',
 "the barn was buried 'neath a mile of mud",
 'no matter how i try to convince myself,']

In [34]:
# remove empty string

corpus = [string for string in corpus if string !='']
len(corpus)

11281

In [35]:
corpus[:10]

['now did you think that i was somebody?',
 'join in any reindeer games.',
 "on my own i'll make do with none",
 'then waits for the echo...',
 'you lust for gold with your sharpened knives',
 "it's one on one from las palmas to the golden gate",
 'she starts to go nuts',
 "the barn was buried 'neath a mile of mud",
 'no matter how i try to convince myself,',
 '"well, maybe you shouldn\'t call me no more than, baby."']

### Convert lyrics into integers

In [36]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [37]:
total_words

6407

In [40]:
tokenizer.word_index['everyday']

1394

### Input Sequences

Input sequence is the numerical representation of how our words are arranged. 

In [42]:
# create input sequences using list of tokens

inp_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        inp_sequences.append(n_gram_sequence)

In [43]:
inp_sequences[:15]

[[30, 163],
 [30, 163, 2],
 [30, 163, 2, 97],
 [30, 163, 2, 97, 14],
 [30, 163, 2, 97, 14, 3],
 [30, 163, 2, 97, 14, 3, 35],
 [30, 163, 2, 97, 14, 3, 35, 364],
 [1281, 8],
 [1281, 8, 318],
 [1281, 8, 318, 852],
 [1281, 8, 318, 852, 1282],
 [12, 10],
 [12, 10, 248],
 [12, 10, 248, 64],
 [12, 10, 248, 64, 90]]

### Pad Sequences

In [44]:
from keras.preprocessing.sequence import pad_sequences

max_sequence_len = max([len(x) for x in inp_sequences])
inp_sequences = np.array(pad_sequences(inp_sequences,
                       maxlen = max_sequence_len, padding='pre'))

In [45]:
max_sequence_len

279

In [46]:
print(inp_sequences.shape)
inp_sequences

(66742, 279)


array([[  0,   0,   0, ...,   0,  30, 163],
       [  0,   0,   0, ...,  30, 163,   2],
       [  0,   0,   0, ..., 163,   2,  97],
       ...,
       [  0,   0,   0, ...,   0, 656, 217],
       [  0,   0,   0, ..., 656, 217, 217],
       [  0,   0,   0, ..., 217, 217, 217]], dtype=int32)

### Predictions and Labels

we will use our input sequence and use the last word of all sequences as labels for all previous words

In [47]:
predictors, label = inp_sequences[:,:-1],inp_sequences[:,-1]

In [48]:
import keras.utils as ku

label = ku.to_categorical(label, num_classes=total_words)

In [49]:
label.shape

(66742, 6407)

## Build the Model

In [52]:
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model01 = Sequential()
model01.add(Embedding(total_words, 50, input_length=max_sequence_len-1))
# Add a LSTM Layer
model01.add(LSTM(100)) 
model01.add(Dense(total_words/2, activation='relu'))  
# In the last layer, the shape should be equal to the total number of words present in our corpus
model01.add(Dense(total_words, activation='softmax'))
model01.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')  

model01.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 278, 50)           320350    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_2 (Dense)              (None, 3203)              323503    
_________________________________________________________________
dense_3 (Dense)              (None, 6407)              20528028  
Total params: 21,232,281
Trainable params: 21,232,281
Non-trainable params: 0
_________________________________________________________________


In [51]:
his01 = model01.fit(predictors, label, epochs= 10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Save the model

In [53]:
!pip install -q pyyaml h5py

model01.save('rock_split01.h5')

### Lyrics Generation

In [54]:
def make_lyrics(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        
        predicted = model01.predict_classes(token_list, verbose=0)

        output_word = ""
        
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    print(seed_text)

In [78]:
make_lyrics("look at", 50)



look at anderson anderson held teach mend born watching woke venetian napper spot forgotten forgotten lashes lashes mtv mtv souls' unplug unplug 30 romance lifeline air health oak solace row barkin' company hypnotised lessons company company company souls' lexus lexus lexus forgotten microphone wore leather forgotten lashes lashes ris' spot assemble votes


In [81]:
make_lyrics("cold", 100)



cold anderson anderson held quite happy hook quite happy redwood riders lashes lashes i'd hall tasting worse sights lettered treating lettered static static bogus plated canister canister canister hahahahaha charting charting canister canister charting canister canister canister screamin' charting canister canister charting canister canister charting canister canister charting canister canister canister mignon charting charting canister canister charting canister canister canister charting canister canister canister screamin' charting canister canister charting canister canister charting canister canister charting canister canister canister mignon charting charting canister canister charting canister canister canister charting canister canister canister screamin' charting canister canister charting canister canister charting canister canister


In [80]:
make_lyrics("somebody else",100)



somebody else zeppelin lettered lettered beatin' sunlight canister canister canister canister screamin' strictly hahahahaha charting charting canister canister charting canister canister charting canister canister canister screamin' charting canister charting canister canister charting canister canister charting canister canister canister charting canister canister mignon charting canister charting canister canister charting canister canister canister charting canister canister canister mignon charting charting canister canister charting canister canister canister charting canister canister canister screamin' charting canister canister charting canister canister charting canister canister canister charting canister canister mignon charting canister charting canister canister charting canister canister canister charting canister canister canister mignon charting charting canister canister charting


In [82]:
make_lyrics("honey", 100)



honey quite quite happy comfort hook quite quite happy riders supply plus plus capital zeppelin lettered kerosene 20thfloor underground demons underneath it'd ones spat romance lifeline racing fix canister hahahahaha strictly hahahahaha dragged racing might isles patience wonder embarrassed naughty mignon forget forgotten pyres pyres lashes lashes lashes spot spot mtv votes mortal mortal pricked mortal pricked ride' ride' ride' backtalk ride' pearls towel pearls hijiki chin offering offering truth's nuclear nuclear beatin' sunlight romance lifeline racing nuclear business romance king's solace charting way drinking loss solid consciousness tokay elope treat treat eternity screamed travels bonafide impulses pacific juan crystal lessons


In [83]:
make_lyrics("like rain", 100)



like rain anderson anderson held born quite happy redwood riders looking riders solace might world's spot woke stitches lettered stomachlimped victoria vengence vengence victoria venetian stitches street's whats whats might vibes vengence vengence friday protected world's world's venetian strayed forgotten boulevard forgotten lashes lashes mtv spot spot forgotten lashes mtv votes hall solid spot mess woke trusty following trusty ris' banshee's banshee's won woke hall canister waste spot forgotten forgotten lashes lashes mtv mtv votes hall solid spot spot mess woke trusty ulysses it'd travels kings consciousness why'know treat everybody's king's won wraps fall treating canister canister treating lettered canister canister screamin'


In [84]:
make_lyrics("do you", 100)



do you nail anderson spot lettered spot ink mortal mortal grade grade ice strayed strayed strayed penthouse forgotten confess forgotten nicer folded mignon stumbled mignon solace pussy pyres pyres sounding beatin' everybodys backtalk canister boom boom it'd riders solid riders solace row treating thieves thieves spat spot lettered vallance vallance usually means bmi chalk flip canister victoria treating supressing ink screamin' victoria screamin' lettered lashes beatin' means 20thfloor criticize weeps making zero forgotten lashes mtv mtv votes hall solid spot spot mess woke trusty trusty mend romance ink during steals face steals scenes doin' psycho shouting savage left nuclear movin sights betrayals


## Build the Model

In [66]:
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model02 = Sequential()
model02.add(Embedding(total_words, 50, input_length=max_sequence_len-1))
# Add a LSTM Layer
model02.add(LSTM(100)) 
model02.add(Dense(total_words/2, activation='relu'))  
# In the last layer, the shape should be equal to the total number of words present in our corpus
model02.add(Dense(total_words, activation='softmax'))
model02.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')  

model02.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 278, 50)           320350    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_4 (Dense)              (None, 3203)              323503    
_________________________________________________________________
dense_5 (Dense)              (None, 6407)              20528028  
Total params: 21,232,281
Trainable params: 21,232,281
Non-trainable params: 0
_________________________________________________________________


In [67]:
from keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='loss', mode='min', verbose=1)
his02 = model02.fit(predictors, label, epochs= 30, callbacks=[es])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Save the model

In [68]:
!pip install -q pyyaml h5py

model02.save('rock_es01.h5')

### Lyrics Generation

In [69]:
def rock2_lyrics(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        
        predicted = model02.predict_classes(token_list, verbose=0)

        output_word = ""
        
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    print(seed_text)

In [70]:
rock2_lyrics("look at", 7)



look at us but do not touch in your


In [71]:
rock2_lyrics("look at", 50)



look at us but do not touch in your room in my hand nothing seems to be mine you know where i'm at a different of you and you're been a queen but your things are like a tree of hate is the tree gone so many different pathways meet you down


In [72]:
rock2_lyrics("honey", 7)



honey you may now honey you may spend


In [73]:
rock2_lyrics("honey", 100)



honey you may now honey you may spend up all my money tonight lord i know that i'm in desperation i'm in the middle of a bad love storm and it's bad as it can be i look around all i can see is water coming down over me i've been down one time and i've been down two times but right now i'm drowning drowning in the sea of love sea of love but right now i'm just drowning drowning in the sea of love a sea of love sea of love i hate my life for the air i


In [74]:
rock2_lyrics("go out", 50)



go out times and said there's a last thing you came to be alone as night later on mine it was so oh baby please i'll go on you can't go on without you said i'll try don't know why what to do without you can't move on can't move on it'd


In [75]:
rock2_lyrics("somebody else",50)



somebody else to stick a dead body inside of me oh oh she was at her feet and she was looking down they let lisa go blind but everyone she knew thought she was beautiful only slightly mental beautiful a bit temperamental beautiful only slightly mental beautiful she thought it would be


In [76]:
rock2_lyrics("act like", 100)



act like a war machine when he calls her again throw it out and her baby at a blaze of night baby i depended and just and i'll be myself if i can't happy happy as grown ups let's pretend that we're kids once again so put on your high heel slippers like your mother would never ever let you wear let's pretend our little girl is your old ragged doll with pretty long blond curly hair a love like ours so sweet and true should never never ever have to end if we can't be happy as grown ups let's pretend


In [77]:
rock2_lyrics("feel like", 100)



feel like a fool yeah like a dancing fool yeah my stain fellows beat she was someone shoes in her free government land she thought for you will be around some more than just in can't dance out of worlds he'll hang around everyone and make it through it again he was to school where a perfumed candle glowed guru screamin' through the night she was very last night to be very young alright baby just my shootin' and i'll be a blue friend and it just up my name set them throttle if only it really before the train grow on


## More Layers with Early Stopping

In [None]:
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model05 = Sequential()
model05.add(Embedding(total_words, 50, input_length=max_sequence_len-1))
# Add a LSTM Layer
model05.add(LSTM(128, return_sequences=True)) 
# A dropout layer for regularisation
model05.add(Dropout(0.2))
# Add another LSTM Layer
model05.add(LSTM(100)) 
model05.add(Dense(total_words/2, activation='relu'))  
# In the last layer, the shape should be equal to the total number of words present in our corpus
model05.add(Dense(total_words, activation='softmax'))
model05.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')  

model05.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 499, 50)           318900    
_________________________________________________________________
lstm_5 (LSTM)                (None, 499, 128)          91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 499, 128)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               91600     
_________________________________________________________________
dense_8 (Dense)              (None, 3189)              322089    
_________________________________________________________________
dense_9 (Dense)              (None, 6378)              20345820  
Total params: 21,170,057
Trainable params: 21,170,057
Non-trainable params: 0
__________________________________________

In [None]:
from keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='loss', mode='min', verbose=1)
his05 = model05.fit(predictors, label, epochs= 10, callbacks=[es])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Save the model

In [None]:
!pip install -q pyyaml h5py

model05.save('rock_2layer.h5')

### Lyrics Generation

In [None]:
def multilayer_lyrics(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        
        predicted = model05.predict_classes(token_list, verbose=0)

        output_word = ""
        
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    print(seed_text)

In [None]:
multilayer_lyrics("look at", 50)



look at the moonlight and i am the same and i am a slave i dont know what to do i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am


In [None]:
multilayer_lyrics("hold on the feeling", 50)



hold on the feeling and the stars is turning round and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on and on


In [None]:
multilayer_lyrics("act like summer",50)



act like summer of the sky and i know i am the luckiest i am the luckiest the best with a fuck you aint no love in the heart of my life and i am the war i can be a vicious young man oh i am a vicious young man oh i


In [None]:
multilayer_lyrics("walk like rain",50)



walk like rain i am a slave i dont know what to do i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am i am


# Statistical Model

In [None]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

texts = []
for s in corpus:
    texts.append(word_tokenize(s))
print(texts[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['its', 'a', 'simple', 'song', 'for', 'simple', 'feeling', 'you', 'see', 'the', 'moon', 'and', 'watch', 'it', 'rise', 'across', 'the', 'continent', 'the', 'night', 'bird', 'sings', 'and', 'somewhere', 'someone', 'hears', 'its', 'cry', 'so', 'disillusioned', 'keep', 'your', 'head', 'down', 'if', 'you', 'do', 'theyll', 'never', 'know', 'youll', 'have', 'no', 'answers', 'to', 'their', 'questions', 'and', 'they', 'will', 'have', 'to', 'let', 'you', 'go', 'and', 'disenfranchised', 'revolution', 'theyll', 'take', 'away', 'by', 'right', 'whats', 'yours', 'and', 'make', 'you', 'martyrs', 'of', 'your', 'own', 'cause', 'when', 'they', 'dont', 'know', 'what', 'cause', 'its', 'for', 'and', 'all', 'deserted', 'stand', 'alerted', 'theyll', 'love', 'you', 'when', 'youre', 'all', 'alone', 'but', 'you', 'find', 'a', 'red', 'rose', 'in', 'the', 'morning', 'light', 'you', 'wait', 'the', 'night', 'and

In [None]:
from nltk.util import ngrams

print(list(ngrams(texts[0], n=1)))
print(list(ngrams(texts[0], n=2)))
print(list(ngrams(texts[0], n=3)))

[('its',), ('a',), ('simple',), ('song',), ('for',), ('simple',), ('feeling',), ('you',), ('see',), ('the',), ('moon',), ('and',), ('watch',), ('it',), ('rise',), ('across',), ('the',), ('continent',), ('the',), ('night',), ('bird',), ('sings',), ('and',), ('somewhere',), ('someone',), ('hears',), ('its',), ('cry',), ('so',), ('disillusioned',), ('keep',), ('your',), ('head',), ('down',), ('if',), ('you',), ('do',), ('theyll',), ('never',), ('know',), ('youll',), ('have',), ('no',), ('answers',), ('to',), ('their',), ('questions',), ('and',), ('they',), ('will',), ('have',), ('to',), ('let',), ('you',), ('go',), ('and',), ('disenfranchised',), ('revolution',), ('theyll',), ('take',), ('away',), ('by',), ('right',), ('whats',), ('yours',), ('and',), ('make',), ('you',), ('martyrs',), ('of',), ('your',), ('own',), ('cause',), ('when',), ('they',), ('dont',), ('know',), ('what',), ('cause',), ('its',), ('for',), ('and',), ('all',), ('deserted',), ('stand',), ('alerted',), ('theyll',), ('l

In [None]:
from nltk.lm.preprocessing import padded_everygram_pipeline

train, vocab = padded_everygram_pipeline(3, texts)

In [None]:
from nltk.lm import MLE

ng_model = MLE(3) 
ng_model.fit(train, vocab)


In [None]:
print(ng_model.counts['the'])
print(ng_model.counts['are'])
print(ng_model.counts['love'])
print(ng_model.counts['oh'])

print(ng_model.score('the'))
print(ng_model.score('are'))
print(ng_model.score('love'))
print(ng_model.score('oh'))

print(ng_model.counts[['they']]['are'])
print(ng_model.counts[['are']]['they'])

3613
302
534
369
0.03706857635326466
0.003098452825542742
0.005478721221323922
0.003785857922600238
13
2


In [None]:
text_list = ng_model.generate(15, random_seed=2)
print(' '.join(word for word in text_list))

you who broke my heart some people do not mess with mister inbetween no dont


In [None]:
text_list = ng_model.generate(50, random_seed=2)
print(' '.join(word for word in text_list))

you who broke my heart some people do not mess with mister inbetween no dont you worry baby im buying if youre my daughter youre my love ill get it back better late than never i saw you saying that you take my woman where she wants to do it


In [None]:
text_list = ng_model.generate(50, random_seed=1)
print(' '.join(word for word in text_list))

boy who was hanging his head low more trophies and ideas to follow footprints in the darkest place that cow wrote that im doing it all a big joke whatever it is like porcelain yeah </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>
