<div align="left">
  <h1>LSTM</h1> <a name="0-bullet"></a>
</div>


---

# Text Generation
Adapted from code at https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

In [None]:
'''Example script to generate text from sample text.

At least 20 epochs are required before the generated text
starts sounding coherent.

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from keras.models import Sequential
from keras.callbacks import LambdaCallback, EarlyStopping
from keras.layers import Dense, Activation, Embedding, Dropout, Bidirectional
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import random
import sys
import re

## Data loading

In [None]:
# Mount Google Drive at /content/drive so that files stored under it can be read.
# NOTE(review): google.colab is importable only inside Colab — presumably this notebook is meant to run there.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the lyrics table from Drive and keep only the rows whose 'Idiom'
# column is exactly 'ENGLISH'.
lyrics = (
    pd.read_csv('/content/drive/MyDrive/Text_Analytics/Data/lyrics-data.csv')
    .loc[lambda frame: frame['Idiom'] == 'ENGLISH']
)

In [None]:
# Read the artist table and keep only the rows whose Genre is 'Rock' and
# whose Popularity is strictly greater than 5.
artists = (
    pd.read_csv('/content/drive/MyDrive/Text_Analytics/Data/artists-data.csv')
    .loc[lambda frame: frame['Genre'].isin(['Rock']) & (frame['Popularity'] > 5)]
)

In [None]:
# Attach the artist name and genre to every lyric row by matching
# lyrics.ALink against artists.Link; the inner join discards lyric rows
# whose artist was filtered out above.
df = pd.merge(
    lyrics,
    artists[['Artist', 'Genre', 'Link']],
    how='inner',
    left_on='ALink',
    right_on='Link',
)

In [None]:
# Discard the link and language columns; they are not used by the text-generation task.
df = df.drop(['ALink', 'SLink', 'Idiom', 'Link'], axis='columns')

In [None]:
# Keep only the lyrics with fewer than 350 space-separated tokens (this is a
# word count, not a character count); the intent is to simplify training.
# The .str accessor yields NaN for a missing lyric and NaN < 350 is False, so
# such rows are dropped here instead of raising AttributeError on `.split`.

df = df[df['Lyric'].str.split(' ').str.len() < 350]

In [None]:
df.head()

Unnamed: 0,SName,Lyric,Artist,Genre
0,What's Up,Twenty-five years and my life is still. Trying...,4 Non Blondes,Rock
1,Spaceman,Starry night bring me down. Till I realize the...,4 Non Blondes,Rock
2,Pleasantly Blue,Every time you wake in the mornin'. And you st...,4 Non Blondes,Rock
3,Train,What ya gonna do child. When your thoughts are...,4 Non Blondes,Rock
4,Calling All The People,"How can you tell, when your wellness is not we...",4 Non Blondes,Rock


In [None]:
# Wrap every lyric in explicit start/end markers so that song boundaries
# remain visible once all lyrics are concatenated into one training corpus.
SOT, EOT = "<SOT>", "<EOT>"   # start-of-text / end-of-text markers
df_lyrics = df['Lyric'].map(lambda song: SOT + song + EOT)

In [None]:
# Show the first wrapped lyric. Positional access is used because the index
# still carries the pre-filter row labels, so label 0 is not guaranteed to exist.
df_lyrics.iloc[0]

"<SOT>Twenty-five years and my life is still. Trying to get up that great big hill of hope. For a destination.. I realized quickly when I knew I should. That the world was made up of this. brotherhood of man. For whatever that means. And so I cry sometimes. When I'm lying in bed Just to get it all out. What's in my head. And I, I am feeling a little peculiar.. And so I wake up in the morning. And I step outside. And I take a deep breath and I get real high. And I scream from the top of my lungs. What's going on?. And I say: Hey! yeah yeaaah, Hey yeah yea. I said hey, what's going on?. And I say: Hey! yeah yeaaah, Hey yeah yea. I said hey, what's going on?. ooh, ooh ooh. and I try, oh my god do I try. I try all the time, in this institution. And I pray, oh my god do I pray. I pray every single day. For a revolution.. And so I cry sometimes. When I'm lying bed. Just to get it all out. What's in my head. And I, I am feeling a little peculiar. And so I wake up in the morning. And I step ou

In [None]:
# Concatenate all wrapped songs into a single corpus string, separating
# consecutive songs with three newline characters so the model can pick up
# the song structure.
text = ('\n' * 3).join(df_lyrics.tolist())

## Text vectorization

In [None]:
# Vocabulary: every distinct character of the corpus in sorted order, with
# lookup tables in both directions (character -> index, index -> character).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {symbol: index for index, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Slide a window of `maxlen` characters over the corpus in strides of `step`:
# each window is one training input and the character right after it is the target.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]

total chars: 168


In [None]:
# Keep only the first 50000 (window, next character) pairs so that training
# is fast enough to run during the lesson.
sentences, next_chars = sentences[:50000], next_chars[:50000]

In [None]:
sentences[3]

'ty-five years and my life is still. Tryi'

In [None]:
next_chars[3]

'n'

In [None]:
print('training sequences:', len(sentences))
print('vectorization...')
# One-hot encode the data:
#   x[i, t, c] == 1  when character t of window i has vocabulary index c
#   y[i, c]    == 1  when the character following window i has vocabulary index c
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used to keep the same
# dtype on every NumPy version.
# NOTE(review): the arrays hold only 0/1, so a one-byte dtype would cut memory
# use considerably — left as-is to keep the displayed arrays unchanged.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=int)
y = np.zeros((len(sentences), len(chars)), dtype=int)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
print('done.')

training sequences: 50000
vectorization...
done.


In [None]:
x[3],y[3]

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

## Neural Network

In [None]:
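# Disabled alternative kept for reference: a single unidirectional LSTM (256 units) with dropout 0.2.
# model = Sequential()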
# model.add(LSTM(256, input_shape=(maxlen, len(chars))))
# model.add(Dropout(0.2))
# model.add(Dense(len(chars), activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()

In [None]:
# Network used for training: a bidirectional LSTM (256 units per direction)
# reads the one-hot window of shape (maxlen, len(chars)); dropout and a
# softmax layer over the vocabulary then predict the next character.
model = Sequential([
    Bidirectional(LSTM(256), input_shape=(maxlen, len(chars))),
    Dropout(0.1),
    Dense(len(chars), activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 512)               870400    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 168)               86184     
Total params: 956,584
Trainable params: 956,584
Non-trainable params: 0
_________________________________________________________________


## Functions to generate text

In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array reweighted by a temperature.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example the
            softmax output of the model for one input window).
        temperature: Values below 1 sharpen the distribution, values above 1
            flatten it. A value <= 0 selects the most probable index
            deterministically.

    Returns:
        The sampled index as a NumPy integer.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Dividing the log-probabilities by zero would produce NaNs and make
        # np.random.multinomial raise; the limit temperature -> 0 is arg-max.
        return np.argmax(preds)
    # log(0) is -inf and exp(-inf) is 0, which is exactly the weight a
    # zero-probability entry should get, so only the warning is silenced.
    with np.errstate(divide='ignore'):
        preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    # Renormalise in float64 so the probabilities passed on sum to 1.
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, _):
    """Print 400 generated characters for each of four sampling temperatures.

    Intended to be invoked at the end of each epoch. Reads the notebook
    globals `text`, `maxlen`, `chars`, `char_indices`, `indices_char` and
    `model`.

    Args:
        epoch: Epoch number, used only in the printed header.
        _: Ignored second argument.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # One random window of the corpus is drawn and reused as the seed for
    # every temperature.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('----- temperature:', temperature)
        print('Seed:')
        print(seed)
        print('------')

        window = seed
        pieces = []
        for _step in range(400):
            # One-hot encode the current window as a batch of one sample.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for position, symbol in enumerate(window):
                x_pred[0, position, char_indices[symbol]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, temperature)]

            # Slide the window one character forward.
            window = window[1:] + next_char
            pieces.append(next_char)

        print('Generated text:')
        print(''.join(pieces))
        print('------')
        print()

## Training the model

In [None]:
# Train for 10 epochs in mini-batches of 64 with validation_split=0.1.
# No callback is passed, so no text is generated between epochs.
model.fit(
    x=x,
    y=y,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5af944f350>

## Comparing ends of the lyrics

In [None]:
# Test text generation at several sampling temperatures, always starting from
# the same randomly chosen window of the corpus.
start_index = random.randint(0, len(text) - maxlen - 1)
for temperature in [0.2, 0.5, 0.8, 1.0, 1.2]:
    print('----- temperature:', temperature)

    sentence = text[start_index: start_index + maxlen]
    print('Seed:')
    print(sentence)
    print('------')

    sampled = []
    for i in range(400):
        # One-hot encode the current window as a batch of one sample.
        x_pred = np.zeros((1, maxlen, len(chars)))
        for position, symbol in enumerate(sentence):
            x_pred[0, position, char_indices[symbol]] = 1.

        probabilities = model.predict(x_pred, verbose=0)[0]
        next_char = indices_char[sample(probabilities, temperature)]

        # Slide the window one character forward.
        sampled.append(next_char)
        sentence = sentence[1:] + next_char

    # One output line per '.'-terminated fragment.
    generated = ''.join(sampled).split('.')

    print('Generated text:')
    print('\n'.join(generated))
    print('------')
    print()

----- temperature: 0.2
Seed:
ut if there's a pill to help me forget. 
------
Generated text:
She to down to roll shack
 You know I know that's got the skind
 And I way I sake you roll
 I wan way I gonna baby
 And I heads of the good to be
 I got you do I do I do lood down down war hog on my baby
 And I got you wanna good time
 Baby blease boogie boogie
 And I take you got to reall
 Give it up
 I to she's gonna get it up
 Give it up
 All screwed up
 It's a broin shake, brain shake, brain s
------

----- temperature: 0.5
Seed:
ut if there's a pill to help me forget. 
------
Generated text:
Ceall the sard the bous
 And I take you nothing dorn my spous
 If you want it up the good on me
 Hear the por chame to me
 I got a tound ats me the go down
 Thook your man't go
 She said hey hell
 I got a might to show
 I wann ay songin' all screttoute
 I want up it to balky someth me soig fire
 Thore you want stack for me
 I'm gonna gove 't up the fight to hell me way to ready
 Are you ready for 
----

## Songs similarities

In [None]:
# Generate 400 characters from a seed that is the title of a song in the dataset.

generated = ''
sentence = "You've Got to Hide Your Love Away"
print('Seed:')
print(sentence)
print('------')

# The model was trained only on windows of exactly `maxlen` characters. A
# shorter seed would leave the trailing timesteps all-zero on every step (the
# sliding window never grows), and a longer one would overflow x_pred. Keep
# the last `maxlen` characters and left-pad with spaces so the seed sits at
# the end of the window, right before the character to predict.
sentence = sentence[-maxlen:].rjust(maxlen)

for i in range(400):
    # One-hot encode the current window as a batch of one sample.
    x_pred = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        x_pred[0, t, char_indices[char]] = 1.

    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, 1.0)
    next_char = indices_char[next_index]

    # Slide the window one character forward.
    sentence = sentence[1:] + next_char
    generated += next_char
# One output line per '.'-terminated fragment.
generated = generated.split('.')

print('Generated text:')
print('\n'.join(generated))
print('------')
print()

Seed:
You've Got to Hide Your Love Away
------
Generated text:
ehHhi--r rr Ul sy
,Eh(hie
eeee m,ss
e
s( 
<ejcyme sr
ra!uiK(aatsaaeeeeueehhseYttaoeaeiiyietWuhirtxwdm
i

tihlh
a-
S hieea yii r
erStnm
 bii 
o
< Ri-i[riyouaiiii-oaoaooowu[oheheiernitioaiieoauaooowLwh sis)uehhhu iieoa 
iouiio"
o c
  rl 
l,keTI  aeshraimihotuealiouioaottse
Mck  Hnrrruuuuoh
 essuu sajywisteOkml Iohhes hbiewiouoaooueu
!]h epeycehmllh<eehehy  ?ihh o
eh rr-!ath[
, Ksr eideeeuwuhht--iumh
------



In [None]:
# Find the three original lyrics most similar to the generated text, using TF-IDF vectors.

# `generated` may be the list of '.'-separated fragments left by the generation
# cell; rebuild a single document so the whole generated text is the query.
generated_text = generated if isinstance(generated, str) else '.'.join(generated)

# Row 0 is the generated text, rows 1..N are the original lyrics in `df` order.
# A new name is used so the `lyrics` DataFrame loaded earlier is not overwritten.
corpus = [generated_text] + df['Lyric'].tolist()

tfidf = TfidfVectorizer(stop_words="english").fit_transform(corpus)  # TF-IDF vectorization

# Only the similarities of row 0 against every document are needed, so the
# full N x N similarity matrix is not computed.
pairwise_similarity = (tfidf[0] @ tfidf.T).toarray().ravel()

pairwise_similarity[0] = -1  # mask the similarity of the generated text to itself

# Indices of the three highest scores, best first. The self-match is already
# masked, so nothing more has to be dropped.
most_similar_idxs = pairwise_similarity.argsort()[::-1][:3]

# Corpus row k corresponds to df row k - 1 because of the generated text at row 0.
output = [', '.join(df.iloc[most_similar_idxs - 1].SName),  # titles of the three most similar songs
          *pairwise_similarity[most_similar_idxs],          # their three scores
          most_similar_idxs - 1]

print("similar to: {}\n- scores: {}, {}, {}".format(*output))

similar to: Teacher's Pet, Suzi (Wants Her All Day What?), Sunrise
- scores: 0.0, 0.0, 0.0


In [None]:
# Generate 400 characters from a seed that is the title of a song in the dataset.

generated = ''
sentence = "The Wait"
print('Seed:')
print(sentence)
print('------')

# The model was trained only on windows of exactly `maxlen` characters. A
# shorter seed would leave the trailing timesteps all-zero on every step (the
# sliding window never grows), and a longer one would overflow x_pred. Keep
# the last `maxlen` characters and left-pad with spaces so the seed sits at
# the end of the window, right before the character to predict.
sentence = sentence[-maxlen:].rjust(maxlen)

for i in range(400):
    # One-hot encode the current window as a batch of one sample.
    x_pred = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        x_pred[0, t, char_indices[char]] = 1.

    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, 1.0)
    next_char = indices_char[next_index]

    # Slide the window one character forward.
    sentence = sentence[1:] + next_char
    generated += next_char
# One output line per '.'-terminated fragment.
generated = generated.split('.')

print('Generated text:')
print('\n'.join(generated))
print('------')
print()

Seed:
The Wait
------
Generated text:
dyihu]
ohhIcothiBnhTstiuhthmhlmh<hr!htttol-tcooataaSNhhuhhuldiha
icncl<bwbruS
 Sl!B
 feit Suhoo,thehotleBntouw-[uh
 uauihhhc?sculfdioirhmxh
ea,eehehhBchmhS<ahi aeiBda"houh-hhoaheshmehluthtuhew <DiecdaBn heiuwtomhavtlahhhuehu 

9oeTiocwToooihiolTouawkeh ho-CShi]FcScl o'o
!waW
m,]hh

h
auwhhiuot'waaaa ytonhlwa czth-tPwmhiuhul
hwOatwwwhholo-lh hhkhtoaHw
eShh
ohcwuw<hihhWlhowaob-z
ytui,lhhtmhhhl
5srn)
------



In [None]:
# Find the three original lyrics most similar to the generated text, using TF-IDF vectors.

# `generated` may be the list of '.'-separated fragments left by the generation
# cell; rebuild a single document so the whole generated text is the query.
generated_text = generated if isinstance(generated, str) else '.'.join(generated)

# Row 0 is the generated text, rows 1..N are the original lyrics in `df` order.
# A new name is used so the `lyrics` DataFrame loaded earlier is not overwritten.
corpus = [generated_text] + df['Lyric'].tolist()

tfidf = TfidfVectorizer(stop_words="english").fit_transform(corpus)  # TF-IDF vectorization

# Only the similarities of row 0 against every document are needed, so the
# full N x N similarity matrix is not computed.
pairwise_similarity = (tfidf[0] @ tfidf.T).toarray().ravel()

pairwise_similarity[0] = -1  # mask the similarity of the generated text to itself

# Indices of the three highest scores, best first. The self-match is already
# masked, so nothing more has to be dropped.
most_similar_idxs = pairwise_similarity.argsort()[::-1][:3]

# Corpus row k corresponds to df row k - 1 because of the generated text at row 0.
output = [', '.join(df.iloc[most_similar_idxs - 1].SName),  # titles of the three most similar songs
          *pairwise_similarity[most_similar_idxs],          # their three scores
          most_similar_idxs - 1]

print("similar to: {}\n- scores: {}, {}, {}".format(*output))

similar to: Monica, Star, Song For Love
- scores: 0.0, 0.0, 0.0


## Generating text with different languages

Here we test the model with seed sentences written in other languages, namely Italian, Spanish and French.

In [None]:
# Generate 400 characters from an Italian seed line (40 characters long).
sentence = "Siamo fuori di testa, ma diversi da loro"
print('Seed:')
print(sentence)
print('------')

sampled = []
for i in range(400):
    # One-hot encode the current window as a batch of one sample.
    x_pred = np.zeros((1, maxlen, len(chars)))
    for position, symbol in enumerate(sentence):
        x_pred[0, position, char_indices[symbol]] = 1.

    probabilities = model.predict(x_pred, verbose=0)[0]
    next_char = indices_char[sample(probabilities, 1.0)]

    # Slide the window one character forward.
    sampled.append(next_char)
    sentence = sentence[1:] + next_char

# One output line per '.'-terminated fragment.
generated = ''.join(sampled).split('.')

print('Generated text:')
print('\n'.join(generated))
print('------')
print()

Seed:
Siamo fuori di testa, ma diversi da loro
------
Generated text:

 She llow I hall you natht in the way
 [Cow I'm a ready dirth beed ir tise
 Lot's rond We'll great fertevel on meerany
 For my homes
 for there amboting ready
 Bor my held os ferlot ald heave a but elack
 For my bad wigh a flaprid why

 Wo baidy oft of atw that tarn dan'
 I'm be tameem tel that you not
 Sur tam
 ald the poold reall for the plite
 Hor I wit a lood tomd, wat's 'Cause I't feel whe o
------



In [None]:
# Generate 400 characters from a Spanish seed line (40 characters long).
sentence = "Sí, sabes que ya llevo un rato mirándote"
print('Seed:')
print(sentence)
print('------')

sampled = []
for i in range(400):
    # One-hot encode the current window as a batch of one sample.
    x_pred = np.zeros((1, maxlen, len(chars)))
    for position, symbol in enumerate(sentence):
        x_pred[0, position, char_indices[symbol]] = 1.

    probabilities = model.predict(x_pred, verbose=0)[0]
    next_char = indices_char[sample(probabilities, 1.0)]

    # Slide the window one character forward.
    sampled.append(next_char)
    sentence = sentence[1:] + next_char

# One output line per '.'-terminated fragment.
generated = ''.join(sampled).split('.')

print('Generated text:')
print('\n'.join(generated))
print('------')
print()

Seed:
Sí, sabes que ya llevo un rato mirándote
------
Generated text:
?
 A rock 't' rall

 Never might ic ut
 So down, dy my hey strow
 Anl oo cid to for off ay the gaig
 Coull me back in byoude
 Soven'm in the sky
 Blow Yeah you'll bee's rain my plare
 Ah yeahiye
 Shave ballshake, brain shake
 beais heall-me
 The e7win' bet ap¦stanmanfy starl
 I'm Shot bead whor you thing rock
 'n' I'm gonta down on the plowr<EOT>


<SOT>Hell ichere dee ssrock
 Reahe hone won't for
------



In [None]:
# Generate 400 characters from a French seed line.
generated = ''
sentence = "La vie c'est plus marrant"
print('Seed:')
print(sentence)
print('------')

# The model was trained only on windows of exactly `maxlen` characters. A
# shorter seed would leave the trailing timesteps all-zero on every step (the
# sliding window never grows), and a longer one would overflow x_pred. Keep
# the last `maxlen` characters and left-pad with spaces so the seed sits at
# the end of the window, right before the character to predict.
sentence = sentence[-maxlen:].rjust(maxlen)

for i in range(400):
    # One-hot encode the current window as a batch of one sample.
    x_pred = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(sentence):
        x_pred[0, t, char_indices[char]] = 1.

    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, 1.0)
    next_char = indices_char[next_index]

    # Slide the window one character forward.
    sentence = sentence[1:] + next_char
    generated += next_char
# One output line per '.'-terminated fragment.
generated = generated.split('.')

print('Generated text:')
print('\n'.join(generated))
print('------')
print()

Seed:
La vie c'est plus marrant
------
Generated text:
hii)ciSteaSChoieiB
 
hhlwAiciiomehheecrorniiwtwiethcoaRi)OOoh-hCaoa woaoioi
iiihisihu'h<uh-aiwBigyeehhhhhstlheelbyyhluheltoycitimci<u oOcwoowhblhcoooaohetjni(nochninHichii
ieilasuveii uuautaoirycOu[hehhh
ih
ahnpnuih
ha<< hf2kNhchhs(iahee
a>-
)shr<gYilhhc°tiiiiehhseOsWhsieHhhaleliir
rweic
eah huooeu
aarnoocrIEseeehciCDsoiooeewhhhmu
fhfslnntnniriirahOm
hsuhhhoOfzci
hwtiiiior,e hhi(viwtntii
9uhS
hmdk
------

