In [1]:
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random 
import sys
import io
import requests
import re
import json

import nltk
from collections import Counter, defaultdict
from random import randint

Using TensorFlow backend.


In [2]:
text = {};
#devtext = {}; development
testtext = {};
nameList = ["Drake","Queen","Iron Maiden","Eminem"];

# Opening JSON file 
f = open('data/lyricdataii.json',) 

# returns JSON object as  
# a dictionary 

data = json.load(f)
'''for i in range(int(len(data[name])*6/10)):
    text += data[name][i];
    
for i in range(int(len(data[name])*6/10)+1,len(data[name])-1):
    devtext += data[name][i];'''

for name in nameList:
    text[name] = "";
    for i in range(len(data[name])-1):
        text[name] += data[name][i];

    testtext[name] = data[name][len(data[name])-1]
    
f.close()

N-Gram Model Experiment

In [3]:
# cleans the given string and returns it as a list of words, lower-cased
def sent_transform(sent_string):
    return nltk.word_tokenize(sent_string.lower())

# words is a list of words
def make_ngram_tuples(words, n):
    for i in range(0,n):
        words.insert(0, '<s>')
    words.append('</s>')
    result = []
    x = range(n, len(words))
    for i in x:
        context = tuple(words[i-n+1:i])
        sequence = (context,) + (words[i],)
        result.append(sequence)
    return result

class RandomModel(object):
    def __init__(self, song_list, name):
        self.name = name
        self.d = Counter() # keep track of # of words
        for song in song_list:
            sentences = nltk.sent_tokenize(song)
            for sentence in sentences:
                list_words = sent_transform(sentence)
                for word in list_words:
                    self.d[word] += 1
        

class BigramModel(object):
    
    def __init__(self, inputfile, name):
        self.name = name
        self.d = Counter() # keep track of # of words
        
        # iterating through the text file and incrementing the count for the words seen
        f = open('data/' + inputfile, 'r')
        text = f.read()
        sentences = nltk.sent_tokenize(text)
        for line in sentences:
            if(line != '\n'):
                list_words = sent_transform(line)
                ngram_sequence = make_ngram_tuples(list_words, 2)
                for e in ngram_sequence:
                    self.d["".join(e[0])] += 1
                    self.d[e[1]] += 1
        
        self.unk_set = set() # set containing words that appear only once
        self.dsum = 0 # the size of the vocabulary
        
        for word in self.d:
            if self.d[word] == 1:
                self.unk_set.add(word)
            else:
                self.dsum += 1
        
        self.d_bigram = defaultdict(lambda: defaultdict(lambda: 0))
        f = open('data/' + inputfile, 'r')
        text = f.read()
        sentences = nltk.sent_tokenize(text)
        for line in sentences:
            list_words = sent_transform(line) 
            ngram_sequence = make_ngram_tuples(list_words, 2)
            for e in ngram_sequence:
                context = "".join(e[0]) if "".join(e[0]) not in self.unk_set else "<UNK>"
                word = e[1] if e[1] not in self.unk_set else "<UNK>"
                self.d_bigram[context][word] += 1
    
    def logprob(self, context, word): 
        if context in self.unk_set:
            context = "<UNK>"
        if word in self.unk_set:
            word = "<UNK>"
        return np.log2(self.d_bigram[context][word]+1)-np.log2(self.d[context]+self.dsum)
    
    def get_ppl(self, testfile):
        log_corpus_prob = 0
        len_corpus = 0
        with open('data/' + testfile) as f:
            for line in f:
                if(line != '\n'):
                    list_words = sent_transform(line)
                    ngram_sequence = make_ngram_tuples(list_words, 2)
                    for e in ngram_sequence:
                        context = "".join(e[0]) if "".join(e[0]) not in self.unk_set else "<UNK>"
                        word = e[1] if e[1] not in self.unk_set else "<UNK>"
                        log_corpus_prob += self.logprob(context, word)
                    len_corpus += len(list_words)
        return 2 ** (-1 * log_corpus_prob/len_corpus)

    
    
# generates n sentences given a random model
def random_text_generator(randomlm, n):
    f = open("data/" + randomlm.name + "-random-generated-song.txt", "w")
    word_count = randomlm.d
    for i in range(0, n):
        cur_sentence_len = 0
        sentence = ""
        # generates sentences of length 10
        sorted_words = sorted(word_count, key=word_count.get)
        len_sentence = random.randint(8, 15)
        while cur_sentence_len <= 9:
            sentence += random.choice(sorted_words[-100:]) + " "
            cur_sentence_len += 1
        f.write(sentence)
        f.write("\n")

# generates n sentences based off of the bigram model
def text_generator(bigramlm, n):
    f = open("data/" + bigramlm.name + "-bigram-generated-song.txt", "w")
    bigram = bigramlm.d_bigram
    for i in range(0, n):
        len_sentence = 0
        cur_word = "<s>"
        sentence = ""
        while len_sentence <= 13:
            sorted_d = sorted((value, key) for (key, value) in bigram[cur_word].items())
            #print(sorted_d)
            if(len(sorted_d) >= 5):
                next_word = sorted_d[len(sorted_d)-randint(1, 5)][1]
            else:
                next_word = sorted_d[randint(0, len(sorted_d)-1)][1]
            if next_word == "</s>":
                break
            cur_word = next_word
            sentence += next_word + " "
            len_sentence += 1
        f.write(sentence)
        f.write("\n")

        
drake_model = RandomModel(data['Drake'], 'drake')
random_text_generator(drake_model, 10)
        
drake_bg_model = BigramModel('drake-songs.txt', 'drake')
text_generator(drake_bg_model, 10)

queen_model = RandomModel(data['Queen'], 'queen')
random_text_generator(queen_model, 10)

queen_bg_model = BigramModel('queen-songs.txt', 'queen')
text_generator(queen_bg_model, 10)

ironmaiden_model = RandomModel(data['Iron Maiden'], 'ironmaiden')
random_text_generator(ironmaiden_model, 10)

ironmaiden_bg_model = BigramModel('ironmaiden-songs.txt', 'ironmaiden')
text_generator(ironmaiden_bg_model, 10)

eminem_model = RandomModel(data['Eminem'], 'eminem')
random_text_generator(eminem_model, 10)

eminem_bg_model = BigramModel('eminem-songs.txt', 'eminem')
text_generator(eminem_bg_model, 10)

In [4]:
print(drake_bg_model.get_ppl('drake-bigram-generated-song.txt'))
print(drake_bg_model.get_ppl('drake-random-generated-song.txt'))
print('-----------------------------------------------')
print(queen_bg_model.get_ppl('queen-bigram-generated-song.txt'))
print(queen_bg_model.get_ppl('queen-random-generated-song.txt'))
print('-----------------------------------------------')
print(ironmaiden_bg_model.get_ppl('ironmaiden-bigram-generated-song.txt'))
print(ironmaiden_bg_model.get_ppl('ironmaiden-random-generated-song.txt'))
print('-----------------------------------------------')
print(eminem_bg_model.get_ppl('eminem-bigram-generated-song.txt'))
print(eminem_bg_model.get_ppl('eminem-random-generated-song.txt'))

136.35836530796567
1050.889190555876
-----------------------------------------------
94.95996887296059
825.1638531454465
-----------------------------------------------
180.71346499838944
500.5128051261902
-----------------------------------------------
151.67755301311522
1591.6845699444023


Neural Network Experiment

In [5]:
processed_text = {}

for name in nameList:
    processed_text[name] = text[name].lower() # turns all the text into lowercase
    processed_text[name] = re.sub(r'[^\x00-\x7f]',r'', processed_text[name]) #only include ascii characters

In [6]:
chars = {}
char_indices = {}
indices_char = {}

for name in nameList:
    chars[name] = sorted(list(set(processed_text[name]))) #gives a sorted list of all the characters that appear in your text
    char_indices[name] = dict((c,i) for i, c in enumerate(chars[name])) # dictionary that's in this form: "character": index of the character
    indices_char[name] = dict((i,c) for i, c in enumerate(chars[name])) # dictionary that's in this form: index: "character"

In [7]:
maxlen = 40
step = 3
sentences = {}
next_chars = {}


for name in nameList:
    sentences[name] = []
    next_chars[name] = []
    
    # the program goes through all the text from left to right, records a string of (maxlen) in sentences, 
    # and the character that comes next in next_chars
    for i in range(0, len(processed_text[name]) - maxlen, step):
        sentences[name].append(processed_text[name][i: i + maxlen])
        next_chars[name].append(processed_text[name][i + maxlen])
    print("nb sequences:", len(sentences[name]))


nb sequences: 102700
nb sequences: 113460
nb sequences: 22222
nb sequences: 214800


In [8]:
x = {}
y = {}

for name in nameList:

    # Now vectorize the data

    x[name] = np.zeros((len(sentences[name]), maxlen, len(chars[name])), dtype=np.bool) # 3D array of len(sentences) x maxlen x len(chars)
    y[name] = np.zeros((len(sentences[name]), len(chars[name])), dtype=np.bool) # 2D array of len(sentences) x len(chars)
    # x and ys are all filled with false at this point

    for i, sentence in enumerate(sentences[name]):
        #i is the index
        #sentence is whatever is in sentences[i]... a fragment of the lyrics 
        for t, char in enumerate(sentence):
            x[name][i, t, char_indices[name][char]] = 1 
            # your turning the string into a giant array. 
            # its like for "a" youd have [true, false, false,... ....false,false] and so on, for every char 
            # in every fragment in sentences 
        y[name][i, char_indices[name][next_chars[name][i]]] = 1
        # and y is just x but for next_chars instead of sentences
    


In [9]:
for name in nameList:
    print(x[name].shape)

(102700, 40, 59)
(113460, 40, 53)
(22222, 40, 50)
(214800, 40, 62)


In [10]:
for name in nameList:
    print(y[name].shape)

(102700, 59)
(113460, 53)
(22222, 50)
(214800, 62)


In [11]:
model = {}

for name in nameList:

    model[name] = Sequential() # ok now create the model
    model[name].add(LSTM(128, input_shape=(maxlen, len(chars[name])))) 
    #add a LSTM layer to the model... input shape is 
    #what kind of array is it expecting, in this case, a maxlen * len(chars) array... aka a vectorize lyric fragment
    #the first number represents units -- in this case there are 128 units 
    model[name].add(Dense(len(chars[name]), activation = 'softmax')) 
    # add a Dense layer to the model, len(chars) units and a softmax activation function

    optimizer = RMSprop(lr = 0.01) #this determines how gradient descent will be carried out to optimize your model

    model[name].compile(loss='categorical_crossentropy',optimizer=optimizer) 
    # sets up the model so it can be trained,
    # first defining a loss function, and then an optimizatio function

In [12]:
for name in nameList:
    print(model[name].summary)

<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x13d583810>>
<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x13ee9bc10>>
<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x13fc00f90>>
<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x13fc42490>>


In [13]:
from collections import defaultdict, Counter
from itertools import islice
import numpy as np
import nltk

def sent_transform(sent_string):
    return nltk.word_tokenize(sent_string.lower())

# words is a list of words
def make_ngram_tuples(words, n):
    for i in range(0,n):
        words.insert(0, '<s>')
    words.append('</s>')
    result = []
    x = range(n, len(words))
    for i in x:
        context = tuple(words[i-n+1:i])
        sequence = (context,) + (words[i],)
        result.append(sequence)
    return result

class BigramModel(object):
    
    def __init__(self, inputfile):
        self.d = Counter() # keep track of # of words
        
        # iterating through the text file and incrementing the count for the words seen
        text = inputfile
        sentences = nltk.sent_tokenize(text)
        for line in sentences:
            if(line != '\n'):
                list_words = sent_transform(line)
                ngram_sequence = make_ngram_tuples(list_words, 2)
                for e in ngram_sequence:
                    self.d["".join(e[0])] += 1
                    self.d[e[1]] += 1
        
        self.unk_set = set() # set containing words that appear only once
        self.dsum = 0 # the size of the vocabulary
        
        for word in self.d:
            if self.d[word] == 1:
                self.unk_set.add(word)
            else:
                self.dsum += 1
        
        self.d_bigram = defaultdict(lambda: defaultdict(lambda: 0))
        text = inputfile
        sentences = nltk.sent_tokenize(text)
        for line in sentences:
            list_words = sent_transform(line) 
            ngram_sequence = make_ngram_tuples(list_words, 2)
            for e in ngram_sequence:
                context = "".join(e[0]) if "".join(e[0]) not in self.unk_set else "<UNK>"
                word = e[1] if e[1] not in self.unk_set else "<UNK>"
                self.d_bigram[context][word] += 1
    
    def logprob(self, context, word): 
        if context in self.unk_set:
            context = "<UNK>"
        if word in self.unk_set:
            word = "<UNK>"
        return np.log2(self.d_bigram[context][word]+1)-np.log2(self.d[context]+self.dsum)
    
    def get_ppl(self, testfile):
        log_corpus_prob = 0
        len_corpus = 0
        f = testfile
        for line in f:
            if(line != '\n'):
                list_words = sent_transform(line)
                ngram_sequence = make_ngram_tuples(list_words, 2)
                for e in ngram_sequence:
                    context = "".join(e[0]) if "".join(e[0]) not in self.unk_set else "<UNK>"
                    word = e[1] if e[1] not in self.unk_set else "<UNK>"
                    log_corpus_prob += self.logprob(context, word)
                len_corpus += len(list_words)
        return 2 ** (-1 * log_corpus_prob/len_corpus)
    
#perplexity_model = BigramModel(devtext) testing on the development set for tuning hyperparameters

perplexity_model = {}
for name in nameList:
    perplexity_model[name] = BigramModel(testtext[name]) # now the data is being tested on the testset

In [14]:
def sample(preds, temperature=1.0): #whichever character has the highest likelyhood is the character that gets printed
    
    # temperature - determines how crazy the program can get
    
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [15]:
def on_epoch_end(epoch, _):
    
    
    start_index = random.randint(0, len(processed_text[name]) - maxlen - 1)
    #for temperature in [0.2, 0.5, 1.0, 1.2]: testing temperature for the development sets
    temperature = 0.2
    print("------------------temperature:", temperature)

    generated = ''
    sentence = processed_text[name][start_index: start_index + maxlen]
    generated += sentence
    print('------------------generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    # take a random (maxlen) fragment of the lyrics -- this will be your start

    for i in range(400):
        x_pred = np.zeros((1, maxlen, len(chars[name])))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[name][char]] = 1
        # turns the seed into a vector

        preds = model[name].predict(x_pred, verbose=0)[0] # predict the next character 
        next_index = sample(preds, temperature) #get the index of the character
        next_char = indices_char[name][next_index] #get the character

        generated += next_char #add the next character
        sentence = sentence[1:] + next_char # shift the "sentence" that will be used to predict

        sys.stdout.write(next_char)
        sys.stdout.flush()

    print()
    print("perplexity: " , perplexity_model[name].get_ppl(generated)) #get perplexity
    print("")
            

In [None]:
print_callback = LambdaCallback(on_epoch_end = on_epoch_end)


for name in nameList:
    print("Generating Lyrics for:" + name)
    
    model[name].fit(x[name], y[name],
             batch_size=128,
             epochs=4,
             callbacks=[print_callback])

# an epoch is a round of training
# a batch_size is how many samples are used in each epoch, 
# because you don't want to use every single lyric sample during
# training

# and finally, the callback just makes it so the function is
# ran after each epoch.


Source for neural network code: 

Jeff Heaton. 2019. Text Generation with Keras and TensorFlow (10.3). Youtube. https://www.youtube.com/watch?v=6ORnRAz3gnA