In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.util import ngrams
from collections import Counter
import re
from nltk.probability import ConditionalFreqDist
import random
import nltk

Setting the parameters for lyrics generation

In [2]:
# Genre of music to train on; one of:
# 'R&B','Pop','Jazz','Hip-Hop','Folk','Metal', 'Indie', 'Electronic','Country'
genre = 'Pop'
# Order of the n-gram model (n should be between 2 and 5).
n = 4
# Starting words of the lyrics; should be n-1 words long.
seed = 'you are the'
# Each next word is sampled from the num_sample most likely candidates.
num_sample = 20
# Number of words to generate for the final lyrics.
num_lyrics = 200

Creation of training data from the precompiled cleaned lyrics.

In [3]:
# Read the pre-cleaned lyrics for the chosen genre. The context manager closes
# the file handle as soon as the text has been read instead of leaking it.
with open('./data/'+genre+'.txt', 'r') as lyrics_file:
    data = lyrics_file.read()
# Split on single spaces only, so every space-separated token becomes one
# training word.
train_words = data.split(" ")

We create n-grams based on the value of n chosen above.

In [4]:
# The counting/sampling helpers in this notebook exist only for bigrams up to
# 5-grams, so reject any other order here instead of silently leaving `grams`
# undefined (or stale from an earlier run of this cell).
if n not in (2, 3, 4, 5):
    raise ValueError("n should be between 2 and 5, got %r" % (n,))
ngram_order = int(n)
# Slide a window of n words over the training text; each window becomes one
# n-tuple, which yields len(train_words) - n + 1 n-grams.
grams = [
    tuple(train_words[i:i + ngram_order])
    for i in range(len(train_words) - ngram_order + 1)
]

We want a count of the number of occurrences of each word following a given sequence of context words. Depending on the value of n chosen, we run the appropriate function to count the matching n-grams.

In [5]:
def countFreqBigrams(s, ngram_list=None):
    """Count the bigrams whose first word equals the context word.

    Parameters
    ----------
    s : sequence of str
        Context words; only ``s[0]`` is compared.
    ngram_list : iterable of 2-tuples, optional
        Bigrams to scan. When omitted, the notebook-level ``grams`` is used.

    Returns
    -------
    collections.Counter
        Maps every matching ``(w1, w2)`` bigram to its number of occurrences.
    """
    if ngram_list is None:
        # Fall back to the n-grams built earlier in the notebook.
        ngram_list = grams
    return Counter(
        (w1, w2) for (w1, w2) in ngram_list
        if w1 == s[0]
    )
def countFreqTrigrams(s, ngram_list=None):
    """Count the trigrams whose first two words equal the context words.

    Parameters
    ----------
    s : sequence of str
        Context words; ``s[0]`` and ``s[1]`` are compared.
    ngram_list : iterable of 3-tuples, optional
        Trigrams to scan. When omitted, the notebook-level ``grams`` is used.

    Returns
    -------
    collections.Counter
        Maps every matching ``(w1, w2, w3)`` trigram to its number of
        occurrences.
    """
    if ngram_list is None:
        # Fall back to the n-grams built earlier in the notebook.
        ngram_list = grams
    return Counter(
        (w1, w2, w3) for (w1, w2, w3) in ngram_list
        if w1 == s[0] and w2 == s[1]
    )
def countFreq4grams(s, ngram_list=None):
    """Count the 4-grams whose first three words equal the context words.

    Parameters
    ----------
    s : sequence of str
        Context words; ``s[0]``, ``s[1]`` and ``s[2]`` are compared.
    ngram_list : iterable of 4-tuples, optional
        4-grams to scan. When omitted, the notebook-level ``grams`` is used.

    Returns
    -------
    collections.Counter
        Maps every matching ``(w1, w2, w3, w4)`` 4-gram to its number of
        occurrences.
    """
    if ngram_list is None:
        # Fall back to the n-grams built earlier in the notebook.
        ngram_list = grams
    return Counter(
        (w1, w2, w3, w4) for (w1, w2, w3, w4) in ngram_list
        if w1 == s[0] and w2 == s[1] and w3 == s[2]
    )
def countFreq5grams(s, ngram_list=None):
    """Count the 5-grams whose first four words equal the context words.

    Parameters
    ----------
    s : sequence of str
        Context words; ``s[0]`` through ``s[3]`` are compared.
    ngram_list : iterable of 5-tuples, optional
        5-grams to scan. When omitted, the notebook-level ``grams`` is used.

    Returns
    -------
    collections.Counter
        Maps every matching ``(w1, w2, w3, w4, w5)`` 5-gram to its number of
        occurrences.
    """
    if ngram_list is None:
        # Fall back to the n-grams built earlier in the notebook.
        ngram_list = grams
    return Counter(
        (w1, w2, w3, w4, w5) for (w1, w2, w3, w4, w5) in ngram_list
        if w1 == s[0] and w2 == s[1] and w3 == s[2] and w4 == s[3]
    )
        

We want to sample among the likely possibilities for the next word and choose one randomly for text generation. We select the 'num_sample' most common words and sample them according to their probability of occurrence.

In [6]:
def getprobsBigrams(c, w0, top_k=None):
    """Turn bigram counts into candidate next words and their probabilities.

    Parameters
    ----------
    c : collections.Counter
        Bigram counts, keyed by ``(w1, w2)`` tuples.
    w0 : list of str
        Current context. Kept for interface compatibility; it is neither read
        nor modified (the previous implementation overwrote ``w0[0]`` through
        its loop target).
    top_k : int, optional
        Number of most common bigrams to keep. When omitted, the notebook-level
        ``num_sample`` is used.

    Returns
    -------
    (list, list)
        The last word of each kept bigram, and the matching probabilities
        (each count divided by the sum of the kept counts). Both lists are
        empty when ``c`` is empty.
    """
    if top_k is None:
        top_k = num_sample
    list_count = c.most_common(top_k)
    list_vals = [gram[-1] for gram, _ in list_count]
    sum_cnt = sum(cnt for _, cnt in list_count)
    # No division happens when list_count is empty, so sum_cnt == 0 is safe.
    list_probs = [cnt / sum_cnt for _, cnt in list_count]
    return list_vals, list_probs

def getprobsTrigrams(c, w0, top_k=None):
    """Turn trigram counts into candidate next words and their probabilities.

    Parameters
    ----------
    c : collections.Counter
        Trigram counts, keyed by ``(w1, w2, w3)`` tuples.
    w0 : list of str
        Current context. Kept for interface compatibility; it is neither read
        nor modified (the previous implementation overwrote ``w0[0]`` and
        ``w0[1]`` through its loop target).
    top_k : int, optional
        Number of most common trigrams to keep. When omitted, the
        notebook-level ``num_sample`` is used.

    Returns
    -------
    (list, list)
        The last word of each kept trigram, and the matching probabilities
        (each count divided by the sum of the kept counts). Both lists are
        empty when ``c`` is empty.
    """
    if top_k is None:
        top_k = num_sample
    list_count = c.most_common(top_k)
    list_vals = [gram[-1] for gram, _ in list_count]
    sum_cnt = sum(cnt for _, cnt in list_count)
    # No division happens when list_count is empty, so sum_cnt == 0 is safe.
    list_probs = [cnt / sum_cnt for _, cnt in list_count]
    return list_vals, list_probs

def getprobs4grams(c, w0, top_k=None):
    """Turn 4-gram counts into candidate next words and their probabilities.

    Parameters
    ----------
    c : collections.Counter
        4-gram counts, keyed by ``(w1, w2, w3, w4)`` tuples.
    w0 : list of str
        Current context. Kept for interface compatibility; it is neither read
        nor modified (the previous implementation overwrote ``w0[0]`` to
        ``w0[2]`` through its loop target).
    top_k : int, optional
        Number of most common 4-grams to keep. When omitted, the
        notebook-level ``num_sample`` is used.

    Returns
    -------
    (list, list)
        The last word of each kept 4-gram, and the matching probabilities
        (each count divided by the sum of the kept counts). Both lists are
        empty when ``c`` is empty.
    """
    if top_k is None:
        top_k = num_sample
    list_count = c.most_common(top_k)
    list_vals = [gram[-1] for gram, _ in list_count]
    sum_cnt = sum(cnt for _, cnt in list_count)
    # No division happens when list_count is empty, so sum_cnt == 0 is safe.
    list_probs = [cnt / sum_cnt for _, cnt in list_count]
    return list_vals, list_probs

def getprobs5grams(c, w0, top_k=None):
    """Turn 5-gram counts into candidate next words and their probabilities.

    Parameters
    ----------
    c : collections.Counter
        5-gram counts, keyed by ``(w1, w2, w3, w4, w5)`` tuples.
    w0 : list of str
        Current context. Kept for interface compatibility; it is neither read
        nor modified (the previous implementation overwrote ``w0[0]`` to
        ``w0[3]`` through its loop target).
    top_k : int, optional
        Number of most common 5-grams to keep. When omitted, the
        notebook-level ``num_sample`` is used.

    Returns
    -------
    (list, list)
        The last word of each kept 5-gram, and the matching probabilities
        (each count divided by the sum of the kept counts). Both lists are
        empty when ``c`` is empty.
    """
    if top_k is None:
        top_k = num_sample
    list_count = c.most_common(top_k)
    list_vals = [gram[-1] for gram, _ in list_count]
    sum_cnt = sum(cnt for _, cnt in list_count)
    # No division happens when list_count is empty, so sum_cnt == 0 is safe.
    list_probs = [cnt / sum_cnt for _, cnt in list_count]
    return list_vals, list_probs

We generate the number of required words from the model.

In [7]:
# Select the counting and sampling helpers for the configured n-gram order
# once, instead of re-testing n for every generated word.
ngram_helpers = {
    2: (countFreqBigrams, getprobsBigrams),
    3: (countFreqTrigrams, getprobsTrigrams),
    4: (countFreq4grams, getprobs4grams),
    5: (countFreq5grams, getprobs5grams),
}
if n not in ngram_helpers:
    raise ValueError("n should be between 2 and 5, got %r" % (n,))
count_freq, get_probs = ngram_helpers[n]

sentence = [seed]
seed_words = seed.split()
# The helpers compare the context by position, so it has to hold exactly
# n-1 words: a shorter seed cannot be used, and for a longer seed only the
# last n-1 words form the context of the next word.
if len(seed_words) < n - 1:
    raise ValueError(
        "seed should be at least n-1 = %d words long, got %d"
        % (n - 1, len(seed_words))
    )
w0 = seed_words[-(n - 1):]
for i in range(num_lyrics):
    c = count_freq(w0)
    list_vals, list_probs = get_probs(c, w0)
    if not list_vals:
        # The current context never occurs in the training n-grams, so there
        # is nothing to sample from; keep what has been generated so far.
        print("No continuation found for %r; stopped after %d generated words."
              % (' '.join(w0), i))
        break
    w1 = np.random.choice(list_vals, p=list_probs)
    sentence.append(w1)
    # Slide the context window forward by one word.
    w0.pop(0)
    w0.append(w1)


Formatting the output..

In [8]:
# Turn the generated tokens into printable lyrics: '<eos>' ends a line with a
# full stop, 'chorus' markers are dropped, and the first word of every line is
# capitalised. Every word is prefixed with a space, so each line starts with one.
para = ''
for word in sentence:
    # True at the very beginning and right after a line break. Checking for
    # the empty string first avoids indexing into an empty `para`.
    at_line_start = para == '' or para[-1] == "\n"
    if word == '<eos>':
        # Only close lines that actually contain text; repeated '<eos>'
        # tokens would otherwise produce lines holding just a full stop.
        if not at_line_start:
            para += ".\n"
        continue
    if word == 'chorus':
        continue
    if at_line_start:
        word = word.capitalize()
    if word == 'i':
        # str.capitalize() returns a new string; the result has to be kept
        # for the pronoun to appear as "I".
        word = word.capitalize()
    para = para + " " + word
print(para)

 You are the air that we call friends.
 In his touch.
 I gotta keep these cheeks dry today.
 Gotta be a better way to part.
 Nikki its you you you.
 I dont want to live without your love.
 I feel so sad.
 Its too much pressure on me.
 I know youre mine pride and joy.
 He sang like a beautiful girl.
 Thats my prerogative.
 Its my life.
 And im so glad were through.
 Now i have you here today for a bit more ductile like me.
 Cause i dont want to fall on the floor and shake that ass.
 They gon hear you outside from the outside.
 But im hurtin while im with you.
 With you.
 And now youre gonna lie.
 Sometimes when im mad you dont wanna play with mine.
 And when you go.
 I need your loving arms again.
 And its the mystery of love.
 When the going gets rough.
 Love isnt on your daily agenda.
 No matter what it takes then let it go let it go let it all go were gonna


This section has been used to preprocess the lyrics dataset available on
'https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics'.
The lyrics have been divided into categories and cleaned and saved in text files for training.

In [9]:
# One-off preprocessing (commented out): reads lyrics.csv and, for every genre,
# writes the cleaned lyrics to ./data/<genre>.txt. Cleaning lower-cases the
# text, replaces newlines with " <eos> ", strips the listed punctuation and
# digits from every word, and drops words that end up empty. Entries whose
# lyrics raise AttributeError (e.g. on .lower()) are skipped.
# songs = pd.read_csv('lyrics.csv')
# for g in list(set(songs["genre"])):
#     print(g)
#     song = songs[songs['genre']==g]
    
#     file = open('./data/'+g+'.txt', 'w')
#     for data in song['lyrics']:
#         try:
#             #data = open('english_text.txt', 'r').read()
#             data= data.lower()
#             lyrics = ''
#             kk = data.replace("\n"," <eos> ")
#             train_words = kk.split(" ")
#             for i in range (len(train_words)):
#                 train_words[i] = train_words[i].translate ({ord(c): '' for c in "!@#$%^&*()[]{};:,./1234567890?\\|`~-=_+-'"})
#             while '' in train_words:
#                 train_words.remove('')
#             for i in range (len(train_words)):
#                 lyrics = lyrics + train_words[i] + ' '
#             file.write(lyrics)
#         except AttributeError:
#             pass

#     file.close()
    