In [38]:
def tokenize_lyrics(lyrics):
    lyrics = lyrics.replace(' ', '_')
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"[a-z]|\ |\n'|'")
    return ['_' if x == ' ' else x for x in tokenizer.tokenize(lyrics.lower())]

In [39]:
ascii_values = [ord('\n'), ord('_'), ord("'")] + list(range(ord('a'), ord('z') + 1))
chars = list(map(chr, ascii_values))
indices = list(range(len(ascii_values)))
char_to_index = dict(zip(chars, indices))
index_to_char = dict(zip(indices, chars))

In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv('data/Refined_Dataset.csv')

In [5]:
len(data)

238215

In [33]:
def generate_sentences(tokenized_lyrics, max_len=20, step=1):
    sentences = []
    next_char = []
    for i in range(0, len(tokenized_lyrics) - max_len, step):
        sentences.append(tokenized_lyrics[i:i + max_len])
        next_char.append(tokenized_lyrics[i + max_len])
        
    return sentences, next_char

In [47]:
def vectorize_sentences(sentences, next_char, max_len=20):
    x = np.zeros((len(sentences), max_len, len(chars)), dtype=np.int32)
    y = np.zeros((len(sentences), len(chars)), dtype=np.int32)

    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_to_index[char]] = 1
        y[i, char_to_index[next_char[i]]] = 1
        
    return x,y

In [14]:
def process_lyrics(lyrics):
    tokenized_lyrics = tokenize_lyrics(lyrics)
    sentences, next_char = generate_sentences(tokenized_lyrics)
    x, y = vectorize_sentences(sentences, next_char)
    return x,y

In [10]:
data.loc[0].lyrics

"Oh baby, how you doing?\nYou know I'm gonna cut right to the chase\nSome women were made but me, myself\nI like to think that I was created for a special purpose\nYou know, what's more special than you? You feel me\nIt's on baby, let's get lost\nYou don't need to call into work 'cause you're the boss\nFor real, want you to show me how you feel\nI consider myself lucky, that's a big deal\nWhy? Well, you got the key to my heart\nBut you ain't gonna need it, I'd rather you open up my body\nAnd show me secrets, you didn't know was inside\nNo need for me to lie\nIt's too big, it's too wide\nIt's too strong, it won't fit\nIt's too much, it's too tough\nHe talk like this 'cause he can back it up\nHe got a big ego, such a huge ego\nI love his big ego, it's too much\nHe walk like this 'cause he can back it up\nUsually I'm humble, right now I don't choose\nYou can leave with me or you could have the blues\nSome call it arrogant, I call it confident\nYou decide when you find on what I'm working 

In [17]:
tokenized_lyrics = tokenize_lyrics(data.loc[0].lyrics)

In [48]:
x, y = process_lyrics(data.loc[0].lyrics)

In [50]:
x

array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

In [34]:
sentences, next_char = generate_sentences(tokenized_lyrics)

In [35]:
sentences

[['o',
  'h',
  '_',
  'b',
  'a',
  'b',
  'y',
  '_',
  'h',
  'o',
  'w',
  '_',
  'y',
  'o',
  'u',
  '_',
  'd',
  'o',
  'i',
  'n'],
 ['h',
  '_',
  'b',
  'a',
  'b',
  'y',
  '_',
  'h',
  'o',
  'w',
  '_',
  'y',
  'o',
  'u',
  '_',
  'd',
  'o',
  'i',
  'n',
  'g'],
 ['_',
  'b',
  'a',
  'b',
  'y',
  '_',
  'h',
  'o',
  'w',
  '_',
  'y',
  'o',
  'u',
  '_',
  'd',
  'o',
  'i',
  'n',
  'g',
  'y'],
 ['b',
  'a',
  'b',
  'y',
  '_',
  'h',
  'o',
  'w',
  '_',
  'y',
  'o',
  'u',
  '_',
  'd',
  'o',
  'i',
  'n',
  'g',
  'y',
  'o'],
 ['a',
  'b',
  'y',
  '_',
  'h',
  'o',
  'w',
  '_',
  'y',
  'o',
  'u',
  '_',
  'd',
  'o',
  'i',
  'n',
  'g',
  'y',
  'o',
  'u'],
 ['b',
  'y',
  '_',
  'h',
  'o',
  'w',
  '_',
  'y',
  'o',
  'u',
  '_',
  'd',
  'o',
  'i',
  'n',
  'g',
  'y',
  'o',
  'u',
  '_'],
 ['y',
  '_',
  'h',
  'o',
  'w',
  '_',
  'y',
  'o',
  'u',
  '_',
  'd',
  'o',
  'i',
  'n',
  'g',
  'y',
  'o',
  'u',
  '_',
  'k'],
 ['_',
  'h',

In [36]:
next_char

['g',
 'y',
 'o',
 'u',
 '_',
 'k',
 'n',
 'o',
 'w',
 '_',
 'i',
 "'",
 'm',
 '_',
 'g',
 'o',
 'n',
 'n',
 'a',
 '_',
 'c',
 'u',
 't',
 '_',
 'r',
 'i',
 'g',
 'h',
 't',
 '_',
 't',
 'o',
 '_',
 't',
 'h',
 'e',
 '_',
 'c',
 'h',
 'a',
 's',
 'e',
 's',
 'o',
 'm',
 'e',
 '_',
 'w',
 'o',
 'm',
 'e',
 'n',
 '_',
 'w',
 'e',
 'r',
 'e',
 '_',
 'm',
 'a',
 'd',
 'e',
 '_',
 'b',
 'u',
 't',
 '_',
 'm',
 'e',
 '_',
 'm',
 'y',
 's',
 'e',
 'l',
 'f',
 'i',
 '_',
 'l',
 'i',
 'k',
 'e',
 '_',
 't',
 'o',
 '_',
 't',
 'h',
 'i',
 'n',
 'k',
 '_',
 't',
 'h',
 'a',
 't',
 '_',
 'i',
 '_',
 'w',
 'a',
 's',
 '_',
 'c',
 'r',
 'e',
 'a',
 't',
 'e',
 'd',
 '_',
 'f',
 'o',
 'r',
 '_',
 'a',
 '_',
 's',
 'p',
 'e',
 'c',
 'i',
 'a',
 'l',
 '_',
 'p',
 'u',
 'r',
 'p',
 'o',
 's',
 'e',
 'y',
 'o',
 'u',
 '_',
 'k',
 'n',
 'o',
 'w',
 '_',
 'w',
 'h',
 'a',
 't',
 "'",
 's',
 '_',
 'm',
 'o',
 'r',
 'e',
 '_',
 's',
 'p',
 'e',
 'c',
 'i',
 'a',
 'l',
 '_',
 't',
 'h',
 'a',
 'n',
 '_',
 'y'

In [22]:
len(tokenized_lyrics)

1874