# Create a corpus
- Create and output a charset we are interested in.
- Convert data and output a subset we are interested in.

In [1]:
import csv
import numpy as np
import pandas as pd
from random import sample

In [2]:
# hyper parameters
max_char_n = 20
n_examples = 100000

In [3]:
# output
output_dir = 'data/'
charset_file = '{}charset.csv'.format(output_dir)
dataset_file = '{}dataset.csv'.format(output_dir)

## Functions

In [4]:
# Transforms a csv into an array of song lyrics (None, 1)
def csvToSongLyricsArray(csv):
    # filter for only lyrics from the 1990s, of the pop genre, and not instrumentals
    mask = (csv['year'] > 1989) & (csv['year'] < 2000) & (csv['genre'] == 'Pop') & (csv['lyrics'] != '[Instrumental]')
    filtered = csv[mask]
    
    # remove null values
    nonNull = filtered.dropna()
    
    # trim all the extra data. We only want the lyrics
    lyrics = nonNull['lyrics']
    
    # reindex the lyrics to make it easier to work with
    reindexed = lyrics.reset_index(drop=True)
    
    # lowercase the lyrics
    lowercased = reindexed[:].str.lower()
    
    # get the number of song lyrics
    n_songs = lowercased.shape[0]
    
    return lowercased, n_songs

In [5]:
# filter out any song where lyrics contain a character outside the chars set
def filterLyrics(charset, lyrics):
    filtered_lyrics = []
    
    # for each song
    for lyric in lyrics:
        check = 0
        
        # split the lyric into an array of characters
        lyric_chars = list(lyric)
        
        # for each character, check if it's not in the chars set
        for char in lyric_chars:
            if char not in charset:
                check = 1

        # if all character are in the chars set
        # add it to our filter lyrics list
        if check == 0:
            filtered_lyrics.append(lyric_chars)
            
    # get the number of lyrics
    n_filtered_lyrics = len(filtered_lyrics)    
    
    return filtered_lyrics, n_filtered_lyrics

In [6]:
# flatten the previous into a list of song lyrics lines
def flatten_lyrics(lyrics):
    flattened_lyrics = [line for song in lyrics for line in song]
    n_chars = len(flattened_lyrics)
    
    return flattened_lyrics, n_chars

In [7]:
def generateDataset(lyrics, n_chars, n_examples, max_char_n):
    dataset = []
    max_index = n_chars - max_char_n
    start_indices = np.random.randint(0, max_index, size=n_examples)

    for start_index in start_indices:
        end_index = start_index + max_char_n
        example = lyrics[start_index:end_index]
        start_index = end_index
        dataset.append(example)
        
    return dataset

## Extract and Transform Raw Dataset

In [8]:
# load raw data file as a dataframe
raw_data = pd.read_csv('data/raw.csv')

In [9]:
# get formatted_lyrics and number of songs
lyrics, n_lyrics = csvToSongLyricsArray(raw_data)

In [10]:
lyrics.head(10)

0    come they told me, pa rum pum pum pum\na new b...
1    over the ground lies a mantle, white\na heaven...
2    i just came back from a lovely trip along the ...
3    i'm dreaming of a white christmas\njust like t...
4    just hear those sleigh bells jingle-ing, ring-...
5    little rump shaker she can really shake and ba...
6    girl you want to sex me\ngirl, why don't you l...
7    oooh, tonight i want to turn the lights down l...
8    so you say he let you on, you'll never give yo...
9    something about you baby\nthat makes me wanna ...
Name: lyrics, dtype: object

In [11]:
# examine the number of song lyrics we have
print("Number of lyrics: {}".format(n_lyrics))
print("Lyric Example: {}".format(lyrics[0]))

Number of lyrics: 964
Lyric Example: come they told me, pa rum pum pum pum
a new born king to see, pa rum pum pum pum
our finest gifts we bring, pa rum pum pum pum
to lay before the king, pa rum pum pum pum,
rum pum pum pum, rum pum pum pum,
so to honor him, pa rum pum pum pum,
when we come.
little baby, pa rum pum pum pum
i am a poor boy too, pa rum pum pum pum
i have no gift to bring, pa rum pum pum pum
that's fit to give the king, pa rum pum pum pum,
rum pum pum pum, rum pum pum pum,
shall i play for you, pa rum pum pum pum,
on my drum?
mary nodded, pa rum pum pum pum
the ox and lamb kept time, pa rum pum pum pum
i played my drum for him, pa rum pum pum pum,
rum pum pum pum, rum pum pum pum,
then he smiled at me, pa rum pum pum pum
me and my drum
me and my drum
me and my drum
me and my drum
come they told me, pa rum pum pum pum
me and my drum


## Filter out non-english lyrics

In [12]:
charset = ["'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'x', 'z', '\n', '!', '"', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' ']

In [13]:
# determine number of charecters in our set
n_charset = len(charset)

print("Number of characters in chars: {}".format(n_charset))

Number of characters in chars: 60


In [14]:
# filter out any song where lyrics contain a character outside the english set
filtered_lyrics, n_filtered_lyrics = filterLyrics(charset, lyrics)

print("Number of english songs: {}".format(n_filtered_lyrics))
print("A english song lyric: {}".format(filtered_lyrics[0]))

Number of english songs: 782
A english song lyric: ['c', 'o', 'm', 'e', ' ', 't', 'h', 'e', 'y', ' ', 't', 'o', 'l', 'd', ' ', 'm', 'e', ',', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'a', ' ', 'n', 'e', 'w', ' ', 'b', 'o', 'r', 'n', ' ', 'k', 'i', 'n', 'g', ' ', 't', 'o', ' ', 's', 'e', 'e', ',', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'o', 'u', 'r', ' ', 'f', 'i', 'n', 'e', 's', 't', ' ', 'g', 'i', 'f', 't', 's', ' ', 'w', 'e', ' ', 'b', 'r', 'i', 'n', 'g', ',', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 't', 'o', ' ', 'l', 'a', 'y', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 't', 'h', 'e', ' ', 'k', 'i', 'n', 'g', ',', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ',', '\n', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ',', ' ', 'r',

In [15]:
# flatten english song lyrics
flattened_lyrics, n_chars = flatten_lyrics(filtered_lyrics)

print("Number of song lyrics characters: {}".format(n_chars))
print("Section of song lyrics: {}".format(flattened_lyrics[0:100]))

Number of song lyrics characters: 846434
Section of song lyrics: ['c', 'o', 'm', 'e', ' ', 't', 'h', 'e', 'y', ' ', 't', 'o', 'l', 'd', ' ', 'm', 'e', ',', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'a', ' ', 'n', 'e', 'w', ' ', 'b', 'o', 'r', 'n', ' ', 'k', 'i', 'n', 'g', ' ', 't', 'o', ' ', 's', 'e', 'e', ',', ' ', 'p', 'a', ' ', 'r', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', ' ', 'p', 'u', 'm', '\n', 'o', 'u', 'r', ' ', 'f', 'i', 'n', 'e', 's', 't', ' ', 'g', 'i', 'f', 't', 's', ' ', 'w', 'e']


## Extract the subset we are interested in

In [16]:
# generate n_examples example of max_char_n length
dataset = generateDataset(flattened_lyrics, n_chars, n_examples, max_char_n)

print("Number of examples in dataset: {}".format(len(dataset)))
print("Example: {}".format(dataset[0]))

Number of examples in dataset: 100000
Example: ['t', 'e', 'r', 'd', 'a', 'y', "'", 's', ' ', 'g', 'i', 'r', 'l', ',', '\n', 'h', 'e', ' ', 'i', 's']


## Export datasets

In [17]:
# save charset
with open(charset_file, 'w', newline='') as csvFile:
    file = csv.writer(csvFile, delimiter=',')
    file.writerows(charset)

In [18]:
# save dataset
with open(dataset_file, 'w', newline='') as csvFile:
    file = csv.writer(csvFile, delimiter=',')
    file.writerows(dataset)