# Create a corpus

## Extract and Transform Raw Dataset

In [1]:
import pandas as pd

In [2]:
# read the raw dataset from disk into a dataframe
raw_data_path = 'data/raw.csv'
raw_data = pd.read_csv(raw_data_path)

In [3]:
# keep only rows whose year is in 1990-1999, whose genre is 'Pop',
# and whose lyrics are not the '[Instrumental]' placeholder
released_in_1990s = raw_data['year'].gt(1989) & raw_data['year'].lt(2000)
is_pop = raw_data['genre'].eq('Pop')
not_instrumental = raw_data['lyrics'].ne('[Instrumental]')

mask = released_in_1990s & is_pop & not_instrumental
filtered_data = raw_data.loc[mask]

In [4]:
# drop every row that has a missing value in any of its columns
cleaned_data = filtered_data.dropna(axis=0, how='any')

In [5]:
# discard every other column; from here on only the lyrics text is used
raw_lyrics = cleaned_data.loc[:, 'lyrics']

In [6]:
# Renumber the lyrics 0..n-1 and discard the old row labels (drop=True), so
# that later cells can address songs by position.
reindexed_lyrics = raw_lyrics.reset_index(drop=True)

In [7]:
# normalise case so that every lyric is entirely lowercase
formatted_lyrics = reindexed_lyrics.str.lower()
formatted_lyrics.head(10)

0    come they told me, pa rum pum pum pum\na new b...
1    over the ground lies a mantle, white\na heaven...
2    i just came back from a lovely trip along the ...
3    i'm dreaming of a white christmas\njust like t...
4    just hear those sleigh bells jingle-ing, ring-...
5    little rump shaker she can really shake and ba...
6    girl you want to sex me\ngirl, why don't you l...
7    oooh, tonight i want to turn the lights down l...
8    so you say he let you on, you'll never give yo...
9    something about you baby\nthat makes me wanna ...
Name: lyrics, dtype: object

In [8]:
# count how many song lyrics remain after filtering
n_formatted_lyrics = len(formatted_lyrics)
print(n_formatted_lyrics)

964


In [9]:
# split each song's lyrics on \n so that every song becomes a list of lines
# lyrics_lines is therefore a list of songs, each song a list of line strings
lyrics_lines = [song.split('\n') for song in formatted_lyrics]

In [10]:
# flatten the per-song lists into one list of lines, keeping song order
flattened_lyrics_lines = []
for song_lines in lyrics_lines:
    flattened_lyrics_lines.extend(song_lines)

In [11]:
# show how many lyric lines there are in total, followed by the first one
print(len(flattened_lyrics_lines), flattened_lyrics_lines[0], sep='\n')

35188
come they told me, pa rum pum pum pum


## Filter out non-English lyrics

In [12]:
# Whitelist of characters a lyric line may contain: the space, the apostrophe
# and the 26 lowercase ASCII letters (each listed exactly once).
# Digits are not whitelisted, so the membership check in the filtering loop
# rejects any line that contains one.
char_set = list(" 'abcdefghijklmnopqrstuvwxyz")

In [13]:
from keras.preprocessing.text import text_to_word_sequence

# Punctuation that text_to_word_sequence strips from a line before the
# character check. The apostrophe is not in this list, so contractions such
# as "that's" are kept. A raw string is used because the backslash is a
# literal character to strip, not the start of an escape sequence.
punctuation_filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'

english_lyrics_lines = []

for line in flattened_lyrics_lines:
    # tokenise the line with punctuation removed, then rebuild it with
    # single spaces between the words
    words = text_to_word_sequence(line, filters=punctuation_filters)
    cleaned_line = " ".join(words)

    # keep the line only if every remaining character is whitelisted;
    # all() stops at the first character that is not in char_set
    if all(char in char_set for char in cleaned_line):
        english_lyrics_lines.append(cleaned_line)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [14]:
# examine the resulting number of song lyrics lines we have
print(len(english_lyrics_lines))
print(english_lyrics_lines[0])

33460
come they told me pa rum pum pum pum


In [15]:
# combine all lines
corpus = ""

for line in english_lyrics_lines:
    corpus += line + "\n"
    
# examine the resulting corpus
print(corpus)

come they told me pa rum pum pum pum
a new born king to see pa rum pum pum pum
our finest gifts we bring pa rum pum pum pum
to lay before the king pa rum pum pum pum
rum pum pum pum rum pum pum pum
so to honor him pa rum pum pum pum
when we come
little baby pa rum pum pum pum
i am a poor boy too pa rum pum pum pum
i have no gift to bring pa rum pum pum pum
that's fit to give the king pa rum pum pum pum
rum pum pum pum rum pum pum pum
shall i play for you pa rum pum pum pum
on my drum
mary nodded pa rum pum pum pum
the ox and lamb kept time pa rum pum pum pum
i played my drum for him pa rum pum pum pum
rum pum pum pum rum pum pum pum
then he smiled at me pa rum pum pum pum
me and my drum
me and my drum
me and my drum
me and my drum
come they told me pa rum pum pum pum
me and my drum
over the ground lies a mantle white
a heaven of diamonds shine down through the night
two hearts are thrilling
in spite of the chilling weather
sleigh bells ring are you listening
in the lane snow is glisten

In [17]:
# Save the corpus to a text file.
# newline='' writes the "\n" line endings exactly as they appear in the corpus
# (no platform-specific translation); the encoding is stated explicitly so the
# file does not depend on the platform default.
# The with block closes the file on exit, so no explicit close() is needed.
with open('data/corpus.txt', 'w', newline='', encoding='utf-8') as corpus_file:
    corpus_file.write(corpus)