## Download data

Data from: https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics/version/2

In [None]:
import requests
from zipfile import ZipFile
download_link = "https://lyrics-generator1.s3.eu-central-1.amazonaws.com/380000-lyrics-from-metrolyrics.zip"
zip_file = "data/lyrics.zip"

def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

download_url(download_link, zip_file)

# Create a ZipFile Object and load sample.zip in it
with ZipFile(zip_file, 'r') as zipObj:
   # Extract all the contents of zip file in different directory
   zipObj.extractall('data/')
    
# delete zip file after extracting the data
os.remove(zip_file)

### Imports and paths

In [1]:
import pandas as pd
import os
from tqdm import tqdm

lyrics_csv = os.path.join("data/", "lyrics.csv")

In [2]:
data_frame = pd.read_csv(lyrics_csv, encoding="utf8", sep=",")
data_frame.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [3]:
data_frame["genre"].unique()

array(['Pop', 'Hip-Hop', 'Not Available', 'Other', 'Rock', 'Metal',
       'Country', 'Jazz', 'Electronic', 'Folk', 'R&B', 'Indie'],
      dtype=object)

In [4]:
data_frame["artist"].unique()[:100]

array(['beyonce-knowles', 'eazy-e', 'asher-monroe', 'dick-mace',
       'codigo-fn', 'bossacucanova', 'divine-destiny', 'antwon',
       'alice-on-the-roof', 'daliah-lavi', 'doug-keith', 'borialis',
       'demonaz', 'banda-carnaval', 'dave-sterling',
       'achim-seifert-project', 'brightwood', 'cenk-r-lr-etin', 'destroy',
       'gene-watson', 'bobby-charles', 'a-dream-too-late', 'farben-lehre',
       'chloe-alesha', 'deichkind', 'fang', 'christi-warner',
       '65daysofstatic', 'anna-waronker', 'the-books', 'the-contents-are',
       'alathea', 'charly-rodriguez', 'fahrenheit-43', 'graveyard',
       'chant', 'dorival-caymmi', 'droop-e', 'arcade-fire',
       'atsumi-saori', 'funkadelic', 'arthur-big-boy-crudup',
       'boy-wonder', 'chuckie', 'brian-simpson', 'audio-b5',
       'carl-broemel', 'found', 'asia-nitollano', 'doug-hream-blunt',
       'children-18-3', 'arsha', 'evan-taubenfeld', 'exit-this-side',
       'brandi-carlile', 'echoterra', 'cfcf', 'detroit-marcella', 'b3'

### Filter data frame

We want to only look at songs from the pop genre, with dates between 1990 - 2020.

In [5]:
if "index" in data_frame:
    del data_frame["index"]

# filter genre
genres = ['Pop', 'Rock']
data_frame = data_frame[data_frame.genre.isin(genres)]

# filter artist
artists = ['elton-john', 'beatles']
data_frame = data_frame[data_frame.artist.isin(artists)]

# filter year
start_year = 1970
end_year = 2020
mask = (data_frame['year'] > start_year) & (data_frame['year'] <= end_year)
data_frame = data_frame.loc[mask]

# filter nan lyrics, artists and genres
data_frame = data_frame[data_frame.lyrics.notna()]
data_frame = data_frame[data_frame.genre.notna()]
data_frame = data_frame[data_frame.artist.notna()]

data_frame.info(), data_frame.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1092 entries, 149250 to 329172
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   song    1092 non-null   object
 1   year    1092 non-null   int64 
 2   artist  1092 non-null   object
 3   genre   1092 non-null   object
 4   lyrics  1092 non-null   object
dtypes: int64(1), object(4)
memory usage: 51.2+ KB


(None,
                        song  year   artist genre  \
 149250             michelle  2009  beatles  Rock   
 149251    what-goes-on-girl  2009  beatles  Rock   
 149252    run-for-your-life  2009  beatles  Rock   
 149253  if-i-needed-someone  2009  beatles  Rock   
 149254     you-won-t-see-me  2009  beatles  Rock   
 
                                                    lyrics  
 149250  Michelle, ma belle\nThese are words that go to...  
 149251  I've got a word or two\nTo say about the thing...  
 149252  Well I'd rather see you dead, little girl\nTha...  
 149253  If I needed someone to love\nYou're the one th...  
 149254  When I call you up your line's engaged.\nI hav...  )

### Prepare Data

Do following transformations:
 - replace hypens in song with spaces
 - replace hypens in artist with spaces
 - song texts in chunks of 32 characters

In [6]:
data_frame["song"] = data_frame["song"].str.replace("-", " ")
data_frame["artist"] = data_frame["artist"].str.replace("-", " ")
list(data_frame["artist"].unique())[:100]

['beatles', 'elton john']

In [7]:
new_data_frame = pd.DataFrame(columns=data_frame.columns)
print(new_data_frame.info())
chunk_size = 64
for i, row in data_frame.iterrows():
    lyrics_text = row["lyrics"]
    chunks = [lyrics_text[i:i+chunk_size] for i in range(0, len(lyrics_text), chunk_size)]
    
    for chunk in chunks:
        new_data_frame = new_data_frame.append({"song": row["song"], "year": row["year"], "artist": row["artist"], "genre": row["genre"], "lyrics": chunk}, ignore_index=True)

data_frame = new_data_frame
data_frame.head()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   song    0 non-null      object
 1   year    0 non-null      object
 2   artist  0 non-null      object
 3   genre   0 non-null      object
 4   lyrics  0 non-null      object
dtypes: object(5)
memory usage: 0.0+ bytes
None


Unnamed: 0,song,year,artist,genre,lyrics
0,michelle,2009,beatles,Rock,"Michelle, ma belle\nThese are words that go to..."
1,michelle,2009,beatles,Rock,"elle\nMichelle, ma belle\nSont les mots qui vo..."
2,michelle,2009,beatles,Rock,"e\nTres bien ensemble\nI love you, I love you,..."
3,michelle,2009,beatles,Rock,ll I want to say\nUntil I find a way\nI will s...
4,michelle,2009,beatles,Rock,"know that you'll understand\nMichelle, ma bell..."


### Get number of artists and genres and characters

In [8]:
genre_count = len(data_frame["genre"].unique())
artist_count = len(data_frame["artist"].unique())

vocab_count = set()
for row in data_frame["lyrics"]:
    if type(row) == str:
        for char in row:
            vocab_count.add(char)
genre_count, artist_count, len(vocab_count)

(1, 2, 96)

In [9]:
artists = list(data_frame["artist"].unique())
artists.sort(reverse=False)
artist_string = " - ".join([artist.replace("-"," ") + os.linesep for artist in artists])
with open("data/artists.txt", "w+", encoding="utf8", newline='') as txt_file:
    txt_file.write(artist_string)

### Save data

In [10]:
save_path = os.path.join("data/", "preprocessed_lyrics.csv")
data_frame.to_csv(save_path, encoding="utf8", sep=",", index=False)