## Download data

Data from: https://www.kaggle.com/gyani95/380000-lyrics-from-metrolyrics/version/2

In [None]:
import os
from zipfile import ZipFile

import requests
# Local file the archive is streamed into.
zip_file = "download/lyrics.zip"
# Remote location of the zipped lyrics data set (presumably a mirror of the
# Kaggle data set linked above -- TODO confirm).
download_link = "https://lyrics-generator1.s3.eu-central-1.amazonaws.com/380000-lyrics-from-metrolyrics.zip"

def download_url(url, save_path, chunk_size=128):
    """Stream the resource at ``url`` into the file ``save_path``.

    Args:
        url: Address fetched with an HTTP GET request.
        save_path: File to write. It is opened in binary write mode, so any
            existing content is replaced.
        chunk_size: Number of bytes requested per iteration over the
            response body.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check runs before the file is opened, so an error page is never
            stored in place of the expected download.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

# open() does not create missing parent directories, so make sure the target
# directory exists before streaming the archive into it.
os.makedirs(os.path.dirname(zip_file) or ".", exist_ok=True)
download_url(download_link, zip_file)

# Unpack every member of the downloaded archive into the download directory.
with ZipFile(zip_file, 'r') as zipObj:
    zipObj.extractall('download/')

# Delete the zip file after extracting the data.
os.remove(zip_file)

### Imports and paths

In [18]:
import pandas as pd
import os
import string
download_path = "download/"  # directory the lyrics CSV is read from
lyrics_csv = os.path.join(
    download_path,
    "lyrics.csv",
)

In [19]:
# Load the complete lyrics table; the trailing expression previews its first rows.
data_frame = pd.read_csv(
    lyrics_csv,
    sep=",",
    encoding="utf8",
)
data_frame.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [20]:
# Share of songs per genre, most common first (at most 20 genres shown).
data_frame.genre.value_counts(normalize=True).head(20)

Rock             0.362682
Pop              0.136496
Hip-Hop          0.093765
Not Available    0.082305
Metal            0.078424
Other            0.065380
Country          0.047720
Jazz             0.047336
Electronic       0.044736
R&B              0.016384
Indie            0.015824
Folk             0.008947
Name: genre, dtype: float64

In [21]:
# Number of songs per artist, most prolific first (at most 50 artists shown).
data_frame.artist.value_counts().head(50)

dolly-parton                755
american-idol               700
elton-john                  680
b-b-king                    667
chris-brown                 655
eddy-arnold                 628
barbra-streisand            624
ella-fitzgerald             623
bob-dylan                   614
david-bowie                 599
bee-gees                    599
dean-martin                 593
eminem                      589
celine-dion                 551
frank-zappa                 550
bruce-springsteen           534
elvis-costello              534
eric-clapton                522
beach-boys                  520
bill-anderson               473
chicago                     461
frank-sinatra               435
beatles                     429
chamillionaire              428
50-cent                     425
chumbawamba                 423
britney-spears              422
diana-ross                  420
emmylou-harris              419
cedarmont-kids              417
bon-jovi                    407
fall    

### Filter data frame

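We keep only songs from the three most frequent genres and, within each of these genres, from its ten most prolific artists. Songs must also have been released after 1970 and no later than 2020.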

In [22]:
# Number of most frequent genres to keep.
top_genres = 3
genre_frequencies = data_frame["genre"].value_counts()
genres = genre_frequencies.index[:top_genres].tolist()
print(genres, len(genres))

# For each kept genre, collect the artists with the most songs in that genre.
top_artists_per_genre = 10
artists = []
for genre in genres:
    artists_in_genre = data_frame.loc[data_frame.genre == genre, "artist"]
    artists += artists_in_genre.value_counts().index[:top_artists_per_genre].tolist()
artists, len(artists)

['Rock', 'Pop', 'Hip-Hop'] 3


(['elton-john',
  'b-b-king',
  'bob-dylan',
  'david-bowie',
  'frank-zappa',
  'elvis-costello',
  'bruce-springsteen',
  'eric-clapton',
  'beach-boys',
  'chicago',
  'american-idol',
  'barbra-streisand',
  'bee-gees',
  'celine-dion',
  'britney-spears',
  'diana-ross',
  'disney',
  'gary-numan',
  'akon',
  'dusty-springfield',
  'chris-brown',
  'eminem',
  'chamillionaire',
  '50-cent',
  'drake',
  '2pac',
  'game',
  'e-40',
  'chief-keef',
  'busta-rhymes'],
 30)

In [23]:
# Drop the "index" column if it is present; the guard keeps the cell re-runnable.
if "index" in data_frame:
    del data_frame["index"]

# filter genre and artist
data_frame = data_frame[data_frame.genre.isin(genres) & data_frame.artist.isin(artists)]

# filter year (start_year itself is excluded, end_year is included)
start_year = 1970
end_year = 2020
mask = (data_frame['year'] > start_year) & (data_frame['year'] <= end_year)
data_frame = data_frame.loc[mask]

# filter nan lyrics, artists and genres; .copy() detaches the result from the
# unfiltered frame so later column assignments act on an independent frame
data_frame = data_frame.dropna(subset=["lyrics", "genre", "artist"]).copy()

# info() prints its summary and returns None, so call it on its own line and
# let the cell end with the preview instead of a (None, text) tuple.
data_frame.info()
data_frame.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13197 entries, 8636 to 354460
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   song    13197 non-null  object
 1   year    13197 non-null  int64 
 2   artist  13197 non-null  object
 3   genre   13197 non-null  object
 4   lyrics  13197 non-null  object
dtypes: int64(1), object(4)
memory usage: 618.6+ KB


(None,
                             song  year   artist    genre  \
 8636              the-invitation  2009  50-cent  Hip-Hop   
 8637  better-come-on-your-a-game  2009  50-cent  Hip-Hop   
 8638               longon-girl-2  2009  50-cent  Hip-Hop   
 8639                   hard-rock  2009  50-cent  Hip-Hop   
 8640               i-line-niggas  2009  50-cent  Hip-Hop   
 
                                                  lyrics  
 8636  I had five hundred grams in fifty-fifty-eight ...  
 8637  [Chorus: 50 Cent]\nNigga try and come play me\...  
 8638  {DAMN!}\n[Intro: {DJ Whoo KidDJ Whoo Kid}]\nUh...  
 8639  [Ester Dean]\nHere we go ready hard rock\nHere...  
 8640  [Intro]\nI'm tryin to tell you but you won't h...  )

### Prepare Data

Apply the following transformations:
 - replace hyphens in the song title with spaces
 - replace hyphens in the artist name with spaces
 - split the song lyrics into chunks of 32 characters

In [24]:
# regex=False makes the hyphen a literal on every pandas version (the default
# of `regex` differs between releases). assign() builds a new frame instead of
# writing into a filtered slice.
data_frame = data_frame.assign(
    song=data_frame["song"].str.replace("-", " ", regex=False),
    artist=data_frame["artist"].str.replace("-", " ", regex=False),
)
list(data_frame["artist"].unique())[:100]

['50 cent',
 'game',
 'e 40',
 'eric clapton',
 'bruce springsteen',
 'david bowie',
 'chris brown',
 '2pac',
 'britney spears',
 'disney',
 'bee gees',
 'bob dylan',
 'chief keef',
 'b b king',
 'celine dion',
 'drake',
 'dusty springfield',
 'gary numan',
 'elvis costello',
 'chicago',
 'chamillionaire',
 'barbra streisand',
 'american idol',
 'akon',
 'eminem',
 'beach boys',
 'elton john',
 'frank zappa',
 'busta rhymes',
 'diana ross']

In [32]:
# An empty frame with the source columns: writing it creates (or truncates)
# the output CSV so that it holds only the header row.
new_data_frame = pd.DataFrame(columns=data_frame.columns)
save_path = os.path.join("data/", "preprocessed_lyrics.csv")
# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
new_data_frame.to_csv(save_path, mode="w+", encoding="utf8", sep=",", index=False)

# define string filter
def filter_characters(lyrics_text):
    """Return ``lyrics_text`` with every character outside the allow-list removed.

    Whitespace, ASCII letters, digits and the punctuation listed below are
    kept; all other characters are dropped. The order of the remaining
    characters is unchanged.
    """
    permitted = frozenset(
        string.whitespace + string.ascii_letters + string.digits + "\"$%&'()*,-:;\n\r"
    )
    kept = [symbol for symbol in lyrics_text if symbol in permitted]
    return ''.join(kept)

# Append to the CSV after every `save_interval` songs so an interrupted run
# keeps most of its work.
save_interval = 10
chunk_size = 32

flushed_batches = []  # every batch already written to disk
pending_rows = []     # chunk records collected since the last flush
song_total = len(data_frame)

# Count songs by position: the frame's index labels are sparse after
# filtering and cannot be used to decide when to flush.
for song_number, (_, row) in enumerate(data_frame.iterrows(), start=1):
    # Filter first, then add the markers: "<" and ">" are not in the
    # allow-list, so filtering afterwards would strip them from the markers.
    lyrics_text = filter_characters(row["lyrics"])
    lyrics_text = '<start> ' + lyrics_text + ' <end>'

    for offset in range(0, len(lyrics_text), chunk_size):
        pending_rows.append({
            "song": row["song"],
            "year": row["year"],
            "artist": row["artist"],
            "genre": row["genre"],
            "lyrics": lyrics_text[offset:offset + chunk_size],
        })

    # Flush on every interval and once more after the last song, so the final
    # partial batch reaches the file as well.
    if pending_rows and (song_number % save_interval == 0 or song_number == song_total):
        batch = pd.DataFrame(pending_rows, columns=new_data_frame.columns)
        batch.to_csv(save_path, mode="a", encoding="utf8", sep=",", index=False, header=False)
        flushed_batches.append(batch)
        pending_rows = []

# Keep the complete chunked data set available to the following cells.
if flushed_batches:
    new_data_frame = pd.concat(flushed_batches, ignore_index=True)


### Get number of artists and genres and characters

In [None]:
# Read the chunks back from disk. The preprocessing loop empties
# `new_data_frame` after each write, so the in-memory frame may hold only the
# rows collected since the last flush.
# keep_default_na=False keeps chunks such as "NA" or "nan" as text.
preprocessed_frame = pd.read_csv(
    save_path,
    encoding="utf8",
    sep=",",
    dtype={"lyrics": str},
    keep_default_na=False,
)

genre_count = preprocessed_frame["genre"].nunique()
artist_count = preprocessed_frame["artist"].nunique()

# Set of distinct characters that occur in any lyrics chunk.
vocab_count = set()
for chunk in preprocessed_frame["lyrics"]:
    if isinstance(chunk, str):
        vocab_count.update(chunk)
genre_count, artist_count, len(vocab_count)