# Data Engineering
`Downloading and creating the dataset for the LyricsGenerator task.`

In [10]:
# installation: 
! pip install lyricsgenius
! pip install pandas
! pip install regex



In [11]:
# imports and setup
import lyricsgenius
import os
import pandas as pd
import regex as re


In [36]:
# file paths
# all paths are relative to `root`; they are built by plain string concatenation,
# so every directory path carries its own trailing '/'
root = './'
kaggleDataset = root + 'datasets/kaggle/'
geniusDataset = root + 'datasets/genius/'
geniusTopRapIMDB = root + 'datasets/geniusTopRapIMDB/'
geniusTopIMDB = root + 'datasets/geniusTopIMDB/'
# text file that holds the Genius API access token
tokenPath = root + 'geniusToken.txt'


## GeniusLyrics-crawler

Using: 
- the Genius API from genius.com (https://docs.genius.com/#/getting-started-h1)
- the LyricsGenius-Framework (https://github.com/johnwmillr/LyricsGenius)
---

In [13]:
# Genius API Access Token
# get token at: https://genius.com/api-clients
# load token from file (the `with` block closes the file on its own)
with open(tokenPath, 'r') as file:
    # strip surrounding whitespace so a trailing newline in the file
    # does not become part of the token
    GENIUS_ACCESS_TOKEN = file.read().strip()

# never print the token itself: cell outputs are stored in the notebook file
# and would leak the credential to everyone the notebook is shared with
print('Genius token loaded:', bool(GENIUS_ACCESS_TOKEN))

[redacted: Genius API access token]


In [14]:
# create the Genius API client used by the following cells
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)

In [15]:
# helper that stores the lyrics of all songs of one artist
def writeSongToFile(artist):
    """Save the lyrics of every song in ``artist.songs``.

    For each song, ``save_lyrics`` is called with the file name
    ``Lyrics_<song.artist>_<song.title>``, the extension ``txt`` and
    verbose output enabled.

    Args:
        artist: object exposing a ``songs`` iterable; each song needs the
            attributes ``artist`` and ``title`` and a ``save_lyrics`` method.
    """
    for track in artist.songs:
        targetName = '_'.join(('Lyrics', track.artist, track.title))
        track.save_lyrics(filename=targetName, extension='txt', verbose=True)


In [16]:
# move all files starting with 'Lyrics_' from one directory to another
def moveLyricsFiles(fromPath, toPath):
    """Move every entry of `fromPath` whose name starts with ``Lyrics_`` into `toPath`.

    Args:
        fromPath: directory that is scanned (not recursively).
        toPath: destination directory, with or without a trailing separator;
            it is created if it does not exist yet.

    Raises:
        OSError: if listing `fromPath` or moving an entry fails.
    """
    if toPath:
        os.makedirs(toPath, exist_ok=True)
    for filename in os.listdir(fromPath):
        if filename.startswith('Lyrics_'):
            # resolve the source relative to fromPath; a bare file name would be
            # looked up in the current working directory instead
            os.rename(os.path.join(fromPath, filename), os.path.join(toPath, filename))


### test lyricsgenius framework

In [18]:
# test connection with Coldplay as example
artist = genius.search_artist("Coldplay", max_songs=2, sort="popularity")
print(artist.songs)

Searching for songs by Coldplay...

Song 1: "Viva la Vida"
Song 2: "The Scientist"

Reached user-specified song limit (2).
Done. Found 2 songs.
[Song(id, artist, ...), Song(id, artist, ...)]


In [19]:
# print lyrics of first song from example
print(artist.songs[0].lyrics)

Translationsहिन्दीTürkçeEspañolPortuguêsDeutschFrançaisViva la Vida Lyrics[Verse 1: Chris Martin]
I used to rule the world
Seas would rise when I gave the word
Now in the morning, I sleep alone
Sweep the streets I used to own

[Interlude]

[Verse 2: Chris Martin]
I used to roll the dice
Feel the fear in my enemies' eyes
Listen as the crowd would sing
"Now the old king is dead, long live the king"
One minute, I held the key
Next, the walls were closed on me
And I discovered that my castles stand
Upon pillars of salt and pillars of sand
[Chorus: Chris Martin]
I hear Jerusalem bells a-ringin'
Roman cavalry choirs are singin'
Be my mirror, my sword and shield
My missionaries in a foreign field
For some reason I can't explain
Once you'd gone, there was never
Never an honest word
And that was when I ruled the world

[Interlude]

[Verse 3: Chris Martin]
It was the wicked and wild wind
Blew down the doors to let me in
Shattered windows and the sound of drums
People couldn't believe what I'd be

In [20]:
# test writing lyrics to file function for example
#writeSongToFile(artist)

In [21]:
# test moving lyrics files to dataset folder for example
#moveLyricsFiles(root, geniusDataset)

### get the top 100 artists
using:
- https://www.kaggle.com/datasets/sashankpillai/spotify-top-200-charts-20202021
- Top 49 20th Century: https://www.imdb.com/list/ls058480497/
- Top 100 AllTime: https://www.imdb.com/list/ls064818015/
- Top Rapper: https://www.imdb.com/list/ls054191097/

In [22]:
# load csv to dataframe
# df = pd.read_csv(root + 'datasets/spotify_dataset.csv')

# list of top musicians from IMDB
df_top49 = pd.read_csv(root + 'datasets/Top100GreatestMusicArtistsofAll-TimeIMDB.csv')
df_top20s = pd.read_csv(root + 'datasets/Top100GreatestMusiciansSincethe20thCenturyIMDB.csv')
df_topRap = pd.read_csv(root + 'datasets/Top100rappersIMDB.csv')

In [23]:
# sort the dataframes alphabetically by artist name
df_top49 = df_top49.sort_values(by=['Name'], ascending=True)
df_top20s = df_top20s.sort_values(by=['Name'], ascending=True)
df_topRap = df_topRap.sort_values(by=['Name'], ascending=True)
print(len(df_top49))
print(len(df_top20s))
print(len(df_topRap))

49
100
100


In [24]:
df_top20s.head()

Unnamed: 0,Position,Const,Created,Modified,Description,Name,Known For,Birth Date
62,63,nm0009540,2014-05-06,2014-05-06,,AC/DC,Iron Man,
54,55,nm0012613,2014-05-06,2014-05-06,,Aerosmith,Armageddon,
36,37,nm0004840,2014-05-06,2014-05-06,,Alice Cooper,Wayne's World,1948-02-04
97,98,nm1561881,2014-05-06,2014-05-06,,Amy Winehouse,Amy,1983-09-14
79,80,nm0031045,2014-05-06,2014-05-06,,Anthrax,Last Action Hero,


In [25]:
df_top49.head()

Unnamed: 0,Position,Const,Created,Modified,Description,Name,Known For,Birth Date
22,23,nm0009540,2017-04-13,2017-04-13,,AC/DC,Iron Man,
25,26,nm0291349,2017-04-13,2017-04-13,,Aretha Franklin,The Blues Brothers,1942-03-25
47,48,nm0461498,2017-05-10,2017-05-10,,Beyoncé,Dreamgirls,1981-09-04
6,7,nm0001168,2017-04-09,2017-04-09,,Bob Dylan,Renaldo and Clara,1941-05-24
9,10,nm0002490,2017-04-09,2017-04-09,,Bob Marley,I Am Legend,1945-02-06


In [26]:
df_topRap.head()

Unnamed: 0,Position,Const,Created,Modified,Description,Name,Known For,Birth Date
31,32,nm1265067,2014-08-07,2014-08-07,,50 Cent,Get Rich or Die Tryin',1975-07-06
39,40,nm0190246,2014-08-08,2014-08-08,,AZ,Bulworth,
10,11,nm0071275,2014-08-07,2014-08-07,,André 3000,Four Brothers,1975-05-27
54,55,nm0188025,2014-08-08,2014-08-08,,Anthony 'Treach' Criss,Feast,1970-12-02
46,47,nm0797421,2014-08-08,2014-08-08,,Beanie Sigel,State Property,1974-03-06


In [27]:
# get all Names from df_top49 and df_top20s in a list
names_top49 = df_top49['Name'].tolist()
names_top20s = df_top20s['Name'].tolist()

# merge both lists (names_top49 first, then names_top20s)
topArtistsList = names_top49 + names_top20s
# the rapper names are kept in their own list
topRapper = df_topRap['Name'].tolist()

# remove duplicates; dict.fromkeys keeps the first occurrence of each name, in order
topArtistsList = list(dict.fromkeys(topArtistsList))

In [28]:

# names already covered by topArtistsList, compared without surrounding whitespace
knownArtists = {x.strip() for x in topArtistsList if isinstance(x, str)}

# normalise first: keep only real strings (pandas represents missing CSV cells
# as float NaN, which can neither be stripped nor sorted with strings) and
# remove whitespaces from start and end of strings
topRapper = [x.strip() for x in topRapper if isinstance(x, str)]

# delete empty names and all entries from topRapper that are also in topArtistsList
topRapper = [x for x in topRapper if x and x not in knownArtists]

topRapper.sort()

print(topRapper)

['50 Cent', 'AZ', 'André 3000', "Anthony 'Treach' Criss", 'Beanie Sigel', 'Big Boi', 'Big Daddy Kane', 'Big Pun', 'Biz Markie', 'Brother Ali', 'Bun B', 'Busta Rhymes', "Cam'ron", 'Canibus', 'Chuck D', 'Common', 'DMX', 'Donald Glover', 'Dr. Dre', 'Drake', 'E-40', 'Eazy-E', 'Eight Ball', 'Fabolous', 'Fat Joe', 'Future', 'Game', 'Ghostface Killah', 'Gift of Gab', 'Gucci Mane', 'Guru', 'Ice Cube', 'Ice-T', 'Immortal Technique', 'J. Cole', 'Jadakiss', 'Jeezy', 'Joe Budden', 'KRS-One', 'Kendrick Lamar', 'Killer Mike', 'Kool G Rap', 'Kool Keith', 'Kool Moe Dee', 'Kurupt', 'LL Cool J', 'Lamont Coleman', 'Lauryn Hill', "Lil' Kim", "Lil' Wayne", 'Lloyd Banks', 'Ludacris', 'Lupe Fiasco', 'Luther Campbell', 'MC Lyte', 'Macklemore', 'Mase', 'Masta Ace', 'Master P', 'Meek Mill', 'Method Man', 'Nelly', 'Nicki Minaj', 'Nipsey Hussle', "Ol' Dirty Bastard", 'One Be Lo', 'Pharoahe Monch', 'Pimp C', 'Posdnuos', 'Prodigy', 'Pusha T', 'Q-Tip', 'Queen Latifah', 'Raekwon', 'Rakim', 'Redman', 'Rick Ross', "Roy

In [29]:
print(topArtistsList)
print(len(topArtistsList))

['AC/DC', 'Aretha Franklin', 'Beyoncé', 'Bob Dylan', 'Bob Marley', 'Bruce Springsteen', 'Chuck Berry', 'David Bowie', 'Diana Ross', 'Eagles', 'Elton John', 'Elvis Presley', 'Eminem', "Guns N' Roses", 'James Brown', 'Jay-Z', 'Jimi Hendrix', 'John Lennon', 'Johnny Cash', 'Joni Mitchell', 'Led Zeppelin', 'Little Richard', 'Madonna', 'Marvin Gaye', 'Michael Jackson', 'Nas', 'Nirvana', 'Parliament Funkadelic', 'Paul McCartney', 'Pink Floyd', 'Prince', 'Public Enemy', 'Queen', 'Ray Charles', 'Stevie Wonder', 'The Beach Boys', 'The Beatles', 'The Clash', 'The Doors', 'The Kinks', 'The Notorious B.I.G.', 'The Rolling Stones', 'The Velvet Underground', 'The Who', 'Tina Turner', 'Tupac Shakur', 'U2', 'Whitney Houston', 'Ye', 'Aerosmith', 'Alice Cooper', 'Amy Winehouse', 'Anthrax', 'B.B. King', 'Beastie Boys', 'Billy Joel', 'Black Sabbath', 'Blondie', 'Bo Diddley', 'Buddy Holly', 'Creedence Clearwater Revival', 'Daft Punk', 'Def Leppard', 'Depeche Mode', 'Earth Wind & Fire', 'Elvis Costello', 'Eu

### generate genius-dataset
using:
- top 100 Artists from streaming numbers extracted from:
    - https://www.kaggle.com/datasets/sashankpillai/spotify-top-200-charts-20202021
- getting up to `NUM_SONGS` songs (currently 50) from every artist
    - getting it from genius.com with lyricsgenius
        - https://github.com/johnwmillr/LyricsGenius
    - sorted by popularity
- writing a txt file for every artist and saving it to 'datasets/genius/'

In [30]:
# constants
# maximum number of songs requested per artist (passed as max_songs to genius.search_artist)
NUM_SONGS = 50
# number of entries kept from the front of topArtistsList
NUM_ARTISTS = 108

In [31]:
topArtistsList = topArtistsList[:NUM_ARTISTS]
# print(topArtists)


In [32]:
# write top artists to file, one name per line;
# every ',' inside a name is turned into a line break as well
# (done in a single pass instead of re-opening and rewriting the file)
with open(root + 'datasets/topArtists.txt', 'w') as f:
    for item in topArtistsList:
        f.write(str(item).replace(',', '\n') + '\n')



In [33]:
# read top artists from file
with open(root + 'datasets/topArtists.txt', 'r') as f:
    topArtists = f.read().split('\n')

# remove whitespaces from start and end of strings first, so that
# whitespace-only lines become empty and padded names compare equal
topArtists = [x.strip() for x in topArtists]

# remove empty strings from list
topArtists = list(filter(None, topArtists))

# remove duplicates and sort list alphabetically
topArtists = sorted(dict.fromkeys(topArtists))

# print every artist from list
# (loop variable is not called `artist`, which an earlier cell uses for the
# Artist object returned by genius.search_artist)
for artistName in topArtists:
    print(artistName)

print(len(topArtists))



AC/DC
Aerosmith
Alice Cooper
Amy Winehouse
Anthrax
Aretha Franklin
B.B. King
Beastie Boys
Beyoncé
Billy Joel
Black Sabbath
Blondie
Bo Diddley
Bob Dylan
Bob Marley
Bruce Springsteen
Buddy Holly
Chuck Berry
Creedence Clearwater Revival
Daft Punk
David Bowie
Def Leppard
Depeche Mode
Diana Ross
Eagles
Earth Wind & Fire
Elton John
Elvis Costello
Elvis Presley
Eminem
Eurythmics
Fleetwood Mac
Foo Fighters
Frank Sinatra
Genesis
Guns N' Roses
Hall & Oates
Hank Williams
Iron Maiden
James Brown
Janis Joplin
Jay-Z
Jerry Lee Lewis
Jimi Hendrix
John Lennon
Johnny Cash
Joni Mitchell
Judas Priest
Justin Timberlake
KISS
Kraftwerk
Led Zeppelin
Little Richard
Lynyrd Skynyrd
Madonna
Marvin Gaye
Megadeth
Metallica
Michael Jackson
NWA
Nas
Neil Young
Nirvana
Otis Redding
Outkast
Parliament Funkadelic
Paul McCartney
Pearl Jam
Pink Floyd
Prince
Public Enemy
Queen
R.E.M.
Radiohead
Ray Charles
Rod Stewart
Run-D.M.C.
Sam Cooke
Santana
Sex Pistols
Simon & Garfunkel
Slayer
Smokey Robinson & The Miracles
Stevie Wond

In [35]:
# get lyrics for every name in topRapper
for person in topRapper:
    print(person)
    # reset per artist so a failure never reports the previous artist's result
    artistFile = None
    try:
        artistFile = genius.search_artist(person, max_songs=NUM_SONGS, sort="popularity")
        # NOTE(review): search_artist appears to return None when nothing matches;
        # confirm against the installed lyricsgenius version
        foundSongs = artistFile.songs if artistFile is not None else []
        # if songs found, write to file
        if foundSongs:
            writeSongToFile(artistFile)
        else:
            print('No songs found for ' + person)
    except Exception as err:
        # best effort: report the problem and continue with the next name.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print('Error with:', person, 'Got:', repr(err))

50 Cent
Searching for songs by 50 Cent...

Song 1: "In da Club"
Song 2: "21 Questions"
Song 3: "Many Men (Wish Death)"
Song 4: "P.I.M.P."
Song 5: "Patiently Waiting"
Song 6: "My Life"
Song 7: "Candy Shop"
Song 8: "I’m the Man"
Song 9: "We Up"
Song 10: "Best Friend"
Song 11: "Wanksta"
Song 12: "Ghetto Qu’ran (Forgive Me)"
Song 13: "Hustler’s Ambition"
Song 14: "Big Rich Town"
Song 15: "Window Shopper"
Song 16: "I’m the Man (Remix)"
Song 17: "Heat"
Song 18: "Back Down"
Song 19: "If I Can’t"
Song 20: "What Up Gangsta"
Song 21: "Just a Lil’ Bit"
Song 22: "How to Rob"
Song 23: "Life’s on the Line (Ja Rule Diss)"
Song 24: "When it Rains it Pours"
Song 25: "Don’t Push Me"
Song 26: "High All the Time"
Song 27: "Ayo Technology"
Song 28: "No Romeo No Juliet"
Song 29: "U Not Like Me"
Song 30: "I’ll Whip Ya Head Boy"
Song 31: "P.I.M.P (G-Unit Remix)"
Song 32: "Piggy Bank"
Song 33: "I Get Money"
Song 34: "Major Distribution"
Song 35: "Outta Control (Remix)"
Song 36: "Too Rich for the Bitch"
Song 37

In [37]:
# move lyrics files to geniusTopRapIMDB
moveLyricsFiles(root, geniusTopRapIMDB)

In [38]:
# get number of files in geniusDataset
print('Genius:', len(os.listdir(geniusDataset)))
print('GeniusTopRap:', len(os.listdir(geniusTopRapIMDB)))
print('GeniusTop:',len(os.listdir(geniusTopIMDB)))


Genius: 0
GeniusTopRap: 724
GeniusTop: 801


### Cleaning Dataset

what to delete:
- all non-English artists
- `워`
- `Translations`... to `\n`
- all non english characters
- `()` --> Replace with whitespace
- all between `[]`
- number+`Embed`... to `\n`
- `like2Embed` to `\n`

--> delete all non-English files manually

In [114]:
# functions that remove Genius page artefacts and non-English characters
# from all txt files of a directory
def _cleanLyricsText(text):
    """Return `text` without Genius page artefacts and non-alphanumeric characters."""
    # make sure the last line is terminated, the patterns below rely on '\n'
    text = text + '\n'
    text = text.replace('워', '')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    # delete all between []
    text = re.sub(r'\[.*?\]', '', text)
    # 'You might also like' is glued into the lyrics; turn it into a line break
    # before the blank-line cleanup so it cannot leave empty lines behind
    text = text.replace('You might also like', '\n')
    # delete all from number + 'Embed' to the end of the line, keeping the line break
    text = re.sub(r'\d*Embed.*?\n', '\n', text)
    # delete all from 'Translations' to \n
    # (the former pattern 'Translations*?\n' only matched when the newline
    # followed directly, so the header line was never removed)
    text = re.sub(r'Translations.*?\n', '', text)
    # delete all chinese characters
    text = re.sub(r'[\u4e00-\u9fff]+', '', text)
    # delete all non english characters (this also removes punctuation)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # delete all empty lines
    text = re.sub(r'(?m)^\s*?\n', '', text)
    # delete all whitespace in front of a line
    text = re.sub(r'(?m)^\s+', '', text)
    return text


def deleteAllFromFiles(path):
    """Clean every ``.txt`` file directly inside `path` in place.

    Each file is read, passed through `_cleanLyricsText` and overwritten with
    the result. Other files are left untouched.

    Args:
        path: directory containing the lyrics files, with or without a
            trailing separator.

    Raises:
        OSError: if the directory or one of its files cannot be read or written.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    for filename in os.listdir(path):
        if not filename.endswith('.txt'):
            continue
        filePath = os.path.join(path, filename)
        # the lyrics contain non-ASCII characters, so do not depend on the
        # platform default encoding
        # NOTE(review): assumes the files were saved as UTF-8 - confirm for the
        # lyricsgenius version that produced them
        with open(filePath, 'r', encoding='utf-8') as f:
            text = f.read()

        with open(filePath, 'w', encoding='utf-8') as f:
            f.write(_cleanLyricsText(text))


In [115]:
# test deleteAllFromFiles-function
deleteAllFromFiles(root + 'datasets/geniusClean/')

---
