# Data Engineering
`Donwloading and creating the dataset for the LyricsGenerator task.`

In [2]:
# installation: 
! pip install lyricsgenius
! pip install pandas
! pip install regex



In [3]:
# imports and setup
import lyricsgenius
import os
import pandas as pd
import regex as re


In [4]:
# file paths
root = './'
kaggleDataset = root + 'datasets/kaggle/'
geniusDataset = root + 'datasets/genius/'

## GeniusLyrics-crawler

Using: 
- the Genius API from genius.com (https://docs.genius.com/#/getting-started-h1)
- the LyricsGenius-Framework (https://github.com/johnwmillr/LyricsGenius)
---

In [5]:
# Genius API Access Token
GENIUS_ACCESS_TOKEN = 'NyTFYvAmDH69V0YgfeTOnTHfMZ69KwRmyQMWWDgLn4X1SvOujMM18Pe0bssgx3FR' # Your Genius access token

### test lyricsgenius framework

In [6]:
# test connection
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)

artist = genius.search_artist("Ed Sheeran", max_songs=2, sort="popularity")
print(artist.songs)

Searching for songs by Ed Sheeran...

Song 1: "Shape of You"
Song 2: "Perfect"

Reached user-specified song limit (2).
Done. Found 2 songs.
[Song(id, artist, ...), Song(id, artist, ...)]


In [7]:
print(artist.songs[0].lyrics)

TranslationsTürkçeEspañolPortuguêsΕλληνικάDeutschFrançaisShape of You Lyrics[Verse 1]
The club isn't the best place to find a lover
So the bar is where I go
Me and my friends at the table doing shots
Drinking fast and then we talk slow
And you come over and start up a conversation with just me
And trust me I'll give it a chance now
Take my hand, stop, put Van the Man on the jukebox
And then we start to dance, and now I'm singing like

[Pre-Chorus]
Girl, you know I want your love
Your love was handmade for somebody like me
Come on now, follow my lead
I may be crazy, don't mind me
Say, boy, let's not talk too much
Grab on my waist and put that body on me
Come on now, follow my lead
Come, come on now, follow my lead
[Chorus]
I'm in love with the shape of you
We push and pull like a magnet do
Although my heart is falling too
I'm in love with your body
And last night you were in my room
And now my bed sheets smell like you
Every day discovering something brand new
I'm in love with your body

In [8]:
# save lyrics to path
artist.save_lyrics(extension='txt', verbose=True)

Wrote Lyrics_EdSheeran.txt.


In [9]:
# move all files starting with 'Lyrics_' to '/datasets/genius'
def moveLyricsFiles(fromPath, toPath):
    for filename in os.listdir(fromPath):
        if filename.startswith('Lyrics_'):
            os.rename(filename, toPath + filename)

# moveLyricsFiles(root, geniusDataset)

### get the top 100 artists
using:
- https://www.kaggle.com/datasets/sashankpillai/spotify-top-200-charts-20202021

In [10]:
# load csv to dataframe
df = pd.read_csv(root + 'datasets/spotify_dataset.csv')


In [11]:
# sort the dataframe by Streams
df = df.sort_values(by=['Streams'], ascending=True)
df.head()

Unnamed: 0,Index,Highest Charting Position,Number of Times Charted,Week of Highest Charting,Song Name,Streams,Artist,Artist Followers,Song ID,Genre,...,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord
1380,1381,54,1,2020-02-21--2020-02-28,Louder than bombs,10016907,BTS,37151476,3op7HNwLli54MBjFGzIlZO,"['k-pop', 'k-pop boy group']",...,0.643,0.667,-6.993,0.152,0.0433,0.0904,145.999,217404,0.485,C
479,480,60,1,2021-03-19--2021-03-26,Love You Different (feat. BEAM),10024729,Justin Bieber,48544923,27UcQ6dAvQrgH9C880rCM3,"['canadian pop', 'pop', 'post-teen pop']",...,0.759,0.584,-6.655,0.06,0.0652,0.16,127.138,186696,0.374,G#/Ab
48,49,29,6,2021-06-25--2021-07-02,Fiel - Remix,10032746,"Wisin, Jhay Cortez, Anuel AA, Los Legendarios,...",6929075,43qcs9NpJhDxtG91zxFkj7,"['latin', 'latin hip hop', 'reggaeton', 'trap ...",...,0.839,0.711,-4.733,0.0473,0.398,0.118,97.99,349547,0.573,F#/Gb
1278,1279,51,1,2020-03-20--2020-03-27,Repeat After Me (Interlude),10058303,The Weeknd,35340069,40U8d12pC5UHqmHwXjHjjl,"['canadian contemporary r&b', 'canadian pop', ...",...,0.555,0.619,-7.918,0.0314,0.0323,0.0917,95.032,195813,0.141,C
1487,1488,62,1,2020-01-17--2020-01-24,I Can See,10086930,Mac Miller,6189454,3R8CyhJfVjvgIROd5RSGhQ,"['hip hop', 'pittsburgh rap', 'rap']",...,0.699,0.503,-9.922,0.0882,0.391,0.0996,114.86,220853,0.0546,F


In [12]:
# get list of first 20 artists from df
topArtists = df['Artist'].head(500).tolist()
# topArtists.sort()
# delete duplicates
topArtists = list(dict.fromkeys(topArtists))
print(len(topArtists))
# print(topArtists)


311


### generate genius-dataset
using:
- top 100 Artists from streaming numbers extracted from:
    - https://www.kaggle.com/datasets/sashankpillai/spotify-top-200-charts-20202021
- getting 20 songs from every artist
    - getting it from genius.com with lyricsgenius
        - https://github.com/johnwmillr/LyricsGenius
    - sortet after popularity
- writing a txt file for every artist and saving it to 'datasets/genius/'

In [13]:
# constants
NUM_SONGS = 20
NUM_ARTISTS = 100

In [14]:
topArtists = topArtists[:NUM_ARTISTS]
# print(topArtists)


In [15]:
# write top artists to file
with open(root + 'datasets/topArtists.txt', 'w') as f:
    for item in topArtists:
        f.write("%s\n" % item)

# insert new line for every ',' in file topArtists.txt
with open(root + 'datasets/topArtists.txt', 'r') as f:
    lines = f.readlines()
    for i in range(len(lines)):
        lines[i] = lines[i].replace(',', '\n')
        lines[i] = lines[i].replace('&', '\n')
    with open(root + 'datasets/topArtists.txt', 'w') as f:
        f.writelines(lines)
f.close()



In [16]:
# read top artists from file
with open(root + 'datasets/topArtists.txt', 'r') as f:
    topArtists = f.read().split('\n')
    
print(topArtists)

['BTS', 'Justin Bieber', 'Wisin', ' Jhay Cortez', ' Anuel AA', ' Los Legendarios', ' Myke Towers', 'The Weeknd', 'Mac Miller', 'Tiago PZK', ' LIT killah', '24kGoldn', 'Galantis', 'Lady Gaga', 'Lil Uzi Vert', 'Coldplay', 'Band Aid', 'The Ronettes', 'Queen', 'Daryl Hall ', ' John Oates', 'Future', ' Drake', ' Young Thug', 'Kid Cudi', 'J. Cole', 'Bad Bunny', 'Tiësto', 'Frank Sinatra', 'Eminem', 'Dean Martin', 'Ariana Grande', 'Bad Bunny', ' ABRA', 'Taylor Swift', 'J Balvin', ' KAROL G', ' Nicky Jam', ' Crissin', ' Totoy El Frio', ' Natan ', ' Shander', 'Olivia Rodrigo', 'Internet Money', ' Gunna', ' Don Toliver', ' NAV', 'Tyler', ' The Creator', 'TINI', ' Maria Becerra', 'Bad Bunny', ' Jhay Cortez', 'Sech', ' Jhay Cortez', 'Doja Cat', ' The Weeknd', 'José Feliciano', 'Polo G', 'The Kid LAROI', 'Harry Styles', 'Kelly Clarkson', 'Sebastian Yatra', ' Myke Towers', 'Camila Cabello', 'Bruno Mars', ' Anderson .Paak', ' Silk Sonic', 'Masked Wolf', 'Riton', ' Nightcrawlers', 'Post Malone', 'Andy 

In [34]:
# get lyrics for top artists
for person in topArtists:
    try:
        artistFile = genius.search_artist(person, max_songs=NUM_SONGS, sort="popularity", verbose=True)
        artistFile.save_lyrics(extension='txt', verbose=True)
    except:
        print('Error with:', person,'Got:', artistFile)

Searching for songs by BTS...

Error with  BTS
Searching for songs by Justin Bieber...

Error with  Justin Bieber
Searching for songs by Wisin...

Error with  Wisin
Searching for songs by  Jhay Cortez...

Changing artist name to 'Bad Bunny & Jhay Cortez'
Song 1: "DÁKITI"
Song 2: "CÓMO SE SIENTE (Remix)"
Song 3: "Tarot"
Song 4: "DÁKITI (Remix)"
Done. Found 4 songs.
Wrote Lyrics_BadBunnyJhayCortez.txt.
Searching for songs by  Anuel AA...

Changing artist name to 'Anuel AA'
Error with   Anuel AA
Searching for songs by  Los Legendarios...

Changing artist name to 'Los Legendarios'


In [81]:
# move lyrics files to geniusDataset
moveLyricsFiles(root, geniusDataset)

In [82]:
# get number of files in geniusDataset
print(len(os.listdir(geniusDataset)))

49


### Cleaning Dataset

what to delete:
- all non englisch artists
- `워`
- `Translations`... to `\n`
- all non english characters
- `()` --> Replace with whitespace
- all between `[]`
- number+`Embed`... to `\n`
- `like2Embed` to `\n`

In [114]:
# define function to delete al 워 from a txt file from directory path
def deleteAllFromFiles(path):
    for filename in os.listdir(path):
        if filename.endswith('.txt'):
            with open(path + filename, 'r') as f:
                text = f.read()
                # insert new line at end of text
                text = text + '\n'
                text = text.replace('워', '')
                text = text.replace('(', ' ')
                text = text.replace(')', ' ')
                # delete all between []
                text = re.sub(r'\[.*?\]', '', text)
                # delete all number + 'Embed' to \n
                text = re.sub(r'\d*Embed.*?\n', '', text)
                # delete all from 'Translations' to \n
                text = re.sub(r'Translations*?\n', '', text)
                # delete all chinese characters
                text = re.sub(r'[\u4e00-\u9fff]+', '', text)
                # delete all non english characters
                text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
            
                text = text.replace('You might also likeEmbed', '')
                text = text.replace('Embed\n', '\n')
                text = text.replace('You might also like', '\n')

            with open(path + filename, 'w') as f:
                f.write(text)


In [115]:
# test deleteAllFromFiles-function
deleteAllFromFiles(root + 'datasets/geniusClean/')

---
