### Downloading the Dataset

In [None]:
# Clone the repository that holds the dataset into ./Dataset (the recorded log shows a ~1.84 GiB transfer).
!git clone https://github.com/kaazima/Dataset.git

Cloning into 'Dataset'...
remote: Enumerating objects: 10942, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 10942 (delta 0), reused 0 (delta 0), pack-reused 10940[K
Receiving objects: 100% (10942/10942), 1.84 GiB | 36.28 MiB/s, done.
Resolving deltas: 100% (510/510), done.
Checking out files: 100% (10001/10001), done.


### Flatten the directory tree (move every file into the dataset root)

In [None]:
import os, sys

dir_tree = '/content/Dataset/MillionSongSubset'


def flatten_directory(root):
    """Move every file found in a sub-directory of ``root`` directly into ``root``.

    The (then empty) sub-directories are left in place. A file that cannot be
    moved is reported on stdout and skipped, so one failure does not abort the
    whole pass.

    Parameters
    ----------
    root : str
        Directory whose tree is flattened. A non-existent path is a no-op
        because ``os.walk`` simply yields nothing for it.
    """
    for dir_path, _dir_names, file_names in os.walk(root):
        if dir_path == root:
            # Files already at the top level would only be renamed onto themselves.
            continue
        for file_name in file_names:
            source = os.path.join(dir_path, file_name)
            try:
                os.rename(source, os.path.join(root, file_name))
            except OSError:
                # The original handler called the non-existent os.join, so any
                # failed move crashed with AttributeError instead of reporting.
                print ("Could not move %s " % source)


flatten_directory(dir_tree)

### Compiling a Title-Artist table

In [None]:
import pandas as pd

def make_artist_table(base):
    """Build a file/artist/title table from the ``.h5`` song files in ``base``.

    For every file in ``base`` whose name ends with ``.h5`` the first entry of
    the ``metadata/songs`` table is read and its ``title`` and ``artist_name``
    values are decoded from UTF-8 bytes.

    Parameters
    ----------
    base : str
        Directory that directly contains the ``.h5`` files (sub-directories
        are not searched).

    Returns
    -------
    pandas.DataFrame
        One row per ``.h5`` file with the columns ``file`` (base name),
        ``artist`` and ``title``, in that order. Empty, but with the same
        columns, when ``base`` holds no ``.h5`` file.
    """
    # Get file names
    files = [os.path.join(base, fn) for fn in os.listdir(base) if fn.endswith('.h5')]

    # Collect one record per song file
    records = []
    for f in files:
        # Open read-only (HDFStore defaults to append mode, which may touch
        # the file) and let the context manager close the handle even when a
        # file lacks the expected metadata table.
        with pd.HDFStore(f, mode='r') as store:
            song_cols = store.root.metadata.songs.cols
            title = song_cols.title[0]
            artist = song_cols.artist_name[0]
        records.append({
            'file': os.path.basename(f),
            'artist': artist.decode("utf-8"),
            'title': title.decode("utf-8"),
        })

    # Passing the column list keeps the column order fixed, even when empty
    return pd.DataFrame(records, columns=['file', 'artist', 'title'])

In [None]:
# Directory that holds the song .h5 files.
base = '/content/Dataset/MillionSongSubset'
# One row per .h5 file, with the columns file / artist / title.
df = make_artist_table(base)
# Show the last five rows as a quick sanity check.
df.tail()

Unnamed: 0,file,artist,title
9995,TRADFXD128F424694C.h5,Mando Diao,Misty Mountains
9996,TRBEDRW128F425F424.h5,Savoy Brown,All Burned Out
9997,TRBCURH12903CAD622.h5,Percy Faith,The Song Is You
9998,TRAIPZK12903CE9F24.h5,Cauda Pavonis,Juggernaut
9999,TRAKILC128F933EC35.h5,Johnny Otis,Lovers Lane Boogie


### Downloading Lyrics

In [None]:
# Add a 'lyrics' column pre-filled with empty strings; the download loop
# overwrites it per song and '' is later used to spot songs without lyrics.
df['lyrics'] = pd.Series('', index=df.index)
df.tail()

Unnamed: 0,file,artist,title,lyrics
9995,TRADFXD128F424694C.h5,Mando Diao,Misty Mountains,
9996,TRBEDRW128F425F424.h5,Savoy Brown,All Burned Out,
9997,TRBCURH12903CAD622.h5,Percy Faith,The Song Is You,
9998,TRAIPZK12903CE9F24.h5,Cauda Pavonis,Juggernaut,
9999,TRAKILC128F933EC35.h5,Johnny Otis,Lovers Lane Boogie,


In [None]:
import urllib
import urllib.parse
import urllib.request
import bs4
from bs4 import BeautifulSoup, NavigableString
          
def songlyrics(artist, title, timeout=10):
    """Fetch the lyrics of a song from lyricsmania.com.

    Artist and title are lower-cased, spaces are replaced by underscores and
    the result is percent-encoded to build the page URL
    ``https://www.lyricsmania.com/<title>_lyrics_<artist>.html``.

    Parameters
    ----------
    artist : str
        Artist name as stored in the dataset.
    title : str
        Song title as stored in the dataset.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a
        stalled connection cannot block a long batch run.

    Returns
    -------
    str
        The lyric fragments joined with newlines, or ``''`` when the page
        cannot be downloaded or contains no ``lyrics-body`` element.
    """
    artist_slug = urllib.parse.quote(artist.lower().replace(' ', '_'))
    title_slug = urllib.parse.quote(title.lower().replace(' ', '_'))
    url = 'https://www.lyricsmania.com/%s_lyrics_%s.html' % (title_slug, artist_slug)

    # Best-effort download: any network/HTTP failure means "no lyrics".
    # `except Exception` (rather than a bare `except`) still lets Ctrl-C /
    # kernel interrupts stop a long scraping loop.
    try:
        # The context manager closes the connection once the page is read.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            html = response.read()
    except Exception:
        return ''

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    body = soup.find(class_='lyrics-body')
    if body is None:
        return ''

    # The first two child nodes are skipped - presumably non-lyric markup at
    # the top of the container (TODO confirm against the page layout).
    fragments = [
        node.strip() if isinstance(node, NavigableString) else node.text.strip()
        for node in body.contents[2:]
    ]
    # Joining an empty list already yields ''.
    return '\n'.join(fragments)

In [None]:
# Smoke-test the scraper on a single song before running it over the whole table.
lyr = songlyrics('Mastodon','Deep Sea Creature')
print(lyr)

Knowing right

Learning wrong

What you're feeling is pressure

Pulsate new blood

I've seen things you've heard never before

Bones aged in dust

Buy your bite take the body

Calm

Shutting down

You built me

I knew it

I'll never lie liar

You fed me

I chewed it

I'll never lie liar

I follow you covering me

Spirits in pieces

Crumbled and burnt


### Adding lyrics to the Dataframe

In [None]:
!pip install pyprind

Collecting pyprind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Installing collected packages: pyprind
Successfully installed pyprind-2.11.2


In [None]:
import pyprind

# Progress bar with one tick per song in the table.
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    # .at reads/writes a single cell by (row label, column); the previous
    # df.loc[row_id]['col'] chained lookup built a full row Series twice per song.
    lyr = songlyrics(df.at[row_id, 'artist'], df.at[row_id, 'title'])
    # songlyrics returns '' when nothing was found, so the placeholder stays empty.
    df.at[row_id, 'lyrics'] = lyr
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:56:16


In [None]:
# Rows whose lyrics string is non-empty are the songs the scraper found.
found_mask = df.lyrics != ''
print('downloaded Lyrics for %s songs' % found_mask.sum())
df.head()

downloaded Lyrics for 91 songs


Unnamed: 0,file,artist,title,lyrics
0,TRADJKG12903CE049F.h5,Tesla,So What!,There's things that I've bought\n\nAnd there's...
1,TRBGLST128F934DAF2.h5,Yami Bolo,Put Down Your Weapons,
2,TRAICJQ12903CF2681.h5,Dark Fortress,Ghastly Indoctrination,In mysterious ways your soul sinks down\n\nAs ...
3,TRAZNDO12903CF3CE6.h5,Lorna,He Dreams Of Spaceships,
4,TRARYHD128F9344512.h5,Trafik,Disco Trafiko,


### Remove rows where lyrics are not available

In [None]:
# Keep only the songs for which lyrics were actually downloaded.
has_lyrics = df.lyrics != ''
df = df[has_lyrics]
# Every remaining row has lyrics, so the row count is the number of songs.
print("No of songs: %s" % len(df))
df.head()

No of songs: 91


Unnamed: 0,file,artist,title,lyrics
0,TRADJKG12903CE049F.h5,Tesla,So What!,There's things that I've bought\n\nAnd there's...
2,TRAICJQ12903CF2681.h5,Dark Fortress,Ghastly Indoctrination,In mysterious ways your soul sinks down\n\nAs ...
5,TRAUZFY128F42BCE57.h5,Jason Michael Carroll,No Good In Goodbye,I knew you wouldn't answer after the things we...
11,TRAAVRJ128F92FF90A.h5,!!!,Myth Takes,It only takes a little bit of glamour glimmer\...
12,TRAOQSA128F42AE616.h5,El Presidente,Turn This Thing Around,Turn this thing around\n\nTurn this thing arou...


### Remove the songs that are not in English

In [None]:
import nltk
nltk.download('words')

# Lower-cased NLTK word list, built on first use and reused by later calls.
_ENGLISH_VOCAB = None


def _default_english_vocab():
    """Return the lower-cased NLTK 'words' corpus as a frozenset, built only once."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = frozenset(w.lower() for w in nltk.corpus.words.words())
    return _ENGLISH_VOCAB


def eng_ratio(text, english_vocab=None):
    """Return the fraction of distinct words in ``text`` that are not English.

    The text is split on whitespace and lower-cased; only purely alphabetic
    tokens are considered, so tokens containing digits or punctuation
    (e.g. "you're", "wrong,") are ignored altogether. The result is
    ``len(unknown distinct words) / len(distinct words)``.

    Parameters
    ----------
    text : str
        Text to analyse, e.g. the lyrics of one song.
    english_vocab : set of str, optional
        Lower-cased reference vocabulary. Defaults to the NLTK ``words``
        corpus, which is converted to a set once and then cached instead of
        being rebuilt on every call.

    Returns
    -------
    float or int
        A value between 0 (every word is known) and 1 (no word is known).
        When the text has no alphabetic token the ratio is undefined
        (division by zero), so 1 is returned and the text counts as
        non-English.
    """
    if english_vocab is None:
        english_vocab = _default_english_vocab()
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha())
    if not text_vocab:
        # Nothing to compare against: treat the text as entirely non-English.
        return 1
    unusual = text_vocab.difference(english_vocab)
    return len(unusual) / len(text_vocab)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# Spot checks for eng_ratio; the values noted are the recorded outputs of this cell.
# Plain English sentence -> 0.0
text1 = 'This is a test'
print(eng_ratio(text1))
# Non-English sentence -> 0.8333...
text2 = 'Ye jaanch karne ke liye hai'
print(eng_ratio(text2))
# English sentence with one non-English word -> 0.2
text3 = 'This is a test hai'
print(eng_ratio(text3))
# Whitespace only, i.e. no words at all -> 1
text4 = '   \n '
print(eng_ratio(text4))

0.0
0.8333333333333334
0.2
1


text1 is 0% non-English (<50%) => It is English

text2 is 83.33% non-English (>50%) => It is Non-English

text3 is 20% non-English (<50%) => It is English

text4 contains only whitespace, so it has no words at all. It is therefore assumed to be 100% non-English (>50%) => It is Non-English

In [None]:
# Progress bar with one tick per song that still has lyrics.
pbar = pyprind.ProgBar(df.shape[0])

before = df.shape[0]
# Score every song first and filter once at the end: re-filtering the whole
# frame for each removed row (as before) is quadratic in the number of songs.
keep = []
for text in df['lyrics']:
    # A song is kept when fewer than half of its distinct words are non-English.
    keep.append(eng_ratio(text) < 0.5)
    pbar.update()
# An index-aligned boolean Series also behaves correctly for an empty frame.
df = df[pd.Series(keep, index=df.index, dtype=bool)]
after = df.shape[0]

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:16


In [None]:
# Number of songs dropped by the non-English filter (row counts before and after it).
rem = before - after
print('%s have been removed.' %rem)
print('%s songs remain in the dataset.' %after)
df.head()

15 have been removed.
76 songs remain in the dataset.


Unnamed: 0,file,artist,title,lyrics
0,TRADJKG12903CE049F.h5,Tesla,So What!,There's things that I've bought\n\nAnd there's...
2,TRAICJQ12903CF2681.h5,Dark Fortress,Ghastly Indoctrination,In mysterious ways your soul sinks down\n\nAs ...
5,TRAUZFY128F42BCE57.h5,Jason Michael Carroll,No Good In Goodbye,I knew you wouldn't answer after the things we...
11,TRAAVRJ128F92FF90A.h5,!!!,Myth Takes,It only takes a little bit of glamour glimmer\...
12,TRAOQSA128F42AE616.h5,El Presidente,Turn This Thing Around,Turn this thing around\n\nTurn this thing arou...


In [None]:
# Save the filtered table (presumably so the ~56-minute lyrics download need not be repeated).
# With to_csv's defaults the row index is written as an extra first column.
df.to_csv('/content/df_lyr_backup.csv')