## Pre-process scraped data

In [1]:
import pandas as pd 
from collections import Counter
import re
import os 

### 1. Construct dataframe of all songs

In [33]:
# Build one DataFrame with every scraped song. Each sub-directory of
# `scrapped_data` is treated as one genre and may hold several CSV exports.
# sorted() fixes the row order: os.listdir returns entries in arbitrary order,
# which would otherwise change which duplicate survives a later drop_duplicates.
genres = sorted(
    entry for entry in os.listdir('scrapped_data')
    if os.path.isdir(os.path.join('scrapped_data', entry))  # ignore stray files
)
genre_frames = []
for genre in genres:
    basepath = f'./scrapped_data/{genre}/'
    filepaths = sorted(basepath + f for f in os.listdir(basepath) if f.endswith('.csv'))
    if not filepaths:
        # pd.concat raises on an empty sequence; a genre without CSVs adds no rows.
        continue
    genre_df = pd.concat(map(pd.read_csv, filepaths))
    genre_df['genre'] = genre
    genre_frames.append(genre_df)

# Concatenate once at the end instead of re-copying the growing frame on every
# iteration. 'Unnamed: 0' is the index column carried over from the CSV files.
all_genres = pd.concat(genre_frames, ignore_index=True) if genre_frames else pd.DataFrame()
all_genres = all_genres.drop(columns='Unnamed: 0', errors='ignore')
all_genres

Unnamed: 0,name,author,link,lyrics,chords,genre
0,As Long As The Grass Shall Grow,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"As long, As the moon shall rise, As ...","['A', 'E', 'A', 'A', 'E', 'A', 'D', 'D', 'A', ...",Country
1,Delias Gone (ver 4),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Delia, oh, Delia Delia all my life\r\nIf I had...","['A', 'D', 'A', 'A7', 'D', 'B7', 'E', 'A', 'D'...",Country
2,Mean Eyed Cat,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I gave my woman half my money at the general s...,"['E', 'B7', 'E', 'A7', 'B7', 'E', 'E', 'B7', '...",Country
3,Mean Eyed Cat (ver 2),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I gave my woman half my money at the general s...,"['C', 'G7', 'C', 'F', 'G7', 'C', 'F', 'C', 'F'...",Country
4,Mean Eyed Cat (ver 3),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I give my woman half my money at the general s...,"['F', 'C', 'F', 'Ab', 'Bb', 'C', 'Fm', 'F', 'F...",Country
...,...,...,...,...,...,...
46923,Fake Love Dont Last,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,"I watched a movie scene, got déjà vu\r\n'Cause...","['Em', 'G', 'B', 'C', 'Em', 'G', 'B', 'C', 'Em...",Rock
46924,Sid And Nancy,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,I knew a girl who'd wear my t-shirts when she ...,"['D', 'F#m', 'E', 'D', 'F#m', 'E', 'D', 'F#m',...",Rock
46925,Forever,Stereophonics,https://tabs.ultimate-guitar.com/tab/stereopho...,Sun beats down on my mind on a friday morning...,"['G', 'C', 'G', 'C', 'G', 'C', 'G', 'C', 'D', ...",Rock
46926,Goodnight Chicago,Rainbow Kitten Surprise,https://tabs.ultimate-guitar.com/tab/rainbow-k...,Twenty years to see New York reflected on subw...,"['C', 'C', 'Am', 'F', 'C', 'C', 'C', 'Am', 'F'...",Rock


### 2. Remove duplicates

Parse the real name of each song, i.e. strip the version suffix such as ` (ver 2)`. Notice, for example, the "Mean Eyed Cat" entries in the output below.

In [18]:
# Everything before the first ' (ver' marker is the title without its version
# suffix; titles without the marker are kept unchanged.
all_genres['actual_name'] = all_genres['name'].map(
    lambda title: title.partition(' (ver')[0]
)
all_genres.head(10)

Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name
0,As Long As The Grass Shall Grow,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"As long, As the moon shall rise, As ...","['A', 'E', 'A', 'A', 'E', 'A', 'D', 'D', 'A', ...",Country,As Long As The Grass Shall Grow
1,Delias Gone (ver 4),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Delia, oh, Delia Delia all my life\r\nIf I had...","['A', 'D', 'A', 'A7', 'D', 'B7', 'E', 'A', 'D'...",Country,Delias Gone
2,Mean Eyed Cat,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I gave my woman half my money at the general s...,"['E', 'B7', 'E', 'A7', 'B7', 'E', 'E', 'B7', '...",Country,Mean Eyed Cat
3,Mean Eyed Cat (ver 2),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I gave my woman half my money at the general s...,"['C', 'G7', 'C', 'F', 'G7', 'C', 'F', 'C', 'F'...",Country,Mean Eyed Cat
4,Mean Eyed Cat (ver 3),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I give my woman half my money at the general s...,"['F', 'C', 'F', 'Ab', 'Bb', 'C', 'Fm', 'F', 'F...",Country,Mean Eyed Cat
5,I Couldnt Keep From Crying,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"I SAW YOU WALKING BY HIS SIDE,\r\nHEARD YOU W...","['A', 'A7', 'D', 'A', 'E7', 'A', 'A7', 'D', 'A...",Country,I Couldnt Keep From Crying
6,It Aint Me Babe (ver 3),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Go 'way from my window leave at your own chose...,"['A', 'Bm', 'A', 'E', 'A', 'A', 'Bm', 'A', 'E'...",Country,It Aint Me Babe
7,Wildwood Flower,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Oh I’ll twine with my mingles of raven black h...,"['E', 'B', 'E', 'E', 'B', 'E', 'E', 'A', 'E', ...",Country,Wildwood Flower
8,Im So Lonesome I Could Cry,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Hear that / lonesome / Whipoor - / will,\r\nHe...","['C', 'Am', 'C', 'C', 'Am', 'C', 'C7', 'F', 'A...",Country,Im So Lonesome I Could Cry
9,Truck Driving Man,Glen Campbell,https://tabs.ultimate-guitar.com/tab/glen-camp...,"Well I stopped at a roadhouse in Texas, with a...","['G', 'C', 'G', 'D', 'G', 'G7', 'C', 'D', 'D7'...",Country,Truck Driving Man


In [19]:
# How often each version-stripped title occurs, followed by one heavily
# duplicated title as an example.
print(all_genres['actual_name'].value_counts())
all_genres[all_genres['actual_name'].str.contains('What A Wonderful World')].head(10)

What A Wonderful World       39
Georgia On My Mind           37
Cant Take My Eyes Off You    33
Crazy                        33
Easy                         31
                             ..
Inception                     1
Fake Mona Lisa                1
Airtime                       1
God Bless America Again       1
Lovelovelove                  1
Name: actual_name, Length: 27285, dtype: int64


Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name
12226,What A Wonderful World,Stacey Kent,https://tabs.ultimate-guitar.com/tab/stacey-ke...,"I see trees of green, red roses too\r\n ...","['A', 'C#m7', 'Dmaj7', 'C#m7', 'Bm7', 'A', 'C#...",Jazz,What A Wonderful World
12433,What A Wonderful World (ver 9),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'Em', 'F', 'Em', 'Dm', 'C', 'E7', 'Am', ...",Jazz,What A Wonderful World
12438,What A Wonderful World (ver 2),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\n I...","['F', 'Am', 'Bb', 'Am', 'Gm7', 'F', 'A7', 'Dm'...",Jazz,What A Wonderful World
12448,What A Wonderful World,Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\n ...","['F', 'Am', 'Bb', 'Am', 'Gm', 'F', 'A7', 'Dm',...",Jazz,What A Wonderful World
12461,What A Wonderful World (ver 3),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['G', 'Bm', 'C', 'Bm', 'Am7', 'G', 'B7', 'Em',...",Jazz,What A Wonderful World
12474,What A Wonderful World (ver 11),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'G', 'Am', 'Em', 'Dm', 'C', 'E7', 'Am', ...",Jazz,What A Wonderful World
12478,What A Wonderful World (ver 4),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['F', 'Am', 'Bb', 'Am', 'Gm7', 'F', 'A7', 'Dm'...",Jazz,What A Wonderful World
12577,What A Wonderful World,Willie Nelson,https://tabs.ultimate-guitar.com/tab/willie-ne...,"I see trees of green, red roses too\r\nI see t...","['D', 'F#m', 'G', 'F#m', 'Em', 'D', 'F#7', 'Bm...",Jazz,What A Wonderful World
12779,What A Wonderful World (ver 14),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too,\r\nI see ...","['C', 'G', 'Am', 'Em', 'F', 'Em', 'Dm', 'Am', ...",Jazz,What A Wonderful World
12790,What A Wonderful World (ver 7),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'Em', 'F', 'Em', 'Dm7', 'C', 'E7', 'Am',...",Jazz,What A Wonderful World


Remove duplicate versions of a song: keep only the first entry for each combination of song name, author, and genre.

In [20]:
# Keep the first row per (title, author, genre). reset_index() renumbers the
# rows and keeps the previous row labels in a new 'index' column.
all_genres_unique = (
    all_genres
    .drop_duplicates(subset=['actual_name', 'author', 'genre'])
    .reset_index()
)
all_genres_unique

Unnamed: 0,index,name,author,link,lyrics,chords,genre,actual_name
0,0,As Long As The Grass Shall Grow,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"As long, As the moon shall rise, As ...","['A', 'E', 'A', 'A', 'E', 'A', 'D', 'D', 'A', ...",Country,As Long As The Grass Shall Grow
1,1,Delias Gone (ver 4),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Delia, oh, Delia Delia all my life\r\nIf I had...","['A', 'D', 'A', 'A7', 'D', 'B7', 'E', 'A', 'D'...",Country,Delias Gone
2,2,Mean Eyed Cat,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I gave my woman half my money at the general s...,"['E', 'B7', 'E', 'A7', 'B7', 'E', 'E', 'B7', '...",Country,Mean Eyed Cat
3,5,I Couldnt Keep From Crying,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"I SAW YOU WALKING BY HIS SIDE,\r\nHEARD YOU W...","['A', 'A7', 'D', 'A', 'E7', 'A', 'A7', 'D', 'A...",Country,I Couldnt Keep From Crying
4,6,It Aint Me Babe (ver 3),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Go 'way from my window leave at your own chose...,"['A', 'Bm', 'A', 'E', 'A', 'A', 'Bm', 'A', 'E'...",Country,It Aint Me Babe
...,...,...,...,...,...,...,...,...
34360,46920,Wurli,Dominic Fike,https://tabs.ultimate-guitar.com/tab/dominic-f...,Steppin' outside for you\r\nAnd I put links on...,"['B', 'D#', 'G#m', 'E', 'B', 'D#', 'G#m', 'E',...","R&b, Funk & Soul",Wurli
34361,46921,Helium,Glass Animals,https://tabs.ultimate-guitar.com/tab/glass-ani...,9:09\r\n You gonna call it or am I?\r\n One m...,"['A', 'B', 'A', 'B', 'A', 'B', 'C#m', 'F#m', '...","R&b, Funk & Soul",Helium
34362,46924,Tokyo Love Hotel,Rina Sawayama,https://tabs.ultimate-guitar.com/tab/rina-sawa...,(Tokyo)\r\n(Tokyo)\r\n(Tokyo) People don't kno...,"['A', 'B', 'E', 'A', 'A', 'B', 'A', 'B', 'G#m'...","R&b, Funk & Soul",Tokyo Love Hotel
34363,46925,Hurt,Arlo Parks,https://tabs.ultimate-guitar.com/tab/arlo-park...,"Mmm, mmm\r\nMmm, mmm, mmm, mmm\r\nCharlie dr...","['Em', 'D', 'Bm', 'Em', 'D', 'Bm', 'Em', 'D', ...","R&b, Funk & Soul",Hurt


Note that some songs are performed by several artists. These duplicates are still present and should probably also be removed, e.g. by comparing the lyrics. See the example below:

In [21]:
# Title frequencies after de-duplication, then a case-insensitive lookup of
# one title that several artists have recorded.
print(all_genres_unique['actual_name'].value_counts())
all_genres_unique[
    all_genres_unique['actual_name'].str.contains('What a wonderful world', case=False)
].head(10)

Home                      16
Without You               16
Stay                      16
Run                       15
Heaven                    15
                          ..
The New Breed              1
The Weather In My Head     1
Planet Drhonda             1
Slinky Thing               1
Lovelovelove               1
Name: actual_name, Length: 27285, dtype: int64


Unnamed: 0,index,name,author,link,lyrics,chords,genre,actual_name
9457,12226,What A Wonderful World,Stacey Kent,https://tabs.ultimate-guitar.com/tab/stacey-ke...,"I see trees of green, red roses too\r\n ...","['A', 'C#m7', 'Dmaj7', 'C#m7', 'Bm7', 'A', 'C#...",Jazz,What A Wonderful World
9657,12433,What A Wonderful World (ver 9),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'Em', 'F', 'Em', 'Dm', 'C', 'E7', 'Am', ...",Jazz,What A Wonderful World
9756,12577,What A Wonderful World,Willie Nelson,https://tabs.ultimate-guitar.com/tab/willie-ne...,"I see trees of green, red roses too\r\nI see t...","['D', 'F#m', 'G', 'F#m', 'Em', 'D', 'F#7', 'Bm...",Jazz,What A Wonderful World
10305,13308,What A Wonderful World,Eva Cassidy,https://tabs.ultimate-guitar.com/tab/eva-cassi...,"I see trees that are green, and red roses too...","['D', 'D7M', 'G', 'D', 'Em', 'D', 'F#', 'Bm', ...",Jazz,What A Wonderful World
11127,14381,What A Wonderful World,Jon Batiste,https://tabs.ultimate-guitar.com/tab/jon-batis...,"I see trees of green, red roses too\r\nI see t...","['E', 'A', 'E', 'Asus2', 'Asus2', 'E', 'C#m', ...",Jazz,What A Wonderful World
11374,14703,What A Wonderful World (ver 5),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,I see trees of green red roses too\r\nI see th...,"['G', 'D', 'Am7', 'G', 'C', 'G', 'D', 'A7sus',...",Pop,What A Wonderful World
16195,20763,Somewhere Over The Rainbow What A Wonderful World,Robin Schulz,https://tabs.ultimate-guitar.com/tab/robin-sch...,Somewhere over the rainbow\r\n Bluebirds fly\...,"['C', 'Em', 'F', 'C', 'F', 'C', 'G', 'Am', 'F'...",Pop,Somewhere Over The Rainbow What A Wonderful World
26486,34551,What A Wonderful World,Joey Ramone,https://tabs.ultimate-guitar.com/tab/joey-ramo...,"I see trees of green / Red roses, too / I see ...","['F', 'Am', 'Bb', 'Am', 'Gm', 'F', 'A7', 'Dm',...",Rock,What A Wonderful World


### 3. Remove non-English songs

In [22]:
from langdetect import detect_langs, detect

def get_lang(text):
    """Print the candidate languages for `text`, then the single detected language.

    Exploration helper only: it prints and returns nothing.
    """
    candidates = detect_langs(text)
    print(f'{text} -- {candidates}')
    best_guess = detect(text)
    print(best_guess)
# Try the detector on a few short phrases in different languages.
for sample in ('Hello', 'hi what is up', 'bonjour comment ca va', 'hej vad gör du här ens'):
    get_lang(sample)

Hello -- [fi:0.712034517140546, no:0.1451095389458799, nl:0.14285587557225196]
fi
hi what is up -- [en:0.9999956880369661]
en
bonjour comment ca va -- [fr:0.9999973151079604]
fr
hej vad gör du här ens -- [sv:0.9999980155163134]
sv


In [23]:
# First 20 characters of the first song's lyrics.
print(all_genres_unique['lyrics'].iloc[0][:20])

     As long,  As th


In [24]:
# Language candidates for the first song's lyrics, skipping its first 20 characters.
detect_langs(all_genres_unique['lyrics'].iloc[0][20:])

[en:0.9999967821814468]

In [25]:
def get_lang(text):
    """Return the language code that `detect` reports for `text`.

    Returns the string 'not identified' when detection raises an ordinary
    exception, so one undetectable text does not abort a whole `.apply`.

    Only `Exception` subclasses are caught: KeyboardInterrupt and SystemExit
    still propagate, so a long-running run can be interrupted instead of
    having the interrupted rows silently labelled 'not identified'.
    """
    try:
        return detect(text)
    except Exception:
        return 'not identified'
# Detect the language from the first 200 characters of each song's lyrics.
all_genres_unique['language'] = all_genres_unique['lyrics'].map(
    lambda text: get_lang(text[:200])
)

In [26]:
# Rows for which language detection failed.
all_genres_unique[all_genres_unique['language'] == 'not identified']

Unnamed: 0,index,name,author,link,lyrics,chords,genre,actual_name,language
1328,1389,Are You Sure,Willie Nelson,https://tabs.ultimate-guitar.com/tab/willie-ne...,_________2_\r\n_________3_|\r\n_________2_\r\n,"['D', 'G', 'D/F#']",Country,Are You Sure,not identified
3495,3948,Some Go Home,Jerry Jeff Walker,https://tabs.ultimate-guitar.com/tab/jerry-jef...,#\r\n#\r\n#\r\n,"['C', 'G', 'D', 'Dsus4', 'D', 'C', 'Dsus4', 'D...",Country,Some Go Home,not identified
9429,12195,Hang Em High,Booker T. & the M.G.'s,https://tabs.ultimate-guitar.com/tab/booker-t-...,\r\n \r\n \r\n ...,"['Em', 'D', 'Em', 'D', 'Em', 'D', 'Em', 'A', '...",Jazz,Hang Em High,not identified
9684,12484,Europa,Santana,https://tabs.ultimate-guitar.com/tab/santana/e...,|--------|\r\n|--------|--------|--------|----...,"['Bb7sus4', 'Bb7', 'Ebmaj7', 'Abmaj7', 'G7sus4...",Jazz,Europa,not identified
9974,12867,The Shaker Song,The Manhattan Transfer,https://tabs.ultimate-guitar.com/tab/the-manha...,* * * * * * * * * * * *...,"['G6', 'G11', 'G6', 'G11', 'G6', 'G11', 'G6', ...",Jazz,The Shaker Song,not identified
10088,13016,Igy,Donald Fagen,https://tabs.ultimate-guitar.com/tab/donald-fa...,* * * * * * * * * ...,"['Abm', 'Dbm9', 'Emaj9', 'E/F#', 'Abm', 'Dbm9'...",Jazz,Igy,not identified
10458,13515,La Fiesta,Chick Corea,https://tabs.ultimate-guitar.com/tab/chick-cor...,|--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+...,"['AMaj7', 'Db7', 'DMaj7', 'Ebdim', 'A/E', 'Fdi...",Jazz,La Fiesta,not identified
10507,13579,Three Views Of A Secret,Jaco Pastorius,https://tabs.ultimate-guitar.com/tab/jaco-past...,|-----=-----=-----+-----=-----=-----+-----=---...,"['C#m7', 'Bm7', 'A9', 'F#', 'B9', 'E6', 'G#7',...",Jazz,Three Views Of A Secret,not identified
10536,13616,Wished For You,Squirrel Nut Zippers,https://tabs.ultimate-guitar.com/tab/squirrel-...,\t\t\t ^ ^ ^ ^\r\n,"['Ab7', 'G7', 'Gb7']",Jazz,Wished For You,not identified
10714,13856,Rio Nights,Shakatak,https://tabs.ultimate-guitar.com/tab/shakatak/...,|.... |....\r\n|.... |....\r\n|.... |......,"['Bb/C', 'Fm7', 'Bb/C', 'F7+', 'Bb/C', 'F7+', ...",Jazz,Rio Nights,not identified


In [27]:
# Full lyrics of the row at position 1750.
all_genres_unique['lyrics'].iloc[1750]

"Once upon a time  In a far off land\r\nWise men saw a sign  And set out 'cross the sand\r\nSongs of praise to sing They travelled day and night\r\nPrecious gifts to bring           Guided by the light.\r\nThey chased a brand new star Ever towards the west\r\nAcross the mountains far  But when it came to rest\r\nThey scarce believed their eyes   They'd come so many miles\r\nThe miracle they prized           Was nothing but a child\r\nNothing but a child               Could wash those tears away\r\nOr guide a weary world            Into the light of day\r\nNothing but a child               Could help erase those miles\r\nSo once again we all              Can be children for a while.\r\nNow all around the world       In every little town\r\nEvery day is heard      A precious little sound\r\nAnd every mother kind          And every father, proud\r\nLooks down in awe to find     Another chance allowed\r\nNothing but a child               Could wash those tears away\r\nOr guide a weary worl

In [28]:
# Number of songs per detected language.
all_genres_unique['language'].value_counts()

en                29617
fr                 1231
es                  618
de                  550
it                  504
fi                  390
sv                  235
nl                  217
tl                  180
pl                  167
id                   89
pt                   89
so                   71
hr                   66
sw                   55
ru                   55
not identified       37
no                   32
af                   28
da                   26
cs                   17
tr                   15
cy                   14
ca                   12
ko                   10
hu                    9
et                    7
vi                    6
sk                    5
sl                    4
sq                    4
ro                    3
lv                    1
zh-cn                 1
Name: language, dtype: int64

In [29]:
# Keep only songs detected as English, drop the leftover 'index' column from
# the de-duplication step, and renumber the rows from 0.
all_genres_en = (
    all_genres_unique[all_genres_unique['language'].isin(['en'])]
    .drop(columns='index')
    .reset_index(drop=True)
)
all_genres_en

Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name,language
0,As Long As The Grass Shall Grow,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"As long, As the moon shall rise, As ...","['A', 'E', 'A', 'A', 'E', 'A', 'D', 'D', 'A', ...",Country,As Long As The Grass Shall Grow,en
1,Delias Gone (ver 4),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Delia, oh, Delia Delia all my life\r\nIf I had...","['A', 'D', 'A', 'A7', 'D', 'B7', 'E', 'A', 'D'...",Country,Delias Gone,en
2,Mean Eyed Cat,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I gave my woman half my money at the general s...,"['E', 'B7', 'E', 'A7', 'B7', 'E', 'E', 'B7', '...",Country,Mean Eyed Cat,en
3,I Couldnt Keep From Crying,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"I SAW YOU WALKING BY HIS SIDE,\r\nHEARD YOU W...","['A', 'A7', 'D', 'A', 'E7', 'A', 'A7', 'D', 'A...",Country,I Couldnt Keep From Crying,en
4,It Aint Me Babe (ver 3),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Go 'way from my window leave at your own chose...,"['A', 'Bm', 'A', 'E', 'A', 'A', 'Bm', 'A', 'E'...",Country,It Aint Me Babe,en
...,...,...,...,...,...,...,...,...
29612,Wurli,Dominic Fike,https://tabs.ultimate-guitar.com/tab/dominic-f...,Steppin' outside for you\r\nAnd I put links on...,"['B', 'D#', 'G#m', 'E', 'B', 'D#', 'G#m', 'E',...","R&b, Funk & Soul",Wurli,en
29613,Helium,Glass Animals,https://tabs.ultimate-guitar.com/tab/glass-ani...,9:09\r\n You gonna call it or am I?\r\n One m...,"['A', 'B', 'A', 'B', 'A', 'B', 'C#m', 'F#m', '...","R&b, Funk & Soul",Helium,en
29614,Tokyo Love Hotel,Rina Sawayama,https://tabs.ultimate-guitar.com/tab/rina-sawa...,(Tokyo)\r\n(Tokyo)\r\n(Tokyo) People don't kno...,"['A', 'B', 'E', 'A', 'A', 'B', 'A', 'B', 'G#m'...","R&b, Funk & Soul",Tokyo Love Hotel,en
29615,Hurt,Arlo Parks,https://tabs.ultimate-guitar.com/tab/arlo-park...,"Mmm, mmm\r\nMmm, mmm, mmm, mmm\r\nCharlie dr...","['Em', 'D', 'Bm', 'Em', 'D', 'Bm', 'Em', 'D', ...","R&b, Funk & Soul",Hurt,en


### 4. Pre-process lyrics
Remove text between parentheses or square brackets, and replace line breaks (`\r\n`) with spaces.

For instance, remove the things between the parentheses below:

    If your radiator leaks and your motor stands still
    Doing the Hadacol boogie (Hadacol boogie)
    Hadacol boogie (Hadacol boogie)

In [51]:
# Raw-string patterns: the original non-raw literal relied on invalid escape
# sequences ('\(' and '\['), which recent Python versions warn about.
_LINEBREAK = re.compile(r'\r\n|\r|\n')
_BRACKETED = re.compile(r'[\(\[].*?[\)\]]')


def _clean_lyrics(lyrics):
    """Return `lyrics` with line breaks turned into spaces and bracketed text removed.

    Every line-break variant (CRLF, lone CR, lone LF) becomes one space, not
    only CRLF. This happens first so that a bracketed passage spanning a line
    break is still matched, because '.' does not match a newline.
    Text between '(' or '[' and the next ')' or ']' is then deleted.
    """
    single_line = _LINEBREAK.sub(' ', lyrics)
    return _BRACKETED.sub('', single_line)


all_genres_en['clean_lyrics'] = all_genres_en['lyrics'].apply(_clean_lyrics)
all_genres_en

Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name,language,clean_lyrics
0,As Long As The Grass Shall Grow,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"As long, As the moon shall rise, As ...","['A', 'E', 'A', 'A', 'E', 'A', 'D', 'D', 'A', ...",Country,As Long As The Grass Shall Grow,en,"As long, As the moon shall rise, As ..."
1,Delias Gone (ver 4),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Delia, oh, Delia Delia all my life\r\nIf I had...","['A', 'D', 'A', 'A7', 'D', 'B7', 'E', 'A', 'D'...",Country,Delias Gone,en,"Delia, oh, Delia Delia all my life If I hadn't..."
2,Mean Eyed Cat,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,I gave my woman half my money at the general s...,"['E', 'B7', 'E', 'A7', 'B7', 'E', 'E', 'B7', '...",Country,Mean Eyed Cat,en,I gave my woman half my money at the general s...
3,I Couldnt Keep From Crying,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"I SAW YOU WALKING BY HIS SIDE,\r\nHEARD YOU W...","['A', 'A7', 'D', 'A', 'E7', 'A', 'A7', 'D', 'A...",Country,I Couldnt Keep From Crying,en,"I SAW YOU WALKING BY HIS SIDE, HEARD YOU WHIS..."
4,It Aint Me Babe (ver 3),Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Go 'way from my window leave at your own chose...,"['A', 'Bm', 'A', 'E', 'A', 'A', 'Bm', 'A', 'E'...",Country,It Aint Me Babe,en,Go 'way from my window leave at your own chose...
...,...,...,...,...,...,...,...,...,...
29612,Wurli,Dominic Fike,https://tabs.ultimate-guitar.com/tab/dominic-f...,Steppin' outside for you\r\nAnd I put links on...,"['B', 'D#', 'G#m', 'E', 'B', 'D#', 'G#m', 'E',...","R&b, Funk & Soul",Wurli,en,Steppin' outside for you And I put links on bo...
29613,Helium,Glass Animals,https://tabs.ultimate-guitar.com/tab/glass-ani...,9:09\r\n You gonna call it or am I?\r\n One m...,"['A', 'B', 'A', 'B', 'A', 'B', 'C#m', 'F#m', '...","R&b, Funk & Soul",Helium,en,9:09 You gonna call it or am I? One more ti...
29614,Tokyo Love Hotel,Rina Sawayama,https://tabs.ultimate-guitar.com/tab/rina-sawa...,(Tokyo)\r\n(Tokyo)\r\n(Tokyo) People don't kno...,"['A', 'B', 'E', 'A', 'A', 'B', 'A', 'B', 'G#m'...","R&b, Funk & Soul",Tokyo Love Hotel,en,People don't know what they're taking until...
29615,Hurt,Arlo Parks,https://tabs.ultimate-guitar.com/tab/arlo-park...,"Mmm, mmm\r\nMmm, mmm, mmm, mmm\r\nCharlie dr...","['Em', 'D', 'Bm', 'Em', 'D', 'Bm', 'Em', 'D', ...","R&b, Funk & Soul",Hurt,en,"Mmm, mmm Mmm, mmm, mmm, mmm Charlie drank it..."


### Finally, save the pre-processed data

In [52]:
# to_json does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('data', exist_ok=True)
all_genres_en.to_json(path_or_buf='data/all_genres_en.json')