## Pre-process scraped data

In [2]:
import pandas as pd 
from ast import literal_eval
from collections import Counter

In [25]:
# Distinct chord symbols present in `all_chords`, in first-seen order.
# NOTE(review): `all_chords` is not defined in any cell above this one; it
# presumably comes from a later or since-removed cell -- confirm this cell
# still works after Restart Kernel -> Run All.
chord_frequencies = Counter(all_chords)
chord_frequencies.keys()

dict_keys(['Am', 'D', 'G', 'C', 'Cadd9', 'E7', 'Dm7', 'Dm7b5', 'Am/G', 'D/F#', 'G7', 'F', 'Fm', 'Em', 'Fmaj7', 'G#m', 'F#', 'C#m', 'B', 'E', 'D#', 'C5', 'E5', 'A5', 'F5', 'G5', 'A', 'E/G#', 'Amaj7', 'Am7', 'F#m', 'Dsus2', 'DM7', 'A7', 'Em7', 'G#', 'F#m11', 'Bm', 'Gm', 'Bb', 'Dm', 'Bbm', 'Cmaj7', 'Gmaj7', 'B7', 'G#7', 'C/E', 'C7/E', 'C7', 'Bbmaj7', 'Fm7', 'Cm7', 'Dbmaj7', 'Ab', 'Bbm7', 'D7', 'C#dim', 'Em7b5', 'Cdim', 'Cdim7', 'F#7', 'A/C#', 'Csus2', 'Cm', 'A6', 'C#7', 'Bm7', 'Gm7', 'Badd4', 'C/G', 'D/A', 'F#m7', 'F#m6', 'Dmaj7', 'C#+', 'C#m7', 'C#7+5', 'E6', 'Bm9', 'E6/G#', 'F#madd9', 'D5', 'F7', 'Emaj7', 'Bdim7', 'Adim'])

### 1. Construct dataframe of all songs

In [104]:
import os

# Genres that each have a sub-directory of scraped CSV files under ./scrapped_data/.
genres = ['Country', 'Jazz', 'Pop', 'Rock']

# Collect one frame per genre and concatenate once at the end. Growing
# `all_genres` with pd.concat inside the loop re-copies every previously
# loaded row on each iteration.
genre_frames = []
for genre in genres:
    basepath = f'./scrapped_data/{genre}/'
    # os.listdir returns entries in arbitrary order; sort so the row order
    # (and therefore the index values referred to later) is reproducible.
    filepaths = [basepath + f for f in sorted(os.listdir(basepath)) if f.endswith('.csv')]
    if not filepaths:
        # pd.concat would raise the same exception type with an opaque
        # "No objects to concatenate" message; name the directory instead.
        raise ValueError(f'No .csv files found in {basepath}')
    genre_df = pd.concat(map(pd.read_csv, filepaths))
    genre_df['genre'] = genre
    genre_frames.append(genre_df)

# 'Unnamed: 0' is presumably the row index the scraper wrote into each CSV
# (TODO confirm); it carries no information once the index is rebuilt here.
all_genres = pd.concat(genre_frames, ignore_index=True).drop(columns='Unnamed: 0')
all_genres

Unnamed: 0,name,author,link,lyrics,chords,genre
0,Ring Of Fire,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Love Is A Burning Thing\r\nAnd It Makes A Fier...,"['G', 'C', 'G', 'C', 'G', 'D', 'G', 'D', 'G', ...",Country
1,Leaving On A Jet Plane,John Denver,https://tabs.ultimate-guitar.com/tab/john-denv...,"All my bags are packed, I'm ready to go\r\nI'm...","['G', 'C', 'G', 'C', 'G', 'C', 'D', 'G', 'C', ...",Country
2,Hit The Road Jack,Ray Charles,https://tabs.ultimate-guitar.com/tab/ray-charl...,Hit the road Jack and don't you come back no m...,"['Am', 'G', 'F', 'E7', 'Am', 'G', 'F', 'E7', '...",Country
3,The End Of The World,Skeeter Davis,https://tabs.ultimate-guitar.com/tab/skeeter-d...,Why does the sun go on shining\r\nWhy does the...,"['G', 'D', 'Em', 'Bm', 'Am', 'Bm', 'E7', 'Am',...",Country
4,Mama Tried,Merle Haggard,https://tabs.ultimate-guitar.com/tab/merle-hag...,The first thing I remember knowing was a lones...,"['D', 'G', 'D', 'G', 'D', 'A7', 'D', 'G', 'D',...",Country
...,...,...,...,...,...,...
25107,Fake Love Dont Last,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,"I watched a movie scene, got déjà vu\r\n'Cause...","['Em', 'G', 'B', 'C', 'Em', 'G', 'B', 'C', 'Em...",Rock
25108,Sid And Nancy,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,I knew a girl who'd wear my t-shirts when she ...,"['D', 'F#m', 'E', 'D', 'F#m', 'E', 'D', 'F#m',...",Rock
25109,Forever,Stereophonics,https://tabs.ultimate-guitar.com/tab/stereopho...,Sun beats down on my mind on a friday morning...,"['G', 'C', 'G', 'C', 'G', 'C', 'G', 'C', 'D', ...",Rock
25110,Goodnight Chicago,Rainbow Kitten Surprise,https://tabs.ultimate-guitar.com/tab/rainbow-k...,Twenty years to see New York reflected on subw...,"['C', 'C', 'Am', 'F', 'C', 'C', 'C', 'Am', 'F'...",Rock


### 2. Remove duplicates

Parse the real name of each song, i.e. strip the version suffix. Notice the songs at index 6 and 8.

In [112]:
def _strip_version_suffix(title):
    """Return `title` with everything from the first ' (ver' onwards removed."""
    base_title, _, _ = title.partition(' (ver')
    return base_title


# Store the version-less title next to the original so that different
# versions of the same song share one `actual_name`.
all_genres['actual_name'] = all_genres['name'].map(_strip_version_suffix)
all_genres.head(10)

Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name
0,Ring Of Fire,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Love Is A Burning Thing\r\nAnd It Makes A Fier...,"['G', 'C', 'G', 'C', 'G', 'D', 'G', 'D', 'G', ...",Country,Ring Of Fire
1,Leaving On A Jet Plane,John Denver,https://tabs.ultimate-guitar.com/tab/john-denv...,"All my bags are packed, I'm ready to go\r\nI'm...","['G', 'C', 'G', 'C', 'G', 'C', 'D', 'G', 'C', ...",Country,Leaving On A Jet Plane
2,Hit The Road Jack,Ray Charles,https://tabs.ultimate-guitar.com/tab/ray-charl...,Hit the road Jack and don't you come back no m...,"['Am', 'G', 'F', 'E7', 'Am', 'G', 'F', 'E7', '...",Country,Hit The Road Jack
3,The End Of The World,Skeeter Davis,https://tabs.ultimate-guitar.com/tab/skeeter-d...,Why does the sun go on shining\r\nWhy does the...,"['G', 'D', 'Em', 'Bm', 'Am', 'Bm', 'E7', 'Am',...",Country,The End Of The World
4,Mama Tried,Merle Haggard,https://tabs.ultimate-guitar.com/tab/merle-hag...,The first thing I remember knowing was a lones...,"['D', 'G', 'D', 'G', 'D', 'A7', 'D', 'G', 'D',...",Country,Mama Tried
5,Cocaine Blues,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Early one mornin' while makin' the rounds, I t...","['C', 'G', 'C', 'C', 'G', 'C', 'C', 'G', 'C', ...",Country,Cocaine Blues
6,King Of The Road (ver 4),Roger Miller,https://tabs.ultimate-guitar.com/tab/roger-mil...,"Trailers for sale or rent,\r\nRooms to let fif...","['A', 'D', 'E', 'A', 'A', 'D', 'E', 'A', 'D', ...",Country,King Of The Road
7,Jackson,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,We got married in a fever hotter than a pepper...,"['C', 'C', 'C7', 'C', 'F', 'C', 'C', 'F', 'G7'...",Country,Jackson
8,Crazy (ver 2),Patsy Cline,https://tabs.ultimate-guitar.com/tab/patsy-cli...,"Crazy, I'm crazy for feeling so lonely\r\nI'm ...","['C', 'C', 'A7', 'A7', 'Dm', 'Ebm7', 'Dm7', 'D...",Country,Crazy
9,Lay Lady Lay,Bob Dylan,https://tabs.ultimate-guitar.com/tab/bob-dylan...,"Lay lady lay, lay across my big brass bed\...","['A', 'C#m', 'G', 'Bm', 'A', 'C#m', 'G', 'Bm',...",Country,Lay Lady Lay


Remove duplicate songs, i.e. entries that have the same name and the same author but are different versions.

In [113]:
# How many rows share each version-stripped title.
title_counts = all_genres['actual_name'].value_counts()
print(title_counts)

# Rows whose stripped title contains 'My Way' (substring match, so other
# titles that merely include the phrase are counted too).
my_way_mask = all_genres['actual_name'].str.contains('My Way')
print(my_way_mask.sum())
all_genres.loc[my_way_mask].head(50)

What A Wonderful World    39
My Way                    22
Hallelujah                19
Georgia On My Mind        18
The Christmas Song        18
                          ..
Country Roads              1
Life In Technicolor        1
Two Worlds Collide         1
Mojado                     1
Goodnight Chicago          1
Name: actual_name, Length: 15868, dtype: int64
45


Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name
426,My Way (ver 4),Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,"And now, the end is near,\r\nAnd so I face, th...","['C', 'Em', 'Gm', 'A7', 'Dm', 'Dm7', 'G7', 'C'...",Jazz,My Way
429,My Way,Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,"And now, the end is near,\r\nAnd so I face the...","['D', 'Dmaj7', 'D7', 'B7', 'Em7', 'Em7/D', 'Em...",Jazz,My Way
458,My Way (ver 6),Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,N.C. - No chords ^ = Emphasise the beats\r\n|...,"['Am7', 'C', 'C', 'Em/B', 'Gm6/Bb', 'A7', 'A7s...",Jazz,My Way
787,My Way (ver 2),Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,"And now, the end is near\r\nAnd so I face the ...","['F', 'Fm7', 'Cm', 'D7', 'Gm', 'Gm7', 'C7', 'F...",Jazz,My Way
1063,Stumble On My Way,Norah Jones,https://tabs.ultimate-guitar.com/tab/norah-jon...,Above the clouds\r\nI found a place\r\nWhere I...,"['Bb', 'Ebmaj7', 'Eb', 'Bb', 'Eb', 'Bb', 'Ebma...",Jazz,Stumble On My Way
1134,My Way (ver 3),Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,"and now, the end is near; \t\r\nand so i face ...","['C', 'Em7', 'B', 'Em7', 'Bb', 'A7', 'A', 'Dm'...",Jazz,My Way
1135,My Way (ver 5),Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,"And now, the end is near,\r\nAnd so I face the...","['D', 'F#m', 'F#m7', 'B7', 'Em', 'Em7', 'A7', ...",Jazz,My Way
1136,My Way (ver 7),Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,"And now the end is near, and so I face the fin...","['D', 'Dmaj7/C#', 'Am6/C', 'B7', 'Em', 'Em7/D'...",Jazz,My Way
1712,My Way (ver 9),Frank Sinatra,https://tabs.ultimate-guitar.com/tab/frank-sin...,"Anda now, the end is near\r\nAnd so I face the...","['D', 'F#m/C#', 'F#m7-5/C', 'B7', 'Em', 'Em/D'...",Jazz,My Way
1844,On My Way Home To You,Michael Franks,https://tabs.ultimate-guitar.com/tab/michael-f...,Lost my way again\r\nAnd it wasn't hard to do\...,"['DM7', 'Am', 'GM7', 'Gm', 'F#m', 'B7', 'Em', ...",Jazz,On My Way Home To You


Notice some songs are performed by different s