## Pre-process scraped data

In [2]:
import pandas as pd 
from collections import Counter
import re

### 1. Construct dataframe of all songs

In [6]:
import os 
genres = ['Country', 'Jazz', 'Pop', 'Rock']
all_genres = pd.DataFrame()
for genre in genres:
    basepath = f'./scrapped_data/{genre}/'
    filepaths = [basepath + f for f in os.listdir(f'./scrapped_data/{genre}/') if f.endswith('.csv')]
    genre_df = pd.concat(map(pd.read_csv, filepaths))
    genre_df['genre'] = genre
    all_genres = pd.concat([all_genres, genre_df], ignore_index=True).drop(columns='Unnamed: 0')
all_genres

Unnamed: 0,name,author,link,lyrics,chords,genre
0,Ring Of Fire,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Love Is A Burning Thing\r\nAnd It Makes A Fier...,"['G', 'C', 'G', 'C', 'G', 'D', 'G', 'D', 'G', ...",Country
1,Leaving On A Jet Plane,John Denver,https://tabs.ultimate-guitar.com/tab/john-denv...,"All my bags are packed, I'm ready to go\r\nI'm...","['G', 'C', 'G', 'C', 'G', 'C', 'D', 'G', 'C', ...",Country
2,Hit The Road Jack,Ray Charles,https://tabs.ultimate-guitar.com/tab/ray-charl...,Hit the road Jack and don't you come back no m...,"['Am', 'G', 'F', 'E7', 'Am', 'G', 'F', 'E7', '...",Country
3,The End Of The World,Skeeter Davis,https://tabs.ultimate-guitar.com/tab/skeeter-d...,Why does the sun go on shining\r\nWhy does the...,"['G', 'D', 'Em', 'Bm', 'Am', 'Bm', 'E7', 'Am',...",Country
4,Mama Tried,Merle Haggard,https://tabs.ultimate-guitar.com/tab/merle-hag...,The first thing I remember knowing was a lones...,"['D', 'G', 'D', 'G', 'D', 'A7', 'D', 'G', 'D',...",Country
...,...,...,...,...,...,...
25107,Fake Love Dont Last,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,"I watched a movie scene, got déjà vu\r\n'Cause...","['Em', 'G', 'B', 'C', 'Em', 'G', 'B', 'C', 'Em...",Rock
25108,Sid And Nancy,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,I knew a girl who'd wear my t-shirts when she ...,"['D', 'F#m', 'E', 'D', 'F#m', 'E', 'D', 'F#m',...",Rock
25109,Forever,Stereophonics,https://tabs.ultimate-guitar.com/tab/stereopho...,Sun beats down on my mind on a friday morning...,"['G', 'C', 'G', 'C', 'G', 'C', 'G', 'C', 'D', ...",Rock
25110,Goodnight Chicago,Rainbow Kitten Surprise,https://tabs.ultimate-guitar.com/tab/rainbow-k...,Twenty years to see New York reflected on subw...,"['C', 'C', 'Am', 'F', 'C', 'C', 'C', 'Am', 'F'...",Rock


### 2. Remove duplicates

Parse real name of a song i.e. remove the version. Notice song on index 7 and 8.

In [7]:
all_genres['actual_name'] = all_genres.name.apply(lambda name : name.split(' (ver')[0])
all_genres.head(10)

Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name
0,Ring Of Fire,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Love Is A Burning Thing\r\nAnd It Makes A Fier...,"['G', 'C', 'G', 'C', 'G', 'D', 'G', 'D', 'G', ...",Country,Ring Of Fire
1,Leaving On A Jet Plane,John Denver,https://tabs.ultimate-guitar.com/tab/john-denv...,"All my bags are packed, I'm ready to go\r\nI'm...","['G', 'C', 'G', 'C', 'G', 'C', 'D', 'G', 'C', ...",Country,Leaving On A Jet Plane
2,Hit The Road Jack,Ray Charles,https://tabs.ultimate-guitar.com/tab/ray-charl...,Hit the road Jack and don't you come back no m...,"['Am', 'G', 'F', 'E7', 'Am', 'G', 'F', 'E7', '...",Country,Hit The Road Jack
3,The End Of The World,Skeeter Davis,https://tabs.ultimate-guitar.com/tab/skeeter-d...,Why does the sun go on shining\r\nWhy does the...,"['G', 'D', 'Em', 'Bm', 'Am', 'Bm', 'E7', 'Am',...",Country,The End Of The World
4,Mama Tried,Merle Haggard,https://tabs.ultimate-guitar.com/tab/merle-hag...,The first thing I remember knowing was a lones...,"['D', 'G', 'D', 'G', 'D', 'A7', 'D', 'G', 'D',...",Country,Mama Tried
5,Cocaine Blues,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,"Early one mornin' while makin' the rounds, I t...","['C', 'G', 'C', 'C', 'G', 'C', 'C', 'G', 'C', ...",Country,Cocaine Blues
6,King Of The Road (ver 4),Roger Miller,https://tabs.ultimate-guitar.com/tab/roger-mil...,"Trailers for sale or rent,\r\nRooms to let fif...","['A', 'D', 'E', 'A', 'A', 'D', 'E', 'A', 'D', ...",Country,King Of The Road
7,Jackson,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,We got married in a fever hotter than a pepper...,"['C', 'C', 'C7', 'C', 'F', 'C', 'C', 'F', 'G7'...",Country,Jackson
8,Crazy (ver 2),Patsy Cline,https://tabs.ultimate-guitar.com/tab/patsy-cli...,"Crazy, I'm crazy for feeling so lonely\r\nI'm ...","['C', 'C', 'A7', 'A7', 'Dm', 'Ebm7', 'Dm7', 'D...",Country,Crazy
9,Lay Lady Lay,Bob Dylan,https://tabs.ultimate-guitar.com/tab/bob-dylan...,"Lay lady lay, lay across my big brass bed\...","['A', 'C#m', 'G', 'Bm', 'A', 'C#m', 'G', 'Bm',...",Country,Lay Lady Lay


In [8]:
print(all_genres.actual_name.value_counts())
all_genres.loc[all_genres.actual_name.str.contains('What A Wonderful World')].head(10)

What A Wonderful World    39
My Way                    22
Hallelujah                19
Georgia On My Mind        18
The Christmas Song        18
                          ..
Country Roads              1
Life In Technicolor        1
Two Worlds Collide         1
Mojado                     1
Goodnight Chicago          1
Name: actual_name, Length: 15868, dtype: int64


Unnamed: 0,name,author,link,lyrics,chords,genre,actual_name
218,What A Wonderful World,Stacey Kent,https://tabs.ultimate-guitar.com/tab/stacey-ke...,"I see trees of green, red roses too\r\n ...","['A', 'C#m7', 'Dmaj7', 'C#m7', 'Bm7', 'A', 'C#...",Jazz,What A Wonderful World
425,What A Wonderful World (ver 9),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'Em', 'F', 'Em', 'Dm', 'C', 'E7', 'Am', ...",Jazz,What A Wonderful World
430,What A Wonderful World (ver 2),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\n I...","['F', 'Am', 'Bb', 'Am', 'Gm7', 'F', 'A7', 'Dm'...",Jazz,What A Wonderful World
440,What A Wonderful World,Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\n ...","['F', 'Am', 'Bb', 'Am', 'Gm', 'F', 'A7', 'Dm',...",Jazz,What A Wonderful World
453,What A Wonderful World (ver 3),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['G', 'Bm', 'C', 'Bm', 'Am7', 'G', 'B7', 'Em',...",Jazz,What A Wonderful World
466,What A Wonderful World (ver 11),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'G', 'Am', 'Em', 'Dm', 'C', 'E7', 'Am', ...",Jazz,What A Wonderful World
470,What A Wonderful World (ver 4),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['F', 'Am', 'Bb', 'Am', 'Gm7', 'F', 'A7', 'Dm'...",Jazz,What A Wonderful World
569,What A Wonderful World,Willie Nelson,https://tabs.ultimate-guitar.com/tab/willie-ne...,"I see trees of green, red roses too\r\nI see t...","['D', 'F#m', 'G', 'F#m', 'Em', 'D', 'F#7', 'Bm...",Jazz,What A Wonderful World
771,What A Wonderful World (ver 14),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too,\r\nI see ...","['C', 'G', 'Am', 'Em', 'F', 'Em', 'Dm', 'Am', ...",Jazz,What A Wonderful World
782,What A Wonderful World (ver 7),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'Em', 'F', 'Em', 'Dm7', 'C', 'E7', 'Am',...",Jazz,What A Wonderful World


Remove duplicate songs that have the same name, have same author, but in different versions.

In [9]:
all_genres_unique = all_genres.drop_duplicates(subset=['actual_name', 'author']).reset_index()
all_genres_unique

Unnamed: 0,index,name,author,link,lyrics,chords,genre,actual_name
0,0,Ring Of Fire,Johnny Cash,https://tabs.ultimate-guitar.com/tab/johnny-ca...,Love Is A Burning Thing\r\nAnd It Makes A Fier...,"['G', 'C', 'G', 'C', 'G', 'D', 'G', 'D', 'G', ...",Country,Ring Of Fire
1,1,Leaving On A Jet Plane,John Denver,https://tabs.ultimate-guitar.com/tab/john-denv...,"All my bags are packed, I'm ready to go\r\nI'm...","['G', 'C', 'G', 'C', 'G', 'C', 'D', 'G', 'C', ...",Country,Leaving On A Jet Plane
2,2,Hit The Road Jack,Ray Charles,https://tabs.ultimate-guitar.com/tab/ray-charl...,Hit the road Jack and don't you come back no m...,"['Am', 'G', 'F', 'E7', 'Am', 'G', 'F', 'E7', '...",Country,Hit The Road Jack
3,3,The End Of The World,Skeeter Davis,https://tabs.ultimate-guitar.com/tab/skeeter-d...,Why does the sun go on shining\r\nWhy does the...,"['G', 'D', 'Em', 'Bm', 'Am', 'Bm', 'E7', 'Am',...",Country,The End Of The World
4,4,Mama Tried,Merle Haggard,https://tabs.ultimate-guitar.com/tab/merle-hag...,The first thing I remember knowing was a lones...,"['D', 'G', 'D', 'G', 'D', 'A7', 'D', 'G', 'D',...",Country,Mama Tried
...,...,...,...,...,...,...,...,...
17730,25106,5150,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,Bruises don't heal overnight\r\nI'm a few sips...,"['C', 'G', 'D', 'Em', 'C', 'G', 'D', 'Em', 'C'...",Rock,5150
17731,25107,Fake Love Dont Last,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,"I watched a movie scene, got déjà vu\r\n'Cause...","['Em', 'G', 'B', 'C', 'Em', 'G', 'B', 'C', 'Em...",Rock,Fake Love Dont Last
17732,25108,Sid And Nancy,Machine Gun Kelly,https://tabs.ultimate-guitar.com/tab/machine-g...,I knew a girl who'd wear my t-shirts when she ...,"['D', 'F#m', 'E', 'D', 'F#m', 'E', 'D', 'F#m',...",Rock,Sid And Nancy
17733,25109,Forever,Stereophonics,https://tabs.ultimate-guitar.com/tab/stereopho...,Sun beats down on my mind on a friday morning...,"['G', 'C', 'G', 'C', 'G', 'C', 'G', 'C', 'D', ...",Rock,Forever


Notice some songs are performed by different artists and should also be removed based on the lyrics I guess. See example:

In [10]:
print(all_genres_unique.actual_name.value_counts())
all_genres_unique.loc[all_genres_unique.actual_name.str.contains('What a wonderful world', flags=re.IGNORECASE)].head(10)

You And I                 10
Easy                       9
Home                       9
Run                        8
Stay                       8
                          ..
Find You                   1
Tossico Indipendente       1
Puntino Intergalattico     1
Western Wind               1
Goodnight Chicago          1
Name: actual_name, Length: 15868, dtype: int64


Unnamed: 0,index,name,author,link,lyrics,chords,genre,actual_name
188,218,What A Wonderful World,Stacey Kent,https://tabs.ultimate-guitar.com/tab/stacey-ke...,"I see trees of green, red roses too\r\n ...","['A', 'C#m7', 'Dmaj7', 'C#m7', 'Bm7', 'A', 'C#...",Jazz,What A Wonderful World
388,425,What A Wonderful World (ver 9),Louis Armstrong,https://tabs.ultimate-guitar.com/tab/louis-arm...,"I see trees of green, red roses too\r\nI see t...","['C', 'Em', 'F', 'Em', 'Dm', 'C', 'E7', 'Am', ...",Jazz,What A Wonderful World
487,569,What A Wonderful World,Willie Nelson,https://tabs.ultimate-guitar.com/tab/willie-ne...,"I see trees of green, red roses too\r\nI see t...","['D', 'F#m', 'G', 'F#m', 'Em', 'D', 'F#7', 'Bm...",Jazz,What A Wonderful World
1036,1300,What A Wonderful World,Eva Cassidy,https://tabs.ultimate-guitar.com/tab/eva-cassi...,"I see trees that are green, and red roses too...","['D', 'D7M', 'G', 'D', 'Em', 'D', 'F#', 'Bm', ...",Jazz,What A Wonderful World
1858,2373,What A Wonderful World,Jon Batiste,https://tabs.ultimate-guitar.com/tab/jon-batis...,"I see trees of green, red roses too\r\nI see t...","['E', 'A', 'E', 'Asus2', 'Asus2', 'E', 'C#m', ...",Jazz,What A Wonderful World
6810,8755,Somewhere Over The Rainbow What A Wonderful World,Robin Schulz,https://tabs.ultimate-guitar.com/tab/robin-sch...,Somewhere over the rainbow\r\n Bluebirds fly\...,"['C', 'Em', 'F', 'C', 'F', 'C', 'G', 'Am', 'F'...",Pop,Somewhere Over The Rainbow What A Wonderful World
16449,22543,What A Wonderful World,Joey Ramone,https://tabs.ultimate-guitar.com/tab/joey-ramo...,"I see trees of green / Red roses, too / I see ...","['F', 'Am', 'Bb', 'Am', 'Gm', 'F', 'A7', 'Dm',...",Rock,What A Wonderful World


### 3. Remove non-English songs

In [11]:
from langdetect import detect_langs, detect

def get_lang(text):
    print(f'{text} -- {detect_langs(text)}')
    print(detect(text))
get_lang('Hello')
get_lang('hi what is up')
get_lang('bonjour comment ca va')
get_lang('hej vad gör du här ens')

Hello -- [fi:0.6208849748506334, no:0.3791150251493665]
fi
hi what is up -- [en:0.9999972728093856]
en
bonjour comment ca va -- [fr:0.9999963301771558]
fr
hej vad gör du här ens -- [sv:0.9999970564347715]
sv


In [13]:
print(all_genres_unique.iloc[0, :].lyrics[:200])

Love Is A Burning Thing
And It Makes A Fiery Ring
Bound By Wild Desire
I Fell Into A Ring Of Fire
I Fell Into A Burning Ring Of Fire
I Went Down, Down, Down
And The Flames Went Higher
The Ring 


In [15]:
detect_langs(all_genres_unique.iloc[0, :].lyrics[200:])

[en:0.9999961733585063]

In [17]:
all_genres_unique['language'] = all_genres_unique['lyrics'].apply(lambda lyrics : detect(lyrics[:10]))
all_genres_unique.head(50)

LangDetectException: No features in text.

Discover why "LangDetectException: No features in text." error is given.

In [175]:
for lyrics in all_genres_unique['lyrics']:
    try:
        language = detect(lyrics)
    except:
        language = 'error'
        print(f'This throws an error: \n{lyrics[10:]}')

This throws an error: 
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         

This throws an error: 
|--------|
|--------|--------|--------|--------|--------|--------|--------|--------|
|--------|--------|--------|--------|--------|--------|--------|--------|
|--------|--------|--------|--------|--------|--------|--------|--------|
|--------|--------|--------|--------|--------|--------|--------|--------|
|--------|--------|--------|--------|--------|--------|

This throws an error: 
|--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--|
|--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--|
|--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--|
|--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--|
|--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=--+--=--=-