In [201]:
import pandas as pd
import numpy as np

In [202]:
#import file
f = open("../scraped_examples/" + "sheeranChordsLyrics.txt","r")
full_text = f.read()

In [203]:
full_text = full_text.replace("\\r\\n","£")
full_text = "[ch]" + full_text

In [204]:
lyric_sections = full_text.split("££")

In [205]:
tags = ["Intro", "Interlude", "Chorus", "Pre-Chorus", "Verse", "Outro"]

In [206]:
# extract lyric/chord line pairs

chord_list = []
lyric_list = []
section_list = []
i=-1
for section in lyric_sections:
    i+=1
    
    # each section can be 1: just chords, 2: tag then chord/lyric pairs, 
    # 3: no tag and one line of chords, 4: no tag & multiple lines of chords
    # chord/lyric pairs must have line breaks
    
    # check for pairs
    if section.find("[/tab]")!=-1:

        # find potential tag
        space_loc = section.find(" ")
        bracket_loc = section.find("]")
        tag_loc = min(space_loc, bracket_loc)
        tag = section[section.find("[")+1:tag_loc]

        # section has TAG
        if tag in tags:
            # print(str(i) + ": TAG " + tag)

            # remove tag
            new_section = section[section.find("£")+1:]
        
            # extract tab sections
            pair_list = new_section.split("[/tab]")
            for pair in pair_list:
                if len(pair)>0:
                    start_loc = pair.find("[tab]")+5
                    if start_loc == -1:
                        start_loc = 0
                    pound_loc = pair[start_loc:].find("£")
                    chords = pair[start_loc:pound_loc+start_loc]
                    lyrics = pair[pound_loc+start_loc+1:]

                    chords = chords.replace("[ch]","")
                    chords = chords.replace("[/ch]","")
                    chord_list += [chords]
                    lyric_list += [lyrics]

        # section has NO TAG
        else:
            # print(str(i) + ": no tag, multiple lines " + tag)
            if section.find("[tab]")!=-1: print("contains pairs")
            
    # section has no pairs 
    # else: print(str(i) + ": no pairs")

In [207]:
# pair words & chords together

pair_tuples = ()
# iterate through lyric/chord list
for line in range(len(lyric_list)):
    # split lyrics and chords on each space character
    phrase = lyric_list[line]
    # find lyric space locations
    space_locs = [0] + [i+1 for i in range(len(phrase)) if (phrase[i]==" " and phrase[i+1]!=" ")] + [len(phrase)]
    
    split_words = [phrase[space_locs[i]:space_locs[i+1]] for i in range(len(space_locs)-1)]
    split_chords = [chord_list[line][space_locs[i]:space_locs[i+1]] for i in range(len(space_locs)-1)]
    pair_tuples += tuple(zip(split_words, split_chords))

In [208]:
# trim spaces, lower, and remove commas

cleaned_pairs = [[str.strip(pair[i]).lower().replace(",","") 
                  if i==0 else str.strip(pair[i]) 
                  for i in range(len(pair))] for pair in pair_tuples]

In [209]:
# add repeats of pairs when one word has multiple chords
chord_pairs = []
for pair in cleaned_pairs:
    # find multiple chords with one word
    split_chords = pair[1].split()
    if pair[1]!='' and split_chords!=[pair[1]]:
        for chord in split_chords:
            chord_pairs.append([pair[0], chord])
    else:
        chord_pairs.append(pair)

In [210]:
# find first chord from intro

# is first chord empty
if chord_pairs[0][1]=='':
    # find first [tab] instance
    tab_loc = full_text.find("[tab]")
    # then find previous [/ch] instance (flip text then find first instance)
    ch_search_text = full_text[:tab_loc][::-1]
    ch_loc = ch_search_text.find("]hc/[")
    # extract chord just after that
    chord_backwards = ch_search_text[ch_loc+5:ch_loc+5+ch_search_text[ch_loc+5:].find("]")]
    # insert in first chord space
    chord_pairs[0][1] = chord_backwards[::-1]

In [211]:
chord_pairs

[['i', 'G'],
 ['found', ''],
 ['a', ''],
 ['love', 'G'],
 ['for', ''],
 ['me', 'Em'],
 ['darling', ''],
 ['just', ''],
 ['dive', 'C'],
 ['right', ''],
 ['in', ''],
 ['and', ''],
 ['follow', ''],
 ['my', ''],
 ['lead', 'D'],
 ['well', ''],
 ['i', ''],
 ['found', ''],
 ['a', ''],
 ['girl', 'G'],
 ['beautiful', 'Em'],
 ['and', ''],
 ['sweet', ''],
 ['i', ''],
 ['never', ''],
 ['knew', 'C'],
 ['you', ''],
 ['were', ''],
 ['the', ''],
 ['someone', ''],
 ['waiting', ''],
 ['for', ''],
 ['me', 'D'],
 ['cause', ''],
 ['we', ''],
 ['were', ''],
 ['just', ''],
 ['kids', ''],
 ['when', ''],
 ['we', ''],
 ['fell', 'G'],
 ['in', ''],
 ['love', ''],
 ['not', ''],
 ['knowing', ''],
 ['what', 'Em'],
 ['it', ''],
 ['was', ''],
 ['i', ''],
 ['will', ''],
 ['not', ''],
 ['give', 'C'],
 ['you', ''],
 ['up', ''],
 ['this', ''],
 ['ti-ime', 'G'],
 ['ti-ime', 'D'],
 ['darling', ''],
 ['just', ''],
 ['kiss', 'G'],
 ['me', ''],
 ['slow', ''],
 ['your', ''],
 ['heart', ''],
 ['is', ''],
 ['all', 'Em'],
 ['i', '

In [212]:
# CLEANING
# trim spaces
# convert to all lowercase
# remove commas and dashes?: unnecessary characters
# forward filling

In [213]:
pair_df = pd.DataFrame(chord_pairs, columns=['lyric','chord'])

In [214]:
pair_df.chord = pair_df['chord'].replace('',np.nan).ffill()

In [215]:
pair_df

Unnamed: 0,lyric,chord
0,i,G
1,found,G
2,a,G
3,love,G
4,for,G
...,...,...
281,this,C
282,you,Dsus
283,look,4
284,perfect,D
