In [34]:
import os

import pandas as pd

In [35]:
# Load the lyrics into dataframes
def _read_lyrics_csv(csv_path):
    """Read one lyrics CSV and return it with descriptive column names.

    Columns named 'Unnamed: 0' and '0' are renamed to 'Title-Artist' and
    'Lyrics' respectively; any other column is left untouched.
    """
    return pd.read_csv(csv_path).rename(
        columns={'Unnamed: 0' : 'Title-Artist', '0' : 'Lyrics'}
    )


# One raw dataframe per genre, each read from its own file under raw_data/.
country_lyrics = _read_lyrics_csv(r'raw_data/country_lyrics.csv')
hiphop_lyrics = _read_lyrics_csv(r'raw_data/hiphop_lyrics.csv')
pop_lyrics = _read_lyrics_csv(r'raw_data/pop_lyrics.csv')
rock_lyrics = _read_lyrics_csv(r'raw_data/rock_lyrics.csv')

In [36]:
# Standard section-header names looked for in the lyrics. They are used both
# to drop malformed lyrics and to normalise the bracketed header lines.
section_headers = [
    'Intro',
    'Verse',
    'Refrain',
    'Pre-Chorus',
    'Pre Chorus',
    'Chorus',
    'Post-Chorus',
    'Post Chorus',
    'Hooks',
    'Riffs/Basslines',
    'Scratches',
    'Sampling',
    'Bridge',
    'Interlude',
    'Skit',
    'Collision',
    'Instrumental or Solo',
    'Instrumental',
    'Solo',
    'Ad-lib',
    'Ad lib',
    'Segue',
    'Outro',
]

# The same names joined with '|', i.e. a regex alternation over all headers.
section_headers_str = '|'.join(name for name in section_headers)

def drop_invalid_lyrics(df, inplace=False):
    """Keep only the rows whose lyrics open with a recognised section header.

    A row is valid when its 'Lyrics' text starts with '[' and one of the
    names in ``section_headers_str`` occurs within its first 20 characters.
    Rows with missing lyrics count as invalid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Lyrics' column.
    inplace : bool, default False
        If True, the invalid rows are removed from ``df`` itself and ``df``
        is returned. Rows are removed by index label, so this mode assumes
        the index labels are unique (as with a freshly read CSV).
        If False, ``df`` is left untouched.

    Returns
    -------
    pandas.DataFrame
        The valid rows: ``df`` itself when ``inplace`` is True, otherwise an
        independent copy that can be modified without affecting ``df``.
    """
    lyrics = df['Lyrics']
    # na=False makes missing lyrics evaluate to False rather than NaN, so the
    # combined mask is a clean boolean Series.
    is_valid = (
        lyrics.str.startswith('[', na=False)
        & lyrics.str[:20].str.contains(section_headers_str, na=False)
    )

    if inplace:
        # Rebinding the local name would not touch the caller's frame; drop
        # the invalid rows from the object that was passed in instead.
        df.drop(index=df.index[~is_valid], inplace=True)
        return df

    # Explicit copy so later column assignments on the result are unambiguous.
    return df[is_valid].copy()

def clean_section_headers(lyrics):
    """Normalise the bracketed section-header lines of one song's lyrics.

    Lines that do not start with '[' are kept unchanged. A line that starts
    with '[' is replaced as follows (first matching rule wins):

    * contains 'Pre-Chorus' / 'Pre Chorus'   -> '[Pre-Chorus]'
    * contains 'Post-Chorus' / 'Post Chorus' -> '[Post-Chorus]'
    * contains 'Ad-lib' / 'Ad lib'           -> '[Ad-lib]'
    * contains 'Instrumental'                -> '[Instrumental]'
    * contains 'Solo'                        -> '[Solo]'
    * contains any other name from ``section_headers`` -> that name in
      brackets, e.g. '[Verse 1: Artist]' -> '[Verse]'
    * contains no known header name (e.g. '[Produced by ...]') -> ''

    Parameters
    ----------
    lyrics : str
        Full lyrics text, lines separated by newlines.

    Returns
    -------
    str
        The lyrics with header lines normalised, joined with '\\n'.
    """
    # Spelling variants that collapse to one tag. They are tested before the
    # generic lookup so that e.g. 'Pre-Chorus' is not reported as 'Chorus'.
    priority_tags = (
        (('Pre-Chorus', 'Pre Chorus'), '[Pre-Chorus]'),
        (('Post-Chorus', 'Post Chorus'), '[Post-Chorus]'),
        (('Ad-lib', 'Ad lib'), '[Ad-lib]'),
        (('Instrumental',), '[Instrumental]'),
        (('Solo',), '[Solo]'),
    )

    def normalise(line):
        if not line.startswith('['):
            return line

        for variants, tag in priority_tags:
            if any(variant in line for variant in variants):
                return tag

        found = [header for header in section_headers if header in line]
        if not found:
            # Bracketed line without any known header name: blank it out.
            return ''

        # Usual layout is '[Header N: Artist]', where the first word inside
        # the brackets is the header itself.
        first_word = line.split('[')[1].split(']')[0].split(':')[0].split(' ')[0]
        if first_word in found:
            return '[' + first_word + ']'

        # Otherwise ('[Final Chorus]', '[Artist: Verse 2]') the first word is
        # not a header; use the known header that appears earliest instead.
        return '[' + min(found, key=line.index) + ']'

    return '\n'.join(normalise(line) for line in lyrics.splitlines())

In [37]:
def _clean_and_export(raw_lyrics, genre):
    """Filter, normalise and write out the lyrics of one genre.

    Two CSV files are written:

    * cleaned_lyrics/<genre>_lyrics_cleaned.csv: the rows kept by
      ``drop_invalid_lyrics``, lyrics text unchanged;
    * cleaned_section_headers/<genre>_lyrics_cleaned.csv: the same rows
      with ``clean_section_headers`` applied to every 'Lyrics' value.

    Both output directories are created if they do not exist yet.

    Returns the dataframe written to the second file.
    """
    valid_lyrics = drop_invalid_lyrics(raw_lyrics)

    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs('cleaned_lyrics', exist_ok=True)
    valid_lyrics.to_csv(f'cleaned_lyrics/{genre}_lyrics_cleaned.csv')

    # .assign builds a new frame rather than writing a column into the
    # filtered result, which avoids chained-assignment warnings.
    normalised_lyrics = valid_lyrics.assign(
        Lyrics=valid_lyrics['Lyrics'].map(clean_section_headers)
    )
    os.makedirs('cleaned_section_headers', exist_ok=True)
    normalised_lyrics.to_csv(f'cleaned_section_headers/{genre}_lyrics_cleaned.csv')
    return normalised_lyrics


country_lyrics_cleaned = _clean_and_export(country_lyrics, 'country')
hiphop_lyrics_cleaned = _clean_and_export(hiphop_lyrics, 'hiphop')
pop_lyrics_cleaned = _clean_and_export(pop_lyrics, 'pop')
rock_lyrics_cleaned = _clean_and_export(rock_lyrics, 'rock')


In [38]:
# Row count of each raw genre frame, one per line:
# country, hip-hop, pop, rock.
print(
    *(len(raw_frame) for raw_frame in (country_lyrics, hiphop_lyrics, pop_lyrics, rock_lyrics)),
    sep='\n',
)

1184
1704
1338
1601


In [39]:
# Row count of each cleaned genre frame, one per line:
# country, hip-hop, pop, rock.
cleaned_counts = map(
    len,
    (country_lyrics_cleaned, hiphop_lyrics_cleaned, pop_lyrics_cleaned, rock_lyrics_cleaned),
)
print(*cleaned_counts, sep='\n')

937
1375
1212
1386
