In [56]:
import pandas as pd
import numpy as np
import nltk

import re
import math
import statistics

In [57]:
# Load the combined dataset, then keep only the four columns this notebook uses.
df = pd.read_csv('combined_output_Nov22.csv')
df = df[['artist', 'song', 'majority_genre', 'lyrics']]

In [58]:
# Class balance of the raw data: number of songs per majority genre.
df.majority_genre.value_counts()

Rock          539
Pop           160
Metal          88
Electronic     66
RnB            52
Rap            52
Country        48
Jazz           35
Punk           30
Folk           29
Latin          18
Reggae         18
Blues          15
World           5
New Age         3
Name: majority_genre, dtype: int64

In [59]:
# Remove the six genres with the fewest songs in the counts above,
# then re-check the class balance of what remains.
df = df.loc[
    ~df['majority_genre'].isin(
        ['New Age', 'World', 'Blues', 'Latin', 'Reggae', 'Folk']
    )
]
df.majority_genre.value_counts()

Rock          539
Pop           160
Metal          88
Electronic     66
RnB            52
Rap            52
Country        48
Jazz           35
Punk           30
Name: majority_genre, dtype: int64

In [60]:
from sklearn.model_selection import train_test_split

# First split: half of the songs go to training, stratified by genre.
# `X_test` / `y_test` temporarily hold the other half.
X_train, X_test, y_train, y_test = train_test_split(
    df['lyrics'],
    df['majority_genre'],
    stratify=df['majority_genre'],
    test_size=.5,
    random_state=1,
)

# Second split: divide that held-out half evenly into test and validation,
# again stratified by genre.
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    stratify=y_test,
    test_size=.5,
    random_state=1,
)

y_train.value_counts()

Rock          269
Pop            80
Metal          44
Electronic     33
RnB            26
Rap            26
Country        24
Jazz           18
Punk           15
Name: majority_genre, dtype: int64

In [61]:
# Pair each split's lyrics with its labels in a two-column frame
# ('lyrics', 'genre'); the same concat is applied to all three splits.
train, test, val = (
    pd.concat({'lyrics': lyrics, 'genre': genre}, axis=1)
    for lyrics, genre in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
)

In [62]:
# Class balance of the training split.
train['genre'].value_counts()

Rock          269
Pop            80
Metal          44
Electronic     33
RnB            26
Rap            26
Country        24
Jazz           18
Punk           15
Name: genre, dtype: int64

In [63]:
from sklearn.utils import resample

# Balance the training split by random oversampling.
# The largest genre is kept as-is; every other genre is resampled with
# replacement up to the size of the largest one.
class_sizes = train.genre.value_counts()
target_size = int(class_sizes.max())

# Collect the per-genre pieces and concatenate once, rather than growing a
# DataFrame (starting from an empty one) inside the loop.
balanced_parts = []
for genre, size in class_sizes.items():
    genre_rows = train[train.genre == genre]
    if size == target_size:
        # Majority class: keep the original rows untouched.
        genre_sampled = genre_rows
    else:
        genre_sampled = resample(genre_rows, replace=True, n_samples=target_size, random_state=42)
    balanced_parts.append(genre_sampled)

# Shuffle with a fixed seed so the exported CSV is the same on every run.
train_sampled = pd.concat(balanced_parts).sample(frac=1, random_state=42)
train_sampled

Unnamed: 0,lyrics,genre
994,La la la la la lee la la la\nFine night tonigh...,Electronic
145,"Mary Lou, Mary LouI can't go on living without...",Electronic
59,"OI oi oi poloi, bags of noise, wake up, shake ...",Rap
803,You won't think twice when you realize\nSee wh...,Punk
933,You're comin' tonight\r\nTo my home town\r\nI'...,Pop
...,...,...
612,"Lubna & Tricky:\r\nI realize, there's no compr...",Electronic
658,"Radio killa killa, you know how we do\r\nAh yo...",Rap
1143,"[Eminem]\r\nWhatever..\r\nDre, just let it run...",Rap
787,That innocent look in her green eyes\nSparkle ...,Country


In [64]:
from textattack.augmentation import EmbeddingAugmenter
from textaugment import Wordnet

# Try textattack's EmbeddingAugmenter on a toy lyric.
aug = EmbeddingAugmenter()
aug.augment('baby baby baby oooh')

# textaugment's Wordnet augmenter; `t.augment` is applied to the resampled
# lyrics when `train_augmented` is built.
t = Wordnet()

['baby baby baby ohh']

In [65]:
import mapply
# Configure mapply (registers a parallel `.mapply` counterpart to `.apply`).
# NOTE(review): no cell in view actually calls `.mapply(...)` — confirm this
# setup is still needed.
mapply.init(n_workers=-1, chunk_size=100, max_chunks_per_worker=10, progressbar=True)

In [81]:
# Build an augmented training set: each genre keeps all of its original
# ("gold") rows and is topped up to `target_size` rows with resampled rows
# whose lyrics have been rewritten by the Wordnet augmenter `t`.
target_size = 300

augmented_parts = []
for genre in train.genre.value_counts().index:
    gold_sample = train[train.genre == genre]
    augmented_parts.append(gold_sample)

    sample_size = target_size - gold_sample.shape[0]
    if sample_size <= 0:
        # Genre already has at least `target_size` rows: nothing to add
        # (and a non-positive n_samples must not be passed to resample).
        continue

    augmented_sample = resample(gold_sample, n_samples=sample_size, random_state=43)
    # `.assign` returns a new frame, avoiding a chained-assignment warning on
    # the slice returned by `resample`.
    # NOTE(review): the augmented rows displayed for this cell look lowercased;
    # confirm whether `t.augment` preserves line breaks, since the stanza
    # features computed later depend on them.
    augmented_sample = augmented_sample.assign(
        lyrics=augmented_sample.lyrics.apply(t.augment)
    )
    augmented_parts.append(augmented_sample)

# Single concat, then a seeded shuffle so the exported CSV is reproducible.
train_augmented = pd.concat(augmented_parts).sample(frac=1, random_state=43)
train_augmented

Unnamed: 0,lyrics,genre
553,how here's a little story - i've begin to tell...,Rock
852,notice me lead my hand why represent we strang...,Pop
830,j. ingram/t. snider i insure you reckon and i ...,Country
1000,(michael mcdonald) who would sell their soul f...,Country
555,there's a place in your heart and i know that ...,RnB
...,...,...
24,"Well if you don't like it\r\nGo ahead, find yo...",Rock
687,If you've been hiding from love\r\nIf you've b...,Pop
26,"The more I see you, the more I want you\nSomeh...",Jazz
641,"yesterday i witness the sun shinin', and the l...",Jazz


In [82]:
# Write every split to CSV without the index column.
# Each file is written exactly once (genre_train.csv used to be written twice).
for frame, path in (
    (train, 'genre_train.csv'),
    (test, 'genre_test.csv'),
    (val, 'genre_val.csv'),
    (train_sampled, 'genre_train_sampled.csv'),
    (train_augmented, 'genre_train_augmented.csv'),
):
    frame.to_csv(path, index=False)

In [157]:
def normalize_stanzas(row):
    """Normalise line and stanza breaks in ``row.lyrics`` and measure its layout.

    A "break" is a run of consecutive newlines (LF or CRLF). The mean break
    size over the whole lyric, rounded down, is used as the threshold:
    breaks no longer than the threshold are treated as ordinary line breaks
    and collapsed to a single newline; longer breaks are treated as stanza
    separators and collapsed to exactly two newlines.

    Args:
        row: Object exposing a string ``lyrics`` attribute (e.g. a DataFrame row).

    Returns:
        A 3-tuple ``(norm_lyrics, avg_stanza_len, avg_line_len)``:
        the normalised text, the mean number of lines per stanza, and the
        mean number of whitespace-separated words per line.
    """
    break_pattern = re.compile(r'(?:\r?\n)+')

    def break_size(match):
        # Count newlines rather than characters so that CRLF counts once.
        return match.group().count('\n')

    markers = [break_size(match) for match in break_pattern.finditer(row.lyrics)]
    stanza_mean = statistics.mean(markers) if len(markers) > 0 else 1
    line_break_max = math.floor(stanza_mean)

    # Classify every break in one pass. Two chained substitutions do not work
    # here: a quantifier written as '{n, }' (with a space) is literal text to
    # `re`, and a bounded '{1,n}' pattern cuts long runs into several pieces.
    norm_lyrics = break_pattern.sub(
        lambda match: '\n' if break_size(match) <= line_break_max else '\n\n',
        row.lyrics,
    )

    stanza_lens = []
    line_lens = []
    for stanza in norm_lyrics.split('\n\n'):
        lines = stanza.split('\n')
        stanza_lens.append(len(lines))
        for line in lines:
            line_lens.append(len(line.split()))
    return norm_lyrics, statistics.mean(stanza_lens), statistics.mean(line_lens)

# Spot-check the normaliser on a single training row (positional row 23).
# The result is not displayed: this is not the last expression in the cell.
normalize_stanzas(train.iloc[23])

def stanzitize(data):
    """Run ``normalize_stanzas`` over every row of ``data``.

    Args:
        data: DataFrame whose rows expose ``genre`` and ``lyrics``.

    Returns:
        A new DataFrame with a fresh default index and the columns
        ``genre``, ``lyrics`` (normalised text), ``avg_stanza_len`` and
        ``avg_line_len``, one row per input row, in input order.
    """
    columns = {
        'genre': [],
        'lyrics': [],
        'avg_stanza_len': [],
        'avg_line_len': [],
    }
    for _, record in data.iterrows():
        text, stanza_avg, line_avg = normalize_stanzas(record)
        # `columns` iterates in insertion order, matching the tuple below.
        for key, value in zip(columns, (record.genre, text, stanza_avg, line_avg)):
            columns[key].append(value)
    return pd.DataFrame(columns)

# Preview the stanza-normalised training split together with its two layout
# features (avg_stanza_len, avg_line_len).
stanzitize(train)

Unnamed: 0,genre,lyrics,avg_stanza_len,avg_line_len
0,Metal,"""Don`t care. It won`t change anything.\nPerhap...",5.000000,5.120000
1,Rock,"Come on, let's waste another thousand years\nS...",7.000000,6.285714
2,Pop,"First Floor (Uh, Oh) \nRoom Sixteen (Uh, Oh) \...",4.647059,5.367089
3,Rock,"Revenge, smarter than the radio,\nBetter than ...",4.500000,5.777778
4,Electronic,Monochromatic reasoning\nA divide and conquer ...,3.800000,4.842105
...,...,...,...,...
530,Rock,choose from any number of magazines\nwho do yo...,4.666667,5.321429
531,Rock,"Well, the look on the cake\nIt ain't always th...",6.600000,4.757576
532,Metal,The eyes of the patriot fixed through the scop...,6.666667,7.150000
533,Pop,I turn back time and now I'm back again\nThe w...,6.125000,7.000000


In [158]:
# Stanza-normalise every split; all five are computed before anything is written.
train_s, test_s, val_s, train_sampled_s, train_augmented_s = map(
    stanzitize, (train, test, val, train_sampled, train_augmented)
)

# Export each normalised split to its own CSV, without the index column.
for frame, path in (
    (train_s, 'genre_train_s.csv'),
    (test_s, 'genre_test_s.csv'),
    (val_s, 'genre_val_s.csv'),
    (train_sampled_s, 'genre_train_sampled_s.csv'),
    (train_augmented_s, 'genre_train_augmented_s.csv'),
):
    frame.to_csv(path, index=False)