# Blog Post 11: Deep Music Genre Classification
Well, I've been hoping to do a deep learning blog post all semester, so I'm really excited to give this one a shot! The first step will be to properly format the data, and then perform text vectorization, since lyrics are words, which cannot function as features. Then, I will collate batches. Finally, I will be prepared to create a model and evaluate its accuracy. I will create 3 models, one learning from just the lyrics, one learning from just the features, and one learning from both. Once the models have successfully performed better than the base rate, I will compare the accuracy across the three models to see which one would be the most promising to continue to work with.

In [128]:
import pandas as pd

url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

In [129]:
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [130]:
engineered_features = ['dating', 'violence', 'world/life', 'night/time','shake the audience','family/gospel', 'romantic', 'communication','obscene', 'music', 'movement/places', 'light/visual perceptions','family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability','loudness', 'acousticness', 'instrumentalness', 'valence', 'energy'] 

In [131]:
print(df["genre"].unique())

genres = {'pop': 0, 'country': 1, 'blues': 2, 'jazz': 3, 'reggae': 4, 'rock': 5, 'hip hop': 6}
df["genre"] = df["genre"].apply(genres.get)

# check to make sure all the genres were successfully converted to numbers
print(df["genre"].unique())

df.head(3)

['pop' 'country' 'blues' 'jazz' 'reggae' 'rock' 'hip hop']
[0 1 2 3 4 5 6]


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,0,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,0,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,0,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0


In [132]:
df.groupby("genre").size() / len(df)

genre
0    0.248202
1    0.191915
2    0.162273
3    0.135521
4    0.088045
5    0.142182
6    0.031862
dtype: float64

Looks like if we guess genre 0 (pop) every time, we will get a base accuracy of 24.8%. Let's see if we an do better than this.

In [133]:
from torch.utils.data import Dataset, DataLoader

class TextDataFromDF(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __getitem__(self, index):
        target = self.df['genre'].iloc[index]
        lyrics = self.df['lyrics'].iloc[index]
        features = self.df[engineered_features].iloc[index]
        return target, lyrics, features

    def __len__(self):
        return len(self.df)                

In [134]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df, shuffle = True, test_size = 0.2)
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

In [135]:
train_data[193]

(2,
 'walk talk breathe try smile death finger pulse time time rest simple proposal walk line try borrow time moral story limit crew gonna choose tomorrow scene easy turn money green promise future wrong line walk keep take long rainbow morning follow gonna live tomorrow like live today borrow look stay',
 dating                      0.001224
 violence                    0.115240
 world/life                  0.354443
 night/time                  0.170155
 shake the audience          0.001224
 family/gospel               0.001224
 romantic                    0.001224
 communication               0.001224
 obscene                     0.001224
 music                       0.001224
 movement/places             0.255235
 light/visual perceptions    0.001224
 family/spiritual            0.001224
 like/girls                  0.041008
 sadness                     0.001224
 feelings                    0.001224
 danceability                0.352323
 loudness                    0.597134
 acoustic

In [136]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

tokenized = tokenizer(train_data[193][1])
tokenized

['walk',
 'talk',
 'breathe',
 'try',
 'smile',
 'death',
 'finger',
 'pulse',
 'time',
 'time',
 'rest',
 'simple',
 'proposal',
 'walk',
 'line',
 'try',
 'borrow',
 'time',
 'moral',
 'story',
 'limit',
 'crew',
 'gonna',
 'choose',
 'tomorrow',
 'scene',
 'easy',
 'turn',
 'money',
 'green',
 'promise',
 'future',
 'wrong',
 'line',
 'walk',
 'keep',
 'take',
 'long',
 'rainbow',
 'morning',
 'follow',
 'gonna',
 'live',
 'tomorrow',
 'like',
 'live',
 'today',
 'borrow',
 'look',
 'stay']

In [137]:
def yield_tokens(data_iter):
    for target, text, _ in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"], min_freq = 50)
vocab.set_default_index(vocab["<unk>"])

In [138]:
import torch

max_len = 30
num_tokens = len(vocab.get_itos())
def text_pipeline(x):
    tokens = vocab(tokenizer(x))
    y = torch.zeros(max_len, dtype=torch.int64) + num_tokens
    if len(tokens) > max_len:
        tokens = tokens[0:max_len]
    y[0:len(tokens)] = torch.tensor(tokens,dtype=torch.int64)
    return y

label_pipeline = lambda x: int(x)

In [139]:
text_pipeline("we cant believe")

tensor([   0,    0,   44, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897,
        2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897,
        2897, 2897, 2897, 2897, 2897, 2897])

In [140]:
import numpy as np

def collate_batch(batch):
    target_label_list = []
    lyric_text_list = []
    features_list = []  

    for (_targets, _lyrics, _features) in batch:
        # process targets
        target_label_list.append(label_pipeline(_targets))

        # process lyrics
        processed_lyrics = text_pipeline(_lyrics)
        lyric_text_list.append(processed_lyrics)

        # process features
        features_list.append(_features.to_numpy())

    target_label_list = torch.tensor(target_label_list, dtype=(torch.int64))
    lyric_text_list = torch.stack(lyric_text_list)
    features_list = torch.tensor((features_list), dtype=torch.float64)

    return target_label_list, lyric_text_list, features_list

In [141]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [142]:
next(iter(train_loader))

(tensor([0, 0, 2, 0, 1, 6, 4, 2]),
 tensor([[  35,    1,  108,   92,    1,   62,    0,  295,  604,    2,  481,   35,
            77,    9,   14,  228,  246,  276,   35,   35,   58,   58,   58,   58,
           127,   96,   35,   81,   32,   25],
         [1264, 2779,   17,    0,    0,    0,  281,  830,  218,    0,   74,  167,
           862, 1878, 1762, 1117, 1454,  256,    0,   55,  154,  154,    0, 1739,
             0,    0,    0, 1473,  392,  496],
         [ 815,  439,  717,   29, 1252, 2876,  147,    0,  140,   84, 1222,  311,
          1669, 1295,  815,  439,  717,   29, 1252, 2876,  147,    0,  140,   84,
          1222,  311, 1669, 1295, 2897, 2897],
         [   0,  382,    0,    0,    0, 1963,  139,    0, 1942,    0, 1097, 2897,
          2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897, 2897,
          2897, 2897, 2897, 2897, 2897, 2897],
         [   0,   33,    0,    0,   82, 2469,    0,   81,  160,  170,  257,    0,
           259,   33,   77, 2091,  527,

In [154]:
from torch import nn

class LyricClassificationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_len, num_class, dropout_p):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(max_len*embedding_dim, num_class)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.dropout(x)
        return(x)

In [155]:
vocab_size = len(vocab)
embedding_dim = 3
num_class = 7
dropout_p = 0.2
model = LyricClassificationModel(vocab_size, embedding_dim, max_len, num_class, dropout_p)

In [156]:
import time

optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
loss_fn = torch.nn.CrossEntropyLoss()

def train(dataloader, d_type):
    epoch_start_time = time.time()
    # keep track of some counts for measuring accuracy
    total_acc, total_count = 0, 0
    log_interval = 300
    start_time = time.time()

    for idx, (label, text, features) in enumerate(dataloader):
        # zero gradients
        optimizer.zero_grad()

        if d_type == 'lyrics':
            input_data = text
        elif d_type == 'features':
            input_data = features[idx]
        else:  # 'both'
            input_data = (text, features[idx])

        # form prediction on batch
        predicted_label = model(input_data)

        # evaluate loss on prediction
        loss = loss_fn(predicted_label, label)
        # compute gradient
        loss.backward()
        # take an optimization step
        optimizer.step()

        # for printing accuracy
        total_acc   += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')
    
def evaluate(dataloader, d_type):

    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, features) in enumerate(dataloader):

            if d_type == 'lyrics':
                input_data = text
            elif d_type == 'features':
                input_data = features[idx]
            else:  # 'both'
                input_data = (text, features[idx])

            predicted_label = model(input_data)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [157]:
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    train(train_loader, 'lyrics')

| epoch   1 | train accuracy    0.204 | time: 44.38s
| epoch   2 | train accuracy    0.231 | time: 45.90s
| epoch   3 | train accuracy    0.261 | time: 48.69s
| epoch   4 | train accuracy    0.276 | time: 44.80s
| epoch   5 | train accuracy    0.281 | time: 47.53s
| epoch   6 | train accuracy    0.292 | time: 44.49s
| epoch   7 | train accuracy    0.290 | time: 42.61s
| epoch   8 | train accuracy    0.300 | time: 42.15s
| epoch   9 | train accuracy    0.296 | time: 47.53s
| epoch  10 | train accuracy    0.301 | time: 50.76s
| epoch  11 | train accuracy    0.305 | time: 51.45s
| epoch  12 | train accuracy    0.300 | time: 49.81s
| epoch  13 | train accuracy    0.302 | time: 49.77s
| epoch  14 | train accuracy    0.300 | time: 49.35s
| epoch  15 | train accuracy    0.302 | time: 48.89s
| epoch  16 | train accuracy    0.300 | time: 45.99s
| epoch  17 | train accuracy    0.299 | time: 44.35s
| epoch  18 | train accuracy    0.303 | time: 44.38s
| epoch  19 | train accuracy    0.303 | time: 

In [159]:
evaluate(val_loader, 'lyrics')

0.25127753303964756

In [160]:
from torch import nn

class FeaturesClassificationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_len, num_class, dropout_p):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(max_len*embedding_dim, num_class)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.dropout(x)
        return(x)

In [161]:
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    train(train_loader, 'features')

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.DoubleTensor instead (while checking arguments for embedding)

: 

In [None]:
evaluate(val_loader, 'features')

In [None]:
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    train(train_loader, 'both')

In [None]:
evaluate(val_loader, 'both')