In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

In [2]:
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [3]:
engineered_features = ['dating', 'violence', 'world/life', 'night/time','shake the audience','family/gospel', 'romantic', 'communication','obscene', 'music', 'movement/places', 'light/visual perceptions','family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability','loudness', 'acousticness', 'instrumentalness', 'valence', 'energy'] 

In [4]:
print(df["genre"].unique())

genres = {'pop': 0, 'country': 1, 'blues': 2, 'jazz': 3, 'reggae': 4, 'rock': 5, 'hip hop': 6}
df["genre"] = df["genre"].apply(genres.get)

# check to make sure all the genres were successfully converted to numbers
print(df["genre"].unique())

df.head(3)

['pop' 'country' 'blues' 'jazz' 'reggae' 'rock' 'hip hop']
[0 1 2 3 4 5 6]


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,0,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,0,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,0,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0


In [5]:
df.groupby("genre").size() / len(df)

genre
0    0.248202
1    0.191915
2    0.162273
3    0.135521
4    0.088045
5    0.142182
6    0.031862
dtype: float64

In [6]:
from torch.utils.data import Dataset, DataLoader

class TextDataFromDF(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __getitem__(self, index):
        lyrics = self.df['lyrics'].iloc[index]  # replace 'lyrics' with your actual column name
        features = self.df[engineered_features].iloc[index]  # replace with your actual feature column names
        return lyrics, features

    def __len__(self):
        return len(self.df)                

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df, shuffle = True, test_size = 0.2)
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

In [8]:
train_data[194]

('jarabi kèlèla furunyògòn kèlèla bolo bolo cèkisè bolo dusu kolo diminna jarabi kanu jarabi kanu hodounannnn dusu kolo diminna jarabi kanu jarabi kanu hodounannnn bolo bolo cèkisè bolo bolo bolo bolo bolo cèkisè bolo bolo bolo bolo bolo cèkisè bolo kunkolo bolo argue argue husband hand longer touch neck hand longer touch waist hand longer stroke hair longer look heart ache darling follow hand longer touch neck hand longer touch waist hand longer touch breast hand longer caress head heart ache hand longer caress breast hand longer touch waist hand longer caress hair hand longer touch neck hand longer touch',
 dating                      0.001170
 violence                    0.167977
 world/life                  0.001170
 night/time                  0.001170
 shake the audience          0.001170
 family/gospel               0.299718
 romantic                    0.513591
 communication               0.001170
 obscene                     0.001170
 music                       0.001170
 mov

In [9]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

tokenized = tokenizer(train_data[194][0])
tokenized

['jarabi',
 'kèlèla',
 'furunyògòn',
 'kèlèla',
 'bolo',
 'bolo',
 'cèkisè',
 'bolo',
 'dusu',
 'kolo',
 'diminna',
 'jarabi',
 'kanu',
 'jarabi',
 'kanu',
 'hodounannnn',
 'dusu',
 'kolo',
 'diminna',
 'jarabi',
 'kanu',
 'jarabi',
 'kanu',
 'hodounannnn',
 'bolo',
 'bolo',
 'cèkisè',
 'bolo',
 'bolo',
 'bolo',
 'bolo',
 'bolo',
 'cèkisè',
 'bolo',
 'bolo',
 'bolo',
 'bolo',
 'bolo',
 'cèkisè',
 'bolo',
 'kunkolo',
 'bolo',
 'argue',
 'argue',
 'husband',
 'hand',
 'longer',
 'touch',
 'neck',
 'hand',
 'longer',
 'touch',
 'waist',
 'hand',
 'longer',
 'stroke',
 'hair',
 'longer',
 'look',
 'heart',
 'ache',
 'darling',
 'follow',
 'hand',
 'longer',
 'touch',
 'neck',
 'hand',
 'longer',
 'touch',
 'waist',
 'hand',
 'longer',
 'touch',
 'breast',
 'hand',
 'longer',
 'caress',
 'head',
 'heart',
 'ache',
 'hand',
 'longer',
 'caress',
 'breast',
 'hand',
 'longer',
 'touch',
 'waist',
 'hand',
 'longer',
 'caress',
 'hair',
 'hand',
 'longer',
 'touch',
 'neck',
 'hand',
 'longer'

In [10]:
def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [11]:
import torch

max_len = 30
num_tokens = len(vocab.get_itos())
def text_pipeline(x):
    tokens = vocab(tokenizer(x))
    y = torch.zeros(max_len, dtype=torch.int64) + num_tokens
    if len(tokens) > max_len:
        tokens = tokens[0:max_len]
    y[0:len(tokens)] = torch.tensor(tokens,dtype=torch.int64)
    return y

label_pipeline = lambda x: int(x)

In [21]:
text_pipeline("we cant believe")
#print(tokenized)

tensor([    0, 27364,    49, 45701, 45701, 45701, 45701, 45701, 45701, 45701,
        45701, 45701, 45701, 45701, 45701, 45701, 45701, 45701, 45701, 45701,
        45701, 45701, 45701, 45701, 45701, 45701, 45701, 45701, 45701, 45701])

In [12]:
def collate_batch(batch):
    lyric_label_list, lyric_text_list = [], []
    features_label_list, features_text_list = [], []

    for (_lyrics, _features) in batch:
        # process lyrics
        lyric_label_list.append(label_pipeline(_lyrics))
        processed_lyrics = text_pipeline(_lyrics)
        lyric_text_list.append(processed_lyrics)

        # process features
        features_label_list.append(label_pipeline(_features))
        processed_features = text_pipeline(_features)
        features_text_list.append(processed_features)

    lyric_label_list = torch.tensor(lyric_label_list, dtype=torch.int64)
    lyric_text_list = torch.stack(lyric_text_list)
    features_label_list = torch.tensor(features_label_list, dtype=torch.int64)
    features_text_list = torch.stack(features_text_list)
    
    return lyric_text_list, lyric_label_list, features_text_list, features_label_list

In [13]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

In [14]:
next(iter(train_loader))

ValueError: invalid literal for int() with base 10: 'know look look eye fair line hold gonna need little time fall break heart mend know gonna pull know fall scar guess scar long trust sure come know hurt hard inside arm feel ready fall break heart men

In [None]:
from torch import nn

class LyricClassificationModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.fc   = nn.Linear(max_len*embedding_dim, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return(x)

In [None]:
vocab_size = len(vocab)
embedding_dim = 3
model = LyricClassificationModel(vocab_size, embedding_dim, max_len, 7)

In [None]:
import time

optimizer = torch.optim.Adam(model.parameters(), lr=.1)
loss_fn = torch.nn.CrossEntropyLoss()

def train(dataloader):
    epoch_start_time = time.time()
    # keep track of some counts for measuring accuracy
    total_acc, total_count = 0, 0
    log_interval = 300
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        # zero gradients
        optimizer.zero_grad()
        # form prediction on batch
        predicted_label = model(text)
        # evaluate loss on prediction
        loss = loss_fn(predicted_label, label)
        # compute gradient
        loss.backward()
        # take an optimization step
        optimizer.step()

        # for printing accuracy
        total_acc   += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')
    
def evaluate(dataloader):

    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [None]:
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    train(train_loader)

| epoch   1 | train accuracy    0.177 | time: 12.85s
| epoch   2 | train accuracy    0.180 | time: 14.37s
| epoch   3 | train accuracy    0.185 | time: 13.76s
| epoch   4 | train accuracy    0.195 | time: 13.77s
| epoch   5 | train accuracy    0.209 | time: 13.88s
| epoch   6 | train accuracy    0.219 | time: 14.09s
| epoch   7 | train accuracy    0.227 | time: 14.21s
| epoch   8 | train accuracy    0.244 | time: 14.27s
| epoch   9 | train accuracy    0.254 | time: 14.24s
| epoch  10 | train accuracy    0.272 | time: 14.65s
| epoch  11 | train accuracy    0.290 | time: 14.11s
| epoch  12 | train accuracy    0.298 | time: 14.49s
| epoch  13 | train accuracy    0.306 | time: 14.43s
| epoch  14 | train accuracy    0.318 | time: 1162.26s
| epoch  15 | train accuracy    0.335 | time: 28.82s
| epoch  16 | train accuracy    0.341 | time: 31.16s
| epoch  17 | train accuracy    0.352 | time: 30.41s
| epoch  18 | train accuracy    0.366 | time: 30.41s
| epoch  19 | train accuracy    0.373 | time

In [None]:
evaluate(val_loader)

0.20052863436123347