In [24]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch import nn
import torchtext

In [25]:
# Load and preprocess the data
df = pd.read_csv('train.csv')

In [26]:
df.head()

Unnamed: 0,Song_Name,Lyric,Artist,Popularity,Genre
0,Sha La La La La (Come Back Home),Each man who sees you\nWants to share your lov...,Rick James,1.3,Soul Music
1,Blood On Ice,The old Crow's cry the first warning\nThe rumb...,Bathory,0.0,Metal
2,Spring and Fall: To a Young Child,Margaret are you grieving\nOver goldengrove un...,Natalie Merchant,0.0,Rock
3,O can ye sew cushions?,O can ye sew cushions and can ye sew sheets\nA...,Sarah Brightman,1.0,New Age
4,Mexico,I don't get tired\n\nI'm either cooking dope o...,Kevin Gates,0.0,Black Music


In [27]:
df.describe

<bound method NDFrame.describe of                                 Song_Name  \
0        Sha La La La La (Come Back Home)   
1                            Blood On Ice   
2       Spring and Fall: To a Young Child   
3                  O can ye sew cushions?   
4                                  Mexico   
...                                   ...   
124126        Blame It (featuring T-Pain)   
124127        Daughters Of The Soho Riots   
124128                   What do You Want   
124129       I wish you wouldn't say that   
124130                                ABC   

                                                    Lyric            Artist  \
0       Each man who sees you\nWants to share your lov...        Rick James   
2       Margaret are you grieving\nOver goldengrove un...  Natalie Merchant   
3       O can ye sew cushions and can ye sew sheets\nA...   Sarah Brightman   
4       I don't get tired\n\nI'm either cooking dope o...       Kevin Gates   
...                           

In [28]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
lyrics = []
for lyric in df['Lyric']:
    tokenized_lyric = tokenizer(lyric)
    lyrics.append(tokenized_lyric)
artists = df['Artist'].tolist()
genres = df['Genre'].tolist()


In [29]:
# Encode categorical features
artist_encoder = LabelEncoder()
genre_encoder = LabelEncoder()
artist_encoded = artist_encoder.fit_transform(artists)
genre_encoded = genre_encoder.fit_transform(genres)

In [30]:
# Create a vocabulary and encode tokens
vocab = set(token for lyric in lyrics for token in lyric)
vocab_to_idx = {token: idx+1 for idx, token in enumerate(vocab)}

lyrics_encoded = [[vocab_to_idx[token] for token in lyric] for lyric in lyrics]

In [31]:
# Pad sequences to a fixed length
max_seq_length = 250
lyrics_padded = [torch.LongTensor(seq[:max_seq_length]) if len(seq) > max_seq_length else torch.LongTensor(seq) for seq in lyrics_encoded]
X = torch.nn.utils.rnn.pad_sequence(lyrics_padded, batch_first=True).transpose(1,0)

In [32]:
# Convert data to PyTorch tensors
y_artist = torch.LongTensor(artist_encoded)
y_genre = torch.LongTensor(genre_encoded)

In [33]:
# Define a PyTorch dataset
class SongLyricsDataset(Dataset):
    def __init__(self, lyrics, artists, genres):
        self.lyrics = lyrics
        self.artists = artists
        self.genres = genres

    def __len__(self):
        return len(self.lyrics)

    def __getitem__(self, idx):
        return self.lyrics[idx], self.artists[idx], self.genres[idx]

In [34]:
# Split data into training and test sets
num_samples = len(X)
split_ratio = 0.8
split_idx = int(num_samples*split_ratio)

train_dataset = SongLyricsDataset(X[:split_idx], y_artist[:split_idx], y_genre[:split_idx])
test_dataset = SongLyricsDataset(X[split_idx:], y_artist[split_idx:], y_genre[split_idx:])

In [35]:
len(train_dataset), len(test_dataset)

(200, 50)

In [36]:
# Define a PyTorch model
class LyricsClassifier(nn.Module):
    def __init__(self, vocab_size, num_artists, num_genres, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc_artist = nn.Linear(hidden_size, num_artists)
        self.fc_genre = nn.Linear(hidden_size, num_genres)

    def forward(self, x):
        x = self.embedding(x)
        _, h = self.gru(x)
        h = h.squeeze(0)
        artist_logits = self.fc_artist(h)
        genre_logits = self.fc_genre(h)
        return artist_logits, genre_logits

In [37]:
# Define hyperparameters
batch_size = 1
learning_rate = 0.1

In [38]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [39]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [40]:
len(train_loader), len(test_loader)

(200, 50)

In [41]:
X.shape

torch.Size([250, 124131])

In [42]:
vocab_size = len(vocab_to_idx) + 1

In [43]:
# Initialize the model and optimizer
input_size = len(vocab_to_idx) + 1 # add 1 for padding token
num_artists = len(artist_encoder.classes_)
num_genres = len(genre_encoder.classes_)
hidden_size = 128

model_0 = LyricsClassifier(input_size, num_artists, num_genres, hidden_size)
optimizer = torch.optim.Adam(model_0.parameters(), lr=learning_rate)

model_0

LyricsClassifier(
  (embedding): Embedding(225671, 128)
  (gru): GRU(128, 128, batch_first=True)
  (fc_artist): Linear(in_features=128, out_features=2392, bias=True)
  (fc_genre): Linear(in_features=128, out_features=72, bias=True)
)

In [44]:
# Define the loss function
loss_fn = nn.CrossEntropyLoss()

In [48]:
num_epochs = 3

for epoch in range(num_epochs):
    model_0.train()
    train_loss = 0
    train_acc_artist = 0
    train_acc_genre = 0
    num_train_samples = 0
    
    for i, (x, y_artist_true, y_genre_true) in enumerate(train_dataset):
        x = x.to(device)
        y_artist_true = y_artist_true.to(device)
        y_genre_true = y_genre_true.to(device)

        optimizer.zero_grad()

        y_artist_pred, y_genre_pred = model_0(x)
        loss_artist = loss_fn(y_artist_pred, y_artist_true)
        loss_genre = loss_fn(y_genre_pred, y_genre_true)
        loss = loss_artist + loss_genre

        loss.backward()
        optimizer.step()

        with torch.no_grad():
            train_loss += loss.item()
            train_acc_artist += (y_artist_pred.argmax(dim=-1) == y_artist_true).sum().item()
            train_acc_genre += (y_genre_pred.argmax(dim=-1) == y_genre_true).sum().item()
            num_train_samples += len(x)

    train_loss /= num_train_samples
    train_acc_artist /= num_train_samples
    train_acc_genre /= num_train_samples

    # Evaluate the model
    model_0.eval()
    test_loss = 0
    test_acc_artist = 0
    test_acc_genre = 0
    num_test_samples = 0

    for i, (x, y_artist_true, y_genre_true) in enumerate(test_dataset):
        x = x.to(device)
        y_artist_true = y_artist_true.to(device)
        y_genre_true = y_genre_true.to(device)

        y_artist_pred, y_genre_pred = model_0(x)
        loss_artist = loss_fn(y_artist_pred, y_artist_true)
        loss_genre = loss_fn(y_genre_pred, y_genre_true)
        loss = loss_artist + loss_genre

        with torch.no_grad():
            test_loss += loss.item()
            test_acc_artist += (y_artist_pred.argmax(dim=-1) == y_artist_true).sum().item()
            test_acc_genre += (y_genre_pred.argmax(dim=-1) == y_genre_true).sum().item()
            num_test_samples += len(x)

    test_loss /= num_test_samples
    test_acc_artist /= num_test_samples
    test_acc_genre /= num_test_samples

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Artist Acc: {train_acc_artist:.4f}, Train Genre Acc: {train_acc_genre:.4f}, Test Loss: {test_loss:.4f}, Test Artist Acc: {test_acc_artist:.4f}, Test Genre Acc: {test_acc_genre:.4f}')

KeyboardInterrupt: ignored

In [None]:
#Define a function to predict artist and genre from lyrics

def predict_lyrics(model, vocab_to_idx, artist_encoder, genre_encoder, lyrics):
    model.eval()
    tokens = tokenizer(lyrics)
    encoded = [[vocab_to_idx.get(token, 0) for token in tokens]]
    padded = [torch.LongTensor(seq[:max_seq_length] + [0]*(max_seq_length-len(seq))) for seq in encoded]
    x = pad_sequence(padded, batch_first=True)
    artist_logits, genre_logits = model(x.to(device))
    artist_pred = artist_encoder.inverse_transform(artist_logits.argmax(dim=-1).cpu().numpy())[0]
    genre_pred = genre_encoder.inverse_transform(genre_logits.argmax(dim=-1).cpu().numpy())[0]
    return artist_pred, genre_pred


In [None]:

lyrics = ''
artist_pred, genre_pred = predict_lyrics(model_0, vocab_to_idx, artist_encoder, genre_encoder, lyrics)
print(f"Predicted Artist: {artist_pred}, Predicted Genre: {genre_pred}")