# Style-extraction using a Classification Transformer

In [1]:
import json
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
def _decode_chords(serialized):
    """Parse one JSON-encoded chord-token sequence into a tensor."""
    return torch.tensor(json.loads(serialized))


# The "chords" column is stored as JSON text in the CSV, so decode each cell
data = pd.read_csv('../Data/data_tokenized_pitch_class.csv')
data["chords"] = data["chords"].apply(_decode_chords)
data.head()

Unnamed: 0,url,title,artist,decade,genre,ratings,stars,chords
0,https://tabs.ultimate-guitar.com/tab/jeff-buck...,Hallelujah,Jeff Buckley,1990,Rock|Folk,51639.0,5.0,"[tensor(685), tensor(677), tensor(685), tensor..."
1,https://tabs.ultimate-guitar.com/tab/ed-sheera...,Perfect,Ed Sheeran,2010,Pop,44194.0,5.0,"[tensor(162), tensor(34), tensor(685), tensor(..."
2,https://tabs.ultimate-guitar.com/tab/elvis-pre...,Cant Help Falling In Love,Elvis Presley,1960,"Soundtrack|R&B, Funk & Soul",30059.0,5.0,"[tensor(685), tensor(162), tensor(677), tensor..."
3,https://tabs.ultimate-guitar.com/tab/eagles/ho...,Hotel California,Eagles,1970,Rock,28670.0,5.0,"[tensor(173), tensor(422), tensor(397), tensor..."
4,https://tabs.ultimate-guitar.com/tab/radiohead...,Creep,Radiohead,1990,Rock,28606.0,5.0,"[tensor(162), tensor(83), tensor(685), tensor(..."


In [3]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# so this file must come from a trusted source.
augmentation_array = np.load('../Data/augmentation_map.npy', allow_pickle=True)
augmentation_map = torch.tensor(augmentation_array)

# JSON object keys are always strings, so cast them back to integer tokens
with open("../Data/token_to_chord.json", "r") as vocab_file:
    token_to_chord = {
        int(token): chord for token, chord in json.load(vocab_file).items()
    }

# Start and end of sequence tokens are not needed
VOCAB_SIZE = len(token_to_chord)
VOCAB_SIZE

1033

## Dataset

In [4]:
class ChordDataset(Dataset):
    """Dataset of chord-token sequences, transposed and zero-padded on access.

    Args:
        data: DataFrame whose "chords" column holds one 1-D tensor of chord
            tokens per row.
        max_len: Length every returned sequence is padded to. Defaults to 256,
            the value that was previously hard-coded.

    Attributes:
        move_by: Transposition index passed to `augment` for every item. It is
            set from outside the dataset (it is not sampled randomly), so one
            full pass over the data uses a single, known transposition.
    """

    def __init__(self, data, max_len=256):
        self.data = data
        self.max_len = max_len
        self.move_by = 0

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.data)

    def augment(self, chords, move_by):
        """Transpose the chords by looking them up in `augmentation_map`.

        Each token in `chords` selects a row of the module-level
        `augmentation_map` and `move_by` selects the column, so the shift is
        fully determined by `move_by`.
        """
        return augmentation_map[chords, move_by]

    def pad(self, chords):
        """Right-pad a tensor of shape [n] with zeros into shape [max_len].

        The result is a float tensor (the default dtype of `torch.zeros`);
        `__getitem__` converts it to long afterwards. Sequences longer than
        `max_len` are not truncated: the slice assignment fails for them.
        """
        out = torch.zeros(self.max_len)
        out[:len(chords)] = chords
        return out

    def __getitem__(self, idx):
        """Return the `idx`-th chord sequence, transposed by `move_by` and padded."""
        # Read the single cell directly instead of materialising the whole row
        chords = self.data["chords"].iloc[idx]
        chords = self.augment(chords, self.move_by)
        return self.pad(chords).long()

In [5]:
# Fix the global PyTorch RNG state
torch.manual_seed(42)

# We need the data to remain in the same order: the extracted styles are
# written back to the DataFrame by position.
dataset = ChordDataset(data)
dataloader = DataLoader(dataset=dataset, shuffle=False, batch_size=128)

## Model

In [6]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer.

    Self-attention and a two-layer MLP are each applied to a layer-normalised
    copy of the input and added back through a residual connection.

    Args:
        d_model: Feature size of the input and output.
        n_heads: Number of attention heads.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        hidden_dim = d_model * 4

        # Attention sub-layer
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.multi_head_attention = nn.MultiheadAttention(
            d_model, n_heads, batch_first=True
        )
        self.dropout1 = nn.Dropout(0.1)

        # Feed-forward sub-layer
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model),
        )
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        """Apply the block to `x`; `mask` is forwarded as the attention mask."""
        # Residual branch 1: self-attention over the normalised input
        normed = self.layer_norm1(x)
        attended, _ = self.multi_head_attention(
            normed, normed, normed, attn_mask=mask, need_weights=False
        )
        x = x + self.dropout1(attended)

        # Residual branch 2: position-wise feed-forward network
        x = x + self.dropout2(self.feed_forward(self.layer_norm2(x)))
        return x


class ClassificationTransformer(nn.Module):
    """Transformer over chord tokens, used here as a feature extractor.

    Tokens are embedded, summed with a fixed sinusoidal positional encoding,
    passed through `n_layers` `TransformerBlock`s under a causal mask and
    averaged over the sequence dimension. `forward` returns that averaged
    representation; the `output` layer is created but not applied.

    Args:
        d_model: Embedding / feature size.
        n_heads: Number of attention heads per block.
        n_layers: Number of transformer blocks.
        input_len: Number of positions in the positional-encoding table, i.e.
            the longest sequence `forward` accepts.
        output_dim: Output size of the (unused in `forward`) `output` layer.
        vocab_size: Number of embedding rows. Defaults to the module-level
            `VOCAB_SIZE` when omitted.
    """

    def __init__(self, d_model, n_heads, n_layers, input_len, output_dim, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(d_model, n_heads) for _ in range(n_layers)]
        )
        # Not applied in forward(), but kept so the module's parameter set is
        # unchanged (load_state_dict is strict by default).
        self.output = nn.Linear(d_model, output_dim)

        # Positional encoding: build the full table first, then register it
        # once as a buffer so it follows .to(device) and is part of the
        # state dict.
        pos = torch.arange(0, input_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        )
        pe = torch.zeros(input_len, d_model)
        pe[:, 0::2] = torch.sin(pos * div_term)
        # For odd d_model there is one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(pos * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def generate_square_subsequent_mask(self, size, device=None):
        """Generate a boolean mask to avoid attending to future tokens.

        Entry [i, j] is True (masked) when j > i. `device` optionally selects
        where the mask is created; by default it uses the current default
        device, as before.
        """
        mask = torch.triu(torch.ones(size, size, device=device), diagonal=1).bool()
        return mask

    def forward(self, x):
        """Return the mean-pooled features for a batch of token sequences.

        Args:
            x: Integer token tensor of shape (batch, seq_len) with
                seq_len <= input_len.

        Returns:
            Tensor of shape (batch, d_model).
        """
        seq_len = x.size(1)
        x = self.embedding(x)
        # Slice the table so sequences shorter than input_len are accepted too
        x = x + self.pe[:seq_len]

        # Build the causal mask directly on the device of the activations
        mask = self.generate_square_subsequent_mask(seq_len, device=x.device)

        # Passing through all transformer blocks
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, mask)

        x = x.mean(dim=1)

        # For feature extraction, we don't need the output layer
        # x = self.output(x)
        return x

In [7]:
# Prefer the GPU when one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Style Extraction

Let's extract the style of the entire dataset with each of the classification models. For every song, the style is computed for all 12 transpositions and the mean across them is used, as we do not want to insert any information about the key into the style. We will save these styles as new columns in our `data` DataFrame, one per model size (`style_s`, `style_m`, `style_l`).

In [8]:
# One entry per trained checkpoint; "size" selects both the checkpoint file
# and the name of the resulting style column.
specs = [
    {"size": "S", "d_model": 64, "n_heads": 8},
    {"size": "M", "d_model": 80, "n_heads": 10},
    {"size": "L", "d_model": 96, "n_heads": 12},
]

for spec in specs:
    print(f"Running with model size {spec['size']}...\n------------------")
    classification_transformer = ClassificationTransformer(
        d_model=spec["d_model"],
        n_heads=spec["n_heads"],
        n_layers=6,
        input_len=256,
        output_dim=28
    ).to(device)

    # map_location lets a checkpoint saved from a GPU be loaded when `device`
    # falls back to the CPU.
    state_dict = torch.load(
        f"../Models/ClassificationTransformer{spec['size']}.pt",
        map_location=device,
    )
    classification_transformer.load_state_dict(state_dict)
    # Inference only: switch dropout off once per model, not once per batch
    classification_transformer.eval()

    style_column = f"style_{spec['size'].lower()}"

    all_styles = []
    for transpose in range(12):
        print(f"Extracting style transposed by {transpose} semitones...")
        # The dataloader wraps `dataset`, so this changes the transposition of
        # every batch produced in the pass below.
        dataset.move_by = transpose

        styles = []
        for x in dataloader:
            with torch.inference_mode():
                styles.append(classification_transformer(x.to(device)))
        styles = torch.cat(styles).cpu().detach().numpy()
        all_styles.append(styles)
    # Get the mean of all styles (averaged over the 12 transpositions)
    data[style_column] = np.stack(all_styles).mean(axis=0).tolist()

    # Convert to JSON
    data[style_column] = data[style_column].apply(lambda x: json.dumps(x))

# Convert to JSON
data["chords"] = data["chords"].apply(lambda x: json.dumps(x.tolist()))

# Save to CSV
data.to_csv("../Data/data_styled.csv", index=False)

Running with model size S...
------------------
Extracting style transposed by 0 semitones...
Extracting style transposed by 1 semitones...
Extracting style transposed by 2 semitones...
Extracting style transposed by 3 semitones...
Extracting style transposed by 4 semitones...
Extracting style transposed by 5 semitones...
Extracting style transposed by 6 semitones...
Extracting style transposed by 7 semitones...
Extracting style transposed by 8 semitones...
Extracting style transposed by 9 semitones...
Extracting style transposed by 10 semitones...
Extracting style transposed by 11 semitones...
Running with model size M...
------------------
Extracting style transposed by 0 semitones...
Extracting style transposed by 1 semitones...
Extracting style transposed by 2 semitones...
Extracting style transposed by 3 semitones...
Extracting style transposed by 4 semitones...
Extracting style transposed by 5 semitones...
Extracting style transposed by 6 semitones...
Extracting style transposed 