### Model for creating pooled embeddings

In [None]:
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Residual MLP block computing ``ReLU(x + Linear(ReLU(Linear(x))))``.

    Both linear layers map ``dim -> dim``, so the size of the last dimension
    is preserved and blocks can be stacked.

    Args:
        dim: Width of the input and output features.
    """

    def __init__(self, dim):
        super().__init__()
        # Transform branch; the skip connection itself is applied in forward().
        branch_layers = [
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
        ]
        self.net = nn.Sequential(*branch_layers)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Add the transform branch to ``x`` and apply the final ReLU."""
        summed = x + self.net(x)
        return self.activation(summed)

class ResidualExpansionModel(nn.Module):
    """MLP that expands an embedding through a stack of residual blocks.

    Pipeline: ``Linear(in_dim -> hidden_dim)``, then ``num_blocks`` copies of
    ``ResidualBlock(hidden_dim)``, then ``Linear(hidden_dim -> out_dim)``.

    The defaults reproduce the original fixed 128 -> 512 -> 2048 architecture
    with three residual blocks, so ``ResidualExpansionModel()`` builds the same
    network (and the same state_dict keys) as before.

    Args:
        in_dim: Size of the input feature dimension.
        hidden_dim: Width used inside the residual stack.
        out_dim: Size of the output feature dimension.
        num_blocks: Number of residual blocks; ``0`` leaves only the two
            projections.
    """

    def __init__(self, in_dim=128, hidden_dim=512, out_dim=2048, num_blocks=3):
        super().__init__()
        self.in_proj = nn.Linear(in_dim, hidden_dim)
        self.res_blocks = nn.Sequential(
            *(ResidualBlock(hidden_dim) for _ in range(num_blocks))
        )
        self.out_proj = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        """Project up, run the residual stack, then project to ``out_dim``."""
        x = self.in_proj(x)
        x = self.res_blocks(x)
        return self.out_proj(x)

# Smoke test: one random 128-d vector in, one 2048-d vector out.
model = ResidualExpansionModel()
sample_out = model(torch.randn(1, 128))
print(sample_out.shape)   # torch.Size([1, 2048])


### Model for creating token_embeddings for all three

In [1]:
# Kernel sanity check only: prints a fixed string and defines no state.
print("hello")

hello


In [2]:
import pandas as pd
import numpy as np

# Input features for the mapper; the recorded run shows shape (66676, 128).
X = np.load('nov_data/embeds.npy')
print(type(X), X.shape, sep='\n')
# Peek at a single row to eyeball the value range.
print(X[2])

<class 'numpy.ndarray'>
(66676, 128)
[155.8  20.2 156.   96.4 226.4  56.7 115.7  79.6 170.5 197.6  84.6 103.3
 192.9 165.4  19.6  40.4 157.7 108.6 192.2 163.7  57.3 199.1 101.1 105.
  84.8 118.  165.6 169.9 108.6  49.  119.4  87.  102.3  94.2  86.5 208.5
  78.6 107.1 109.5  86.9  50.9 204.2   8.   94.6 164.1 145.9  54.9 178.
  73.9 224.6 171.3  78.8  74.2 119.4 105.1  94.1 141.1  24.9  77.1 223.
 133.7  64.2  89.8 165.8 144.9  78.6 111.9 140.4 229.8  88.2 132.8 102.5
 204.2 212.2 165.1  78.3  67.7  64.2  82.2 192.9  41.5 178.8 186.3 138.6
 104.5 121.3 157.  211.4 180.6  26.3  62.6 183.5  99.6 195.4  47.  113.
  68.5   9.7 211.9 145.2 159.1  48.6 151.2 209.7  84.2 231.2 198.1 111.4
 182.7 221.1 205.4 179.   64.6 119.7 187.3  47.8 150.6 176.4  15.1 156.
 144.8  92.6   9.4  46.3 197.7 137.7 122.3 255. ]


In [None]:
# Regression targets; the recorded run shows shape (66676, 1, 2048).
# NOTE(review): X is read from 'nov_data/' but this file comes from the working
# directory — confirm both paths are intended.
Y = np.load('pooled_embeds.npy')
print(type(Y), Y.shape, sep='\n')
print(Y[2])

<class 'numpy.ndarray'>
(66676, 1, 2048)
[[ 0.14538574  0.41333008  1.6796875  ... -0.76611328 -2.3828125
   1.09082031]]


In [None]:
# Drop the singleton middle axis: (N, 1, D) -> (N, D).
# An explicit reshape is used instead of a bare np.squeeze so that
#   * a dataset with a single row keeps its leading sample axis,
#   * re-running this cell on the already 2-D array is a no-op, and
#   * an unexpected shape such as (N, 2, D) raises instead of passing through.
Y = Y.reshape(Y.shape[0], Y.shape[-1])
print(Y.shape)

(66676, 2048)


In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# -------------------------
# Inputs: X and Y are the arrays prepared in the cells above
# (nothing is loaded here; Y is expected to be 2-D already)
# -------------------------

# -------------------------
# Train-test split (80% / 20%, fixed seed for a repeatable partition)
# -------------------------
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    shuffle=True,
    test_size=0.2,
    random_state=42,
)

# Sanity check: row counts of inputs and targets must agree within each split.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

# -------------------------
# Dataset class
# -------------------------
class EmbDataset(Dataset):
    """In-memory dataset of paired (input, target) embedding rows.

    Both arrays are converted once to float32 tensors at construction time;
    indexing returns the ``idx``-th row of each.

    Args:
        X: NumPy array of inputs, one sample per row.
        Y: NumPy array of targets with the same number of rows as ``X``.

    Raises:
        ValueError: If ``X`` and ``Y`` have a different number of rows.
    """

    def __init__(self, X, Y):
        # __len__ is driven by X alone, so a length mismatch would otherwise
        # either go unnoticed (extra Y rows) or fail late inside a DataLoader.
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same number of rows, got {len(X)} and {len(Y)}"
            )
        self.X = torch.from_numpy(X).float()
        self.Y = torch.from_numpy(Y).float()

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.X[idx], self.Y[idx]

# Wrap each split in a dataset and batch it; only the training loader shuffles.
train_dataset = EmbDataset(X_train, Y_train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128)

test_dataset = EmbDataset(X_test, Y_test)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=128)

# -------------------------
# Model
# -------------------------
class Block(nn.Module):
    """Residual feed-forward block: ``LayerNorm(x + fc2(GELU(fc1(x))))``.

    The hidden layer is twice as wide as the input; the output has the same
    width ``d`` as the input.

    Args:
        d: Feature width of the input and output.
    """

    def __init__(self, d):
        super().__init__()
        hidden = d * 2
        self.fc1 = nn.Linear(d, hidden)
        self.fc2 = nn.Linear(hidden, d)
        self.ln = nn.LayerNorm(d)
        self.act = nn.GELU()

    def forward(self, x):
        """Apply the expand/contract MLP, add the skip path, then normalise."""
        transformed = self.fc2(self.act(self.fc1(x)))
        return self.ln(transformed + x)
    

class Mapper(nn.Module):
    """Feed-forward network mapping 128-d inputs to 2048-d outputs.

    The width grows in three stages (128 -> 512 -> 1024 -> 2048), with
    ``Block`` layers at the 512 and 1024 widths and a final LayerNorm on the
    2048-d output.
    """

    def __init__(self):
        super().__init__()
        stages = [
            # Stage 1: 128 -> 512, followed by two Blocks.
            nn.Linear(128, 512),
            nn.GELU(),
            Block(512),
            Block(512),
            # Stage 2: 512 -> 1024, followed by one Block.
            nn.Linear(512, 1024),
            nn.GELU(),
            Block(1024),
            # Stage 3: 1024 -> 2048 and output normalisation.
            nn.Linear(1024, 2048),
            nn.LayerNorm(2048),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the full layer stack."""
        return self.net(x)

# -------------------------
# Training setup
# -------------------------
# Seed torch's global RNG so weight initialisation and DataLoader shuffling are
# repeatable across kernel restarts (GPU kernels may still add small
# nondeterminism). 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Mapper().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

def embedding_loss(pred, target):
    """Blend of cosine distance and MSE between predicted and target embeddings.

    Args:
        pred: Predicted embeddings.
        target: Reference embeddings with the same shape as ``pred``.

    Returns:
        Scalar tensor ``0.7 * (1 - mean cosine similarity) + 0.3 * MSE``.
        Cosine similarity uses the library default axis (``dim=1``).
    """
    functional = torch.nn.functional
    cosine_term = 1 - functional.cosine_similarity(pred, target).mean()
    mse_term = functional.mse_loss(pred, target)
    return 0.7 * cosine_term + 0.3 * mse_term

# -------------------------
# Training loop
# -------------------------
# Number of full passes over the training loader.
EPOCHS = 10

for epoch in range(EPOCHS):
    # Put the model in training mode and reset the running loss for this epoch.
    model.train()
    total_loss = 0
    
    for Xb, Yb in train_loader:
        # Move the batch to the same device as the model.
        Xb, Yb = Xb.to(device), Yb.to(device)

        pred = model(Xb)
        loss = embedding_loss(pred, Yb)

        # Clear stale gradients, backpropagate, then take one optimiser step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss as a plain Python float.
        total_loss += loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {total_loss/len(train_loader):.4f}")

# -------------------------
# OPTIONAL: Evaluate on test set
# -------------------------
# Switch to evaluation mode and disable gradient tracking for the test pass.
model.eval()
test_loss = 0
with torch.no_grad():
    for Xb, Yb in test_loader:
        Xb, Yb = Xb.to(device), Yb.to(device)
        pred = model(Xb)
        test_loss += embedding_loss(pred, Yb).item()

# Mean of the per-batch test losses.
print(f"Test Loss: {test_loss/len(test_loader):.4f}")



(53340, 128) (53340, 2048)
(13336, 128) (13336, 2048)
Epoch 1/10 | Train Loss: 0.4299
Epoch 2/10 | Train Loss: 0.3662
Epoch 3/10 | Train Loss: 0.3492
Epoch 4/10 | Train Loss: 0.3391
Epoch 5/10 | Train Loss: 0.3321
Epoch 6/10 | Train Loss: 0.3270
Epoch 7/10 | Train Loss: 0.3234
Epoch 8/10 | Train Loss: 0.3201
Epoch 9/10 | Train Loss: 0.3175
Epoch 10/10 | Train Loss: 0.3146
Test Loss: 0.3228


In [None]:

# -------------------------
# Training loop with Early Stopping
# -------------------------
# Continues training the existing `model` / `optimizer`; they are not
# re-created here.
# NOTE(review): the test split doubles as the early-stopping signal, so
# "Best Test Loss" is a model-selection metric rather than an unbiased
# estimate — consider carving out a separate validation split.
EPOCHS = 100
patience = 10
best_loss = float("inf")
patience_counter = 0
checkpoint_saved = False   # True once this run has written best_mapper.pth

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for Xb, Yb in train_loader:
        Xb, Yb = Xb.to(device), Yb.to(device)

        pred = model(Xb)
        loss = embedding_loss(pred, Yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f}")

    # -------------------------
    # Early Stopping Check
    # -------------------------
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for Xb, Yb in test_loader:
            Xb, Yb = Xb.to(device), Yb.to(device)
            pred = model(Xb)
            test_loss += embedding_loss(pred, Yb).item()

    test_loss /= len(test_loader)
    print(f"          Test Loss: {test_loss:.4f}")

    # save best model
    if test_loss < best_loss:
        best_loss = test_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_mapper.pth")
        checkpoint_saved = True
        print("✅ Improved! Model saved.")
    else:
        patience_counter += 1
        print(f"⏳ No improvement ({patience_counter}/{patience})")

    # stop if patience exceeded
    if patience_counter >= patience:
        print("⛔ Early stopping triggered!")
        break

print(f"Best Test Loss: {best_loss:.4f}")

# The in-memory weights come from the last epoch that ran, which is up to
# `patience` epochs past the best one. Reload the best checkpoint so later
# cells use the weights that produced `best_loss`. The guard skips the reload
# when this run never saved (e.g. the loss was NaN throughout), so a stale
# file from an earlier run is not picked up.
if checkpoint_saved:
    model.load_state_dict(torch.load("best_mapper.pth", map_location=device))


Epoch 1/100 | Train Loss: 0.3120
          Test Loss: 0.3221
✅ Improved! Model saved.
Epoch 2/100 | Train Loss: 0.3097
          Test Loss: 0.3222
⏳ No improvement (1/10)
Epoch 3/100 | Train Loss: 0.3068
          Test Loss: 0.3214
✅ Improved! Model saved.
Epoch 4/100 | Train Loss: 0.3042
          Test Loss: 0.3222
⏳ No improvement (1/10)
Epoch 5/100 | Train Loss: 0.3013
          Test Loss: 0.3220
⏳ No improvement (2/10)
Epoch 6/100 | Train Loss: 0.2985
          Test Loss: 0.3222
⏳ No improvement (3/10)
Epoch 7/100 | Train Loss: 0.2952
          Test Loss: 0.3228
⏳ No improvement (4/10)
Epoch 8/100 | Train Loss: 0.2922
          Test Loss: 0.3246
⏳ No improvement (5/10)
Epoch 9/100 | Train Loss: 0.2885
          Test Loss: 0.3254
⏳ No improvement (6/10)
Epoch 10/100 | Train Loss: 0.2850
          Test Loss: 0.3264
⏳ No improvement (7/10)
Epoch 11/100 | Train Loss: 0.2812
          Test Loss: 0.3292
⏳ No improvement (8/10)
Epoch 12/100 | Train Loss: 0.2774
          Test Loss: 0.3302

In [None]:
!pip install laion-clap

Collecting laion-clap
  Downloading laion_clap-1.1.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting h5py
  Using cached h5py-3.15.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (4.7 MB)
Collecting braceexpand
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl (5.9 kB)
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m876.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torchlibrosa
  Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Collecting numpy<2.0.0,>=1.23.5
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [3]:
import torch
import laion_clap

device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the CLAP wrapper without feature fusion and load its default checkpoint.
model = laion_clap.CLAP_Module(enable_fusion=False).to(device)
model.load_ckpt()   # downloads pretrained weights

# The embedding helper expects a *list* of file paths; a bare string would be
# iterated character by character. The library documents this parameter as `x`,
# so the list is passed positionally (as in the later CLAP cell) rather than
# through an `audio_files=` keyword.
audio_paths = ["/home/ie643_therelutionaries/runcode/misc/1-28808-A-43.wav"]

audio_emb = model.get_audio_embedding_from_filelist(
    audio_paths,
    use_tensor=True
).to(device)

print(audio_emb.shape)   # one row per input file, e.g. torch.Size([1, 512])


  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load our best checkpoint in the paper.
Downloading laion_clap weight files...


URLError: <urlopen error [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:997)>

In [15]:
!pip install torchvision

[0mCollecting torchvision
  Downloading torchvision-0.24.1-cp310-cp310-manylinux_2_28_x86_64.whl (8.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torchvision
Successfully installed torchvision-0.24.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import torch
import laion_clap

device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the CLAP wrapper without feature fusion and load its default checkpoint.
model = laion_clap.CLAP_Module(enable_fusion=False).to(device)
model.load_ckpt()   # downloads pretrained weights

# Embed a single clip; the helper takes a list of file paths.
audio_files = ['/home/ie643_therelutionaries/runcode/misc/Acoustic guitar_2.wav']
audio_emb = model.get_audio_embedding_from_filelist(audio_files, use_tensor=True)
audio_emb = audio_emb.to(device)

print(audio_emb.shape)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load our best checkpoint in the paper.
Downloading laion_clap weight files...
Download completed!
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.laye