In [None]:
# Unpack the dataset archive; later cells read from the dataset/ directory.
!unzip -q dataset.zip


In [None]:
!pip install librosa transformers datasets torchaudio --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m836.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import torch
import librosa
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from tqdm import tqdm

# Pretrained Wav2Vec2 checkpoint: the processor prepares raw waveforms,
# the model turns them into hidden states.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# nn.Module.eval() returns the module itself, so the model is bound already in inference mode.
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()

def extract_wav2vec_features(file_path):
    """Return a fixed-length Wav2Vec2 embedding for one audio file.

    The file is loaded with librosa at 16 kHz, passed through the module-level
    ``processor`` and ``model``, and the last hidden state is averaged over
    the time axis.

    Args:
        file_path: Path of the audio file to embed.

    Returns:
        NumPy array holding the time-averaged last hidden state.
    """
    waveform, _ = librosa.load(file_path, sr=16000)  # librosa resamples to 16 kHz on load
    encoded = processor(waveform, return_tensors="pt", sampling_rate=16000)
    with torch.no_grad():
        hidden_states = model(encoded.input_values).last_hidden_state  # (1, time_steps, feature_dim)
    # Average over the time dimension to get one vector per file
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Load training CSV (columns: 'filename', 'label')
train_df = pd.read_csv("dataset/train.csv")

# Path to audios
audio_folder = "dataset/audios_train"

# Extract one embedding per audio file; features and targets stay aligned by row order
X_train = []
y_train = []

for i, row in tqdm(train_df.iterrows(), total=len(train_df)):
    audio_file = os.path.join(audio_folder, row["filename"])
    features = extract_wav2vec_features(audio_file)
    X_train.append(features)
    y_train.append(row["label"])  # regression target lives in the 'label' column

# Stack the per-file vectors into one (n_files, feature_dim) tensor.
# torch.tensor() on a list of NumPy arrays converts element by element, which is
# very slow and emits a UserWarning; stacking tensors avoids both.
X_train = torch.stack([torch.from_numpy(features) for features in X_train])
y_train = torch.tensor(y_train)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 444/444 [4:59:30<00:00, 40.47s/it]
  X_train = torch.tensor(X_train)


In [None]:
import pandas as pd

# Re-read the training CSV and print its column names.
train_df = pd.read_csv("dataset/train.csv")
print(train_df.columns)


Index(['filename', 'label'], dtype='object')


In [None]:
import torch
from sklearn.model_selection import train_test_split

# Cast features/targets to float32. Targets get a trailing dimension so their
# shape (N, 1) matches the regressor output. Tensor.to() works regardless of the
# device the tensors live on, unlike a .numpy() round trip.
X_tensor = X_train.to(dtype=torch.float32)
y_tensor = y_train.to(dtype=torch.float32).unsqueeze(1)

# Train-validation split (80/20, fixed seed)
X_tr, X_val, y_tr, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Create Datasets and Loaders
train_dataset = torch.utils.data.TensorDataset(X_tr, y_tr)
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)

# Seed the shuffling generator so the batch order is reproducible across runs
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=32, shuffle=True, generator=shuffle_generator
)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)


In [None]:
import torch.nn as nn

class MLPRegressor(nn.Module):
    """Feed-forward regressor mapping a feature vector to a single value.

    Layout: input_dim -> 512 -> 128 -> 1, with ReLU activations and dropout
    (p=0.3) after the first hidden layer.
    """

    def __init__(self, input_dim):
        """Build the layer stack for inputs with ``input_dim`` features."""
        super().__init__()
        layers = [
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        # Positional Sequential keeps the parameter names net.0 / net.3 / net.5
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack; the output's last dimension has size 1."""
        return self.net(x)


In [None]:
# Size the regressor's input layer from the feature dimension of X_tensor.
# NOTE(review): this rebinds `model`, which earlier referred to the Wav2Vec2 network
# that extract_wav2vec_features() reads as a global; calling that function after
# this cell would no longer use Wav2Vec2.
model = MLPRegressor(input_dim=X_tensor.shape[1])


In [None]:
import torch.optim as optim
from scipy.stats import pearsonr

# Loss and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses seen in this epoch

    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so a
        # shorter final batch is not over-counted
        running_loss += loss.item() * xb.size(0)

    # Per-sample mean loss: comparable across epochs and independent of the
    # number of batches (the previous value was a sum of batch means)
    train_loss = running_loss / len(train_loader.dataset)

    # Evaluation
    model.eval()
    with torch.no_grad():
        # squeeze(1) drops only the output dimension, so the result stays 1-D
        # even for a single validation row
        val_preds = model(X_val).squeeze(1).numpy()
        val_true = y_val.squeeze(1).numpy()
        pearson_corr, _ = pearsonr(val_preds, val_true)

    print(f" Epoch {epoch+1:02d} | Train Loss: {train_loss:.4f} | Pearson: {pearson_corr:.4f}")


 Epoch 01 | Train Loss: 75.7969 |  Pearson: -0.1340
 Epoch 02 | Train Loss: 26.7540 |  Pearson: -0.1225
 Epoch 03 | Train Loss: 17.6739 |  Pearson: -0.1068
 Epoch 04 | Train Loss: 15.1165 |  Pearson: -0.0805
 Epoch 05 | Train Loss: 15.7304 |  Pearson: -0.0615
 Epoch 06 | Train Loss: 14.7131 |  Pearson: -0.0244
 Epoch 07 | Train Loss: 15.3675 |  Pearson: 0.0129
 Epoch 08 | Train Loss: 14.1034 |  Pearson: 0.0564
 Epoch 09 | Train Loss: 14.2243 |  Pearson: 0.1486
 Epoch 10 | Train Loss: 13.5105 |  Pearson: 0.1762
 Epoch 11 | Train Loss: 12.7650 |  Pearson: 0.2413
 Epoch 12 | Train Loss: 12.9179 |  Pearson: 0.2924
 Epoch 13 | Train Loss: 11.5960 |  Pearson: 0.3577
 Epoch 14 | Train Loss: 12.7501 |  Pearson: 0.4313
 Epoch 15 | Train Loss: 13.2081 |  Pearson: 0.4445
 Epoch 16 | Train Loss: 11.5714 |  Pearson: 0.4648
 Epoch 17 | Train Loss: 10.2365 |  Pearson: 0.4633
 Epoch 18 | Train Loss: 10.3440 |  Pearson: 0.4799
 Epoch 19 | Train Loss: 10.1568 |  Pearson: 0.5029
 Epoch 20 | Train Loss: 9

In [None]:
def load_audio(file_path, target_sr=16000):
    """Load an audio file as a mono 1-D waveform at ``target_sr`` Hz.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sr: Sample rate to resample to when the file's own rate differs.
            Defaults to 16000, the rate the callers pass to the processor.

    Returns:
        Tensor of shape (num_samples,).
    """
    # Imported locally: no cell before this one imports torchaudio.
    import torchaudio

    waveform, sr = torchaudio.load(file_path)  # (channels, num_samples)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # convert to mono
    if sr != target_sr:
        # Callers declare sampling_rate=16000, so the audio must really be at that rate
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
    return waveform.squeeze(0)  # drop the channel dim only


In [None]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor

# Load processor and model
# The Wav2Vec2 network is loaded under its own name (`model_wav2vec`); by this point
# `model` refers to the MLPRegressor created in an earlier cell.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model_wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)
model_wav2vec.eval()  # evaluation mode; as the cell's last expression it also displays the module repr


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [None]:
# NOTE(review): `path` is not assigned in any cell above this one in the visible notebook
# (it is only set inside the test-extraction loop further down), so this cell raises
# NameError on a fresh Restart & Run All. Looks like a leftover scratch cell — confirm and remove.
input_values = processor(load_audio(path), sampling_rate=16000, return_tensors="pt").input_values.to(device)


In [None]:
import numpy as np

# Resume support: reuse embeddings saved by a previous run if the checkpoint file exists.
try:
    # NOTE(review): allow_pickle=True can execute arbitrary code from a crafted file;
    # only load a checkpoint that this notebook wrote itself.
    test_embeddings = list(np.load("test_embeddings_partial.npy", allow_pickle=True))
    print(f"Loaded {len(test_embeddings)} embeddings.")
except FileNotFoundError:
    # Only a missing checkpoint means "start fresh". Any other failure (corrupt
    # file, interrupt, ...) now surfaces instead of being swallowed by a bare except.
    test_embeddings = []
    print("No saved file found. Starting from scratch.")



No saved file found. Starting from scratch.


In [None]:
# Number of embeddings already available, printed as the index to resume from.
# NOTE(review): start_idx is not read by the extraction loop in the visible notebook —
# confirm whether resuming is still intended.
start_idx = len(test_embeddings)
print(f"Resuming from index: {start_idx}")


Resuming from index: 0


In [None]:
import torch
from tqdm import tqdm
import os
import pandas as pd
import torchaudio

# Helper to load audio
def load_audio(file_path):
    """Load an audio file as a mono, 16 kHz, 1-D waveform tensor.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        Tensor of shape (num_samples,).
    """
    waveform, sample_rate = torchaudio.load(file_path)  # (channels, num_samples)
    # Down-mix multi-channel audio. Without this a stereo file keeps two rows and
    # the model fails in conv1d ("got input of size: [1, 1, 2, N]").
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform.squeeze(0)  # drop the channel dim only

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_wav2vec = model_wav2vec.to(device)

# NOTE(review): `test_df` is not created in any visible cell; it has to be loaded
# before this cell for a fresh Restart & Run All to work — confirm its source.
# NOTE(review): resetting test_embeddings here discards anything loaded from a
# partial checkpoint earlier in the notebook.

# Results for files that were embedded, plus the files that failed
test_embeddings = []
valid_files = []
skipped_files = []

# Loop through test files
for file in tqdm(test_df['filename']):
    path = os.path.join("dataset/audios_test", file)
    try:
        waveform = load_audio(path)
        input_values = processor(waveform, sampling_rate=16000, return_tensors="pt").input_values.to(device)
        with torch.no_grad():
            embedding = model_wav2vec(input_values).last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    except Exception as e:
        # Best effort: keep going, but remember which file failed
        print(f"Skipping {file}: {e}")
        skipped_files.append(file)
        continue
    test_embeddings.append(embedding)
    valid_files.append(file)

# Make dropped files impossible to miss: they get no prediction downstream
if skipped_files:
    print(f"WARNING: {len(skipped_files)} of {len(test_df)} test files were skipped and will have no prediction: {skipped_files}")

# Convert to DataFrame
X_test = pd.DataFrame(test_embeddings)
test_filenames = valid_files


 57%|█████▋    | 112/195 [55:59<39:10, 28.32s/it]

 Skipping audio_159.wav: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 2, 2915328]


100%|██████████| 195/195 [2:00:18<00:00, 37.02s/it]


In [None]:
# Convert to tensor and predict.
# The regressor is created and trained without ever being moved to `device`, so the
# inputs are placed on whatever device the model's parameters live on; sending them
# to `device` instead fails with a device mismatch when CUDA is available.
model_device = next(model.parameters()).device
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(model_device)

model.eval()
with torch.no_grad():
    # squeeze(1) drops only the output dimension, so preds stays 1-D even for a single test row
    preds = model(X_test_tensor).squeeze(1).cpu().numpy()


In [None]:
# Create DataFrame with filename and predicted score
# (files skipped during embedding extraction are absent from test_filenames, so they get no row)
submission_df = pd.DataFrame({
    "filename": test_filenames,
    "mos": preds
})

# Clamp predictions to the [0, 5] range (values are clipped, not rounded)
submission_df["mos"] = submission_df["mos"].clip(0, 5)

# Save to CSV
submission_df.to_csv("submission.csv", index=False)
