In [None]:
import os, torch, torchaudio, pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bundle = torchaudio.pipelines.WAV2VEC2_BASE
wav2vec_model = bundle.get_model().to(device).eval()

TRAIN_AUDIO_DIR = "../dataset/audios_train"
TEST_AUDIO_DIR = "../dataset/audios_test"
TRAIN_CSV_PATH = "../dataset/train.csv"
TEST_CSV_PATH = "../dataset/test.csv"

BEST_LEARNING_RATE = 0.0004956131596941485
BEST_BATCH_SIZE = 32
BEST_EPOCHS = 80
BEST_DROPOUT_RATE = 0.3183369837123387
BEST_NUM_HIDDEN_LAYERS = 2
BEST_HIDDEN_DIM = 128
FINAL_MODEL_PATH = "final_regression_model_submit.pt"

In [3]:
class AudioDataset(Dataset):
    def __init__(self, csv_path, audio_dir, is_test=False):
        self.df = pd.read_csv(csv_path)
        self.audio_dir = audio_dir
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        waveform, sr = torchaudio.load(os.path.join(self.audio_dir, row['filename']))
        if sr != bundle.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, bundle.sample_rate)(waveform)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        waveform = waveform.to(device)

        if self.is_test:
            return waveform, row['filename']
        else:
            label = torch.tensor(float(row['label']), dtype=torch.float32).to(device)
            return waveform, label

In [4]:
class Swish(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)

class RegressionHead(nn.Module):
    def __init__(self, input_dim, num_hidden_layers, hidden_dim, dropout_rate):
        super().__init__()
        layers = [nn.LayerNorm(input_dim)]
        current_dim = input_dim
        for _ in range(num_hidden_layers):
            layers.extend([
                nn.Linear(current_dim, hidden_dim),
                Swish(),
                nn.Dropout(dropout_rate)
            ])
            current_dim = hidden_dim
        layers.append(nn.Linear(current_dim, 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x).squeeze(1)

In [5]:
def extract_features(waveform):
    with torch.no_grad():
        if waveform.dim() == 3:
            waveform = waveform.squeeze(1)
        features, _ = wav2vec_model.extract_features(waveform)
        selected_layers = features[-3:]
        stat_feats = [torch.cat([layer.mean(1), layer.std(1)], dim=1) for layer in selected_layers]
        return torch.cat(stat_feats, dim=1)

def collate_fn_train(batch):
    waveforms = [x[0] for x in batch]
    labels = torch.stack([x[1] for x in batch])
    return waveforms, labels

def collate_fn_test(batch):
    waveforms = [x[0] for x in batch]
    filenames = [x[1] for x in batch]
    return waveforms, filenames

In [6]:
def train_final():
    train_dataset = AudioDataset(TRAIN_CSV_PATH, TRAIN_AUDIO_DIR, is_test=False)
    train_loader = DataLoader(train_dataset, batch_size=BEST_BATCH_SIZE, shuffle=True, collate_fn=collate_fn_train)

    input_dim = extract_features(torch.randn(1, 16000).to(device)).shape[1]
    final_model = RegressionHead(input_dim, BEST_NUM_HIDDEN_LAYERS, BEST_HIDDEN_DIM, BEST_DROPOUT_RATE).to(device)

    optimizer = torch.optim.AdamW(final_model.parameters(), lr=BEST_LEARNING_RATE)
    criterion = nn.MSELoss()

    print("Starting final training with best hyperparameters...")
    for epoch in range(BEST_EPOCHS):
        final_model.train()
        total_loss = 0.0
        for waveforms, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{BEST_EPOCHS} [Train]"):
            feats = torch.cat([extract_features(wf.unsqueeze(0)).cpu() for wf in waveforms], dim=0).to(device)
            preds = final_model(feats)
            loss = criterion(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{BEST_EPOCHS}: Train Loss={avg_train_loss:.4f}")

    torch.save(final_model.state_dict(), FINAL_MODEL_PATH)
    print(f"✔️ Final model saved to {FINAL_MODEL_PATH}")

In [7]:
def evaluate_final():
    if not os.path.exists(FINAL_MODEL_PATH):
        print(f"Error: Final model not found at {FINAL_MODEL_PATH}. Please run train_final() first.")
        return

    test_dataset = AudioDataset(TEST_CSV_PATH, TEST_AUDIO_DIR, is_test=True)
    test_loader = DataLoader(test_dataset, batch_size=BEST_BATCH_SIZE, shuffle=False, collate_fn=collate_fn_test)

    input_dim = extract_features(torch.randn(1, 16000).to(device)).shape[1]
    final_model = RegressionHead(input_dim, BEST_NUM_HIDDEN_LAYERS, BEST_HIDDEN_DIM, BEST_DROPOUT_RATE).to(device)
    final_model.load_state_dict(torch.load(FINAL_MODEL_PATH))
    final_model.eval()

    print("Starting final evaluation on the test set...")
    all_preds = []
    all_filenames = []

    with torch.no_grad():
        for waveforms, filenames in tqdm(test_loader, desc="Evaluating"):
            feats = torch.cat([extract_features(wf.unsqueeze(0)).cpu() for wf in waveforms], dim=0).to(device)
            preds = final_model(feats)
            all_preds.extend(preds.cpu().numpy())
            all_filenames.extend(filenames)

    output_df = pd.DataFrame({
        'filename': all_filenames,
        'label': all_preds
    })
    output_df.iloc[:, 1] = output_df.iloc[:, 1].apply(lambda x: round(x * 2) / 2)
    output_df.to_csv("test_predictions_regression_submit.csv", index=False)
    print("✔️ Predictions saved to test_predictions_regression_submit.csv")

In [None]:
if __name__ == "__main__":
    train_final()
    evaluate_final()

Starting final training with best hyperparameters...


Epoch 1/80 [Train]: 100%|██████████| 14/14 [01:47<00:00,  7.65s/it]


Epoch 1/80: Train Loss=4.1807


Epoch 2/80 [Train]: 100%|██████████| 14/14 [02:02<00:00,  8.76s/it]


Epoch 2/80: Train Loss=1.6997


Epoch 3/80 [Train]: 100%|██████████| 14/14 [02:05<00:00,  8.96s/it]


Epoch 3/80: Train Loss=1.6606


Epoch 4/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.01s/it]


Epoch 4/80: Train Loss=1.4475


Epoch 5/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.01s/it]


Epoch 5/80: Train Loss=1.2283


Epoch 6/80 [Train]: 100%|██████████| 14/14 [02:05<00:00,  8.99s/it]


Epoch 6/80: Train Loss=1.1366


Epoch 7/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.03s/it]


Epoch 7/80: Train Loss=1.1667


Epoch 8/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.03s/it]


Epoch 8/80: Train Loss=1.0944


Epoch 9/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.06s/it]


Epoch 9/80: Train Loss=0.9649


Epoch 10/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.03s/it]


Epoch 10/80: Train Loss=0.9379


Epoch 11/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.04s/it]


Epoch 11/80: Train Loss=0.8853


Epoch 12/80 [Train]: 100%|██████████| 14/14 [02:06<00:00,  9.02s/it]


Epoch 12/80: Train Loss=0.8831


Epoch 13/80 [Train]:  64%|██████▍   | 9/14 [01:15<00:43,  8.68s/it]

In [None]:
# Plot histogram of predicted labels
plt.figure(figsize=(8, 5))
plt.hist(output_df['label'], bins=[1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5], edgecolor='black', rwidth=0.9)
plt.xticks([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])
plt.xlabel("Predicted MOS Score")
plt.ylabel("Frequency")
plt.title("Distribution of Predicted Grammar Scores")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Sort by filename (or any order) for trendline visualization
df_sorted = output_df.sort_values('filename')

plt.figure(figsize=(10, 4))
plt.plot(df_sorted['label'].values, marker='o', linestyle='-', color='teal')
plt.xlabel("Test Sample Index (sorted by filename)")
plt.ylabel("Predicted MOS Score")
plt.title("Trend of Predicted Grammar Scores")
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# a statistical summary
plt.figure(figsize=(4, 6))
plt.boxplot(output_df['label'], vert=True, patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.ylabel("Predicted MOS Score")
plt.title("Boxplot of Predicted Grammar Scores")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()