# 睿抗 2025 智海人工智能算法应用赛
**语音情绪识别**：推理测试代码  
在这个笔记本上测试通过后，直接把关键部分复制到main.py即可

In [None]:
import torch
import random
import numpy as np

In [2]:
def set_seed(seed=42):
    """Seed every random number generator used here and pin cuDNN behaviour.

    Args:
        seed: Integer handed unchanged to the Python, NumPy and PyTorch
            (CPU and CUDA) seeding functions.
    """
    seeders = (
        random.seed,                 # Python built-in RNG
        np.random.seed,              # NumPy global RNG
        torch.manual_seed,           # PyTorch RNG on the CPU
        torch.cuda.manual_seed,      # PyTorch RNG on the current GPU
        torch.cuda.manual_seed_all,  # PyTorch RNG on every GPU
    )
    for seeder in seeders:
        seeder(seed)

    # Ask cuDNN for repeatable convolution results and switch off its
    # auto-tuner, which could otherwise pick different kernels per run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed once with the default value as soon as the helper exists.
set_seed()

In [3]:
import torchaudio.transforms as T
import torchaudio


========================================  **测试提交函数示例**  =========================================== 

In [2]:
# ! unzip results/wav2vec2-base.zip

Archive:  results/wav2vec2-base.zip
   creating: wav2vec2-base/
  inflating: wav2vec2-base/tokenizer_config.json  
  inflating: wav2vec2-base/vocab.json  
  inflating: wav2vec2-base/special_tokens_map.json  
   creating: wav2vec2-base/.ipynb_checkpoints/
  inflating: wav2vec2-base/README.md  
  inflating: wav2vec2-base/config.json  
  inflating: wav2vec2-base/preprocessor_config.json  
  inflating: wav2vec2-base/pytorch_model.bin  


In [1]:
# ! ls -lh results/wav2vec2_fold5.pt

-rw-r--r-- 1 jovyan 1000 362M Jun 28 12:50 results/wav2vec2_fold5.pt


### 推理函数

In [1]:
import torch, torchaudio, torch.nn.functional as F, random
import torch, torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor, Wav2Vec2Config
from tqdm import tqdm

# ------------ Basic settings ------------
# Class names; predict() maps the arg-max class index back through this list,
# so the order presumably has to match the label order used in training
# (TODO confirm against the training code).
label_names = ['anger', 'fear', 'happy', 'neutral', 'sad']
PRETRAINED = "results/wav2vec2-base"          # can be swapped for Chinese / multilingual weights
# Pre-processing object loaded from the same directory as the encoder config.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED)
# Inference is pinned to the CPU in this notebook.
device = "cpu"
print(device)

# ------------ 模型定义 ------------
class AttentivePool(nn.Module):
    """Attention pooling over time: weights = softmax(Linear(tanh(Linear(h)))).

    Args:
        hidden: Size of the last (feature) dimension of the input.
    """

    def __init__(self, hidden):
        super().__init__()
        bottleneck = hidden // 2
        self.linear1 = nn.Linear(hidden, bottleneck)
        self.linear2 = nn.Linear(bottleneck, 1)

    def forward(self, h):
        """Collapse ``h`` of shape [B, T, C] into a weighted sum of shape [B, C]."""
        projected = self.linear1(h).tanh()
        scores = self.linear2(projected).squeeze(-1)        # [B, T]
        weights = scores.softmax(dim=1).unsqueeze(-1)       # [B, T, 1], sums to 1 over T
        weighted = h * weights
        return weighted.sum(1)                              # [B, C]

class HF_Wav2Vec2SER(nn.Module):
    """Emotion classifier: Wav2Vec2 encoder -> pooling over time -> MLP head.

    Args:
        num_cls: Number of output classes (width of the final linear layer).
        freeze_feat: When true, gradients are disabled for the encoder's
            ``feature_extractor`` sub-module.
        pool: ``"mean"`` averages over time, ``"stat"`` concatenates mean and
            standard deviation over time; any other value (the default is
            ``"attn"``) selects ``AttentivePool``.
    """

    def __init__(self, num_cls=5, freeze_feat=True, pool="attn"):
        super().__init__()
        # Only the architecture is read from PRETRAINED; the encoder is built
        # from that config alone, so trained weights are expected to be
        # supplied afterwards (e.g. through load_state_dict).
        self.encoder = Wav2Vec2Model(Wav2Vec2Config.from_pretrained(PRETRAINED))
        if freeze_feat:
            self.encoder.feature_extractor.requires_grad_(False)

        feat_dim = self.encoder.config.hidden_size      # 768 per the original note

        # ------- pooling strategy -------
        if pool == "mean":
            self.pool = lambda seq: seq.mean(1)
        elif pool == "stat":
            self.pool = lambda seq: torch.cat([seq.mean(1), seq.std(1)], dim=-1)
            feat_dim = feat_dim * 2                     # mean and std are concatenated
        else:  # "attn"
            self.pool = AttentivePool(feat_dim)

        # ------- classification head -------
        head_layers = [
            nn.LayerNorm(feat_dim),
            nn.Linear(feat_dim, 256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_cls),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, wav, attn_mask):
        """Return class logits for ``wav``; ``attn_mask`` is forwarded to the encoder."""
        encoded = self.encoder(wav, attention_mask=attn_mask, return_dict=True)
        frames = encoded.last_hidden_state              # [B, T, C]
        pooled = self.pool(frames)
        return self.classifier(pooled)
    
# Load the models
# ------------ Load the fold checkpoints ------------
# NOTE: the original heading said "5 folds", but range(1, 4) only loads
# folds 1, 2 and 3 (wav2vec2_fold1.pt .. wav2vec2_fold3.pt).
models = []
for i in tqdm(range(1, 4)):
    m = HF_Wav2Vec2SER(len(label_names)).to(device)
    # Alternative backbone, currently disabled:
#     m = HF_HuBERT_SER(len(label_names)).to(device)
    # strict=True: every key in the checkpoint must match the module exactly.
    m.load_state_dict(torch.load(f"results/wav2vec2_fold{i}.pt", map_location=device), strict=True)
    # Switch to eval mode so Dropout in the classifier head is inactive.
    m.eval()
    models.append(m)

print(f"✓ 成功加载 {len(models)} 个折模型")

@torch.no_grad()
def predict(audio: torch.Tensor, sr: int) -> str:
    """Classify the emotion of one audio clip with the loaded fold ensemble.

    The clip is resampled to 16 kHz, reduced to a single channel, cropped or
    zero-padded to exactly 3 seconds, passed through ``feature_extractor``
    and scored by every model in ``models``. The class with the highest
    summed softmax probability is returned.

    Args:
        audio: Waveform tensor, either 1-D ``[time]`` or 2-D
            ``[channels, time]`` (channels on dim 0, which is the axis the
            mixdown averages over).
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        The predicted label, one entry of ``label_names``.
    """
    target_sr, max_len = 16_000, 16_000 * 3    # 16 kHz, 3-second window

    # Pre-processing: resample -> single channel -> crop / pad
    if sr != target_sr:
        audio = torchaudio.functional.resample(audio, sr, target_sr)

    # Always drop the channel axis of a 2-D input. Previously a mono [1, T]
    # tensor kept its leading axis, so the crop below sliced the channel axis
    # (a no-op) and clips longer than max_len were never shortened.
    if audio.dim() == 2:
        audio = audio.mean(0) if audio.size(0) > 1 else audio.squeeze(0)

    # Crop or right-pad with zeros along the time (last) axis.
    n_samples = audio.size(-1)
    if n_samples > max_len:
        audio = audio[..., :max_len]
    else:
        audio = F.pad(audio, (0, max_len - n_samples))

    # Feature extraction
    inputs = feature_extractor(
        [audio.squeeze().numpy()],
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True,
        return_attention_mask=True
    )
    wav, attn = inputs.input_values.to(device), inputs.attention_mask.to(device)

    # Ensemble inference: softmax per model, then sum the probabilities
    # (arg-max of the sum equals arg-max of the mean).
    prob_sum = torch.zeros((1, len(label_names)), device=device)
    for m in models:
        prob_sum += F.softmax(m(wav, attn), dim=-1)

    pred_idx = prob_sum.argmax(1).item()
    return label_names[pred_idx]

cpu


100%|██████████| 3/3 [00:16<00:00,  5.48s/it]

✓ 成功加载 3 个折模型





### 单条测试
用于测试能不能调通

In [9]:
# Smoke test: load one clip from the training set's "sad" folder and classify
# it; the notebook displays the returned label as the cell output.
predict_x, sr = torchaudio.load('./datasets/67fc7ccbb88b01da6626732d-momodel/train/sad/112.wav')
predict(predict_x, sr)

'sad'

### 批量测试
用于预估时间

In [3]:
import glob
import time
# ----------------- 基准测试 -----------------
def benchmark(n=10, root="./datasets/67fc7ccbb88b01da6626732d-momodel/train"):
    """Time ``predict`` on ``n`` randomly chosen .wav files found under ``root``.

    Args:
        n: Number of files to sample and time. ``random.sample`` raises
            ``ValueError`` when fewer than ``n`` files are available.
        root: Directory searched recursively for ``*.wav`` files.

    Prints the total wall-clock time and the average per clip; returns nothing.
    """
    candidates = glob.glob(f"{root}/**/*.wav", recursive=True)
    files = random.sample(candidates, n)

    # Warm-up: one untimed prediction so that first-call overhead stays out
    # of the measurement.
    warm_wav, warm_sr = torchaudio.load(files[0])
    predict(warm_wav, warm_sr)

    # Timed section: file loading and prediction are both included.
    started = time.perf_counter()
    for path in files:
        clip, clip_sr = torchaudio.load(path)
        predict(clip, clip_sr)
    total = time.perf_counter() - started

    print(f"▶  预测 {n:>2} 条：{total:.3f}s  |  平均 {total/n:.3f}s/条")

# Time 10 and then 20 randomly sampled clips to estimate per-clip latency.
benchmark(10)
benchmark(20)

2025-06-27 14:21:26.563955: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-27 14:21:26.565133: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-27 14:21:26.568989: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-27 14:21:26.578852: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-27 14:21:26.592752: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

▶  预测 10 条：35.675s  |  平均 3.568s/条
▶  预测 20 条：68.895s  |  平均 3.445s/条


### 最后测试main.py 文件，这里用main_test.py 暂时替代

In [3]:
from main_test import predict
import torchaudio
import torchaudio.transforms as T
# import torchaudio.functional as audiof
# import numpy as np

# Same smoke test as before, but `predict` here is the one imported from
# main_test in this cell, standing in for the final main.py.
predict_x, sr = torchaudio.load('./datasets/67fc7ccbb88b01da6626732d-momodel/train/sad/2.wav')
predict(predict_x, sr)

len models 5


'sad'