In [32]:
import os

audio_path = r'E:\大创\data\MP3'  # 确保这是正确的路径
audio_paths = []
label_texts = []

# 遍历指定的音频文件夹
for folder in os.listdir(audio_path):
    folder_path = os.path.join(audio_path, folder)

    # 确保只处理文件夹，跳过文件
    if os.path.isdir(folder_path) and folder.endswith("new"):
        print(f"Processing folder: {folder}")
        label = int(folder.split("_")[0])  # 假设标签在文件夹名中，以 "_" 分隔

        # 遍历该文件夹中的音频文件
        for audio_file in os.listdir(folder_path):
            audio_file_path = os.path.join(folder_path, audio_file)

            # 只处理音频文件（可以根据实际文件扩展名调整条件）
            if audio_file.endswith(".mp3"):
                print(f"Adding file: {audio_file_path}")
                audio_paths.append(audio_file_path)
                label_texts.append(label - 1)

        # 如果你只想处理文件夹下的第一个音频文件，可以在这里加上 break
        # break

# 输出结果
print(f"Total audio files: {len(audio_paths)}")
print(f"Total labels: {len(label_texts)}")


Processing folder: 1_new
Adding file: E:\大创\data\MP3\1_new\0.mp3
Adding file: E:\大创\data\MP3\1_new\1.mp3
Adding file: E:\大创\data\MP3\1_new\10.mp3
Adding file: E:\大创\data\MP3\1_new\100.mp3
Adding file: E:\大创\data\MP3\1_new\101.mp3
Adding file: E:\大创\data\MP3\1_new\102.mp3
Adding file: E:\大创\data\MP3\1_new\103.mp3
Adding file: E:\大创\data\MP3\1_new\104.mp3
Adding file: E:\大创\data\MP3\1_new\105.mp3
Adding file: E:\大创\data\MP3\1_new\106.mp3
Adding file: E:\大创\data\MP3\1_new\107.mp3
Adding file: E:\大创\data\MP3\1_new\108.mp3
Adding file: E:\大创\data\MP3\1_new\109.mp3
Adding file: E:\大创\data\MP3\1_new\11.mp3
Adding file: E:\大创\data\MP3\1_new\110.mp3
Adding file: E:\大创\data\MP3\1_new\111.mp3
Adding file: E:\大创\data\MP3\1_new\112.mp3
Adding file: E:\大创\data\MP3\1_new\113.mp3
Adding file: E:\大创\data\MP3\1_new\114.mp3
Adding file: E:\大创\data\MP3\1_new\115.mp3
Adding file: E:\大创\data\MP3\1_new\116.mp3
Adding file: E:\大创\data\MP3\1_new\117.mp3
Adding file: E:\大创\data\MP3\1_new\118.mp3
Adding file: E:

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AudioClassifier(nn.Module):
    def __init__(self, num_classes=4):
        super(AudioClassifier, self).__init__()

        # 输入形状为 [batch_size, 128, 2813] -> 128 个特征，2813 个时间步
        self.conv1 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)  # 128 -> 256
        self.drop1 = nn.Dropout(0.2)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)  # 256 -> 512
        self.drop2 = nn.Dropout(0.2)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)  # 最大池化减少维度

        # 添加归一化层
        self.batch_norm1 = nn.BatchNorm1d(128)  # 归一化输入的 128 个特征
        self.batch_norm2 = nn.BatchNorm1d(256)  # 归一化卷积后的 256 个特征
        self.batch_norm3 = nn.BatchNorm1d(512)  # 归一化卷积后的 512 个特征

        # 分类头
        self.fc1 = nn.Linear(512 * (2813 // 4), 1024)  # 池化后维度大小，假设池化两次减少了四分之一
        self.fc2 = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = x.transpose(1, 2)  # 转换为 [batch_size, 128, 2813]，以适应 Conv1D

        # 添加归一化
        x = self.batch_norm1(x)  # 归一化
        x = self.conv1(x)
        x = F.relu(x)
        x = self.pool(x)  # 最大池化

        x = self.batch_norm2(x)  # 归一化
        x = self.conv2(x)
        x = F.relu(x)
        x = self.pool(x)  # 最大池化

        x = x.view(x.size(0), -1)  # 展平为一维向量
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)  # 输出四分类的 logits

        return x



# # 示例使用
# audio_data = torch.randn(16, 2813, 128)  # 输入形状为 [16, 2813, 128]
# model = AudioClassifier(num_classes=4)
# logits = model(audio_data)
#
# # 输出类别
# probs = F.softmax(logits, dim=1)  # 计算每个类别的概率
# predicted_classes = torch.argmax(probs, dim=1)  # 选择最大概率的类别
# print("Logits:", logits)  # 输出 logits（原始得分）
# print("Predicted Classes:", predicted_classes)  # 输出预测的类别
# print("Probabilities:", probs)  # 输出每个类别的概率

In [34]:
from sklearn.model_selection import train_test_split
import librosa
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel

import torch
import torch.nn.functional as F

def pad_or_truncate_audio_features(audio_features, target_length=2813):
    """
    填充或裁剪音频特征使它们具有相同的目标长度。
    """
    current_length = audio_features.size(1)

    if current_length < target_length:
        # 使用零填充
        padding_size = target_length - current_length
        audio_features = F.pad(audio_features, (0, padding_size), "constant", 0)
    elif current_length > target_length:
        # 裁剪到目标长度
        audio_features = audio_features[:, :target_length]

    return audio_features


# 这里是音频特征提取的函数
def extract_audio_features(audio_path):
    # 加载音频文件
    y, sr = librosa.load(audio_path, sr=None)
    # 提取 Mel-spectrogram 特征
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    # 将其转换为 log-mel spectrogram
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    return log_mel_spectrogram

# 自定义 Dataset
class AudioTextDataset(Dataset):
    def __init__(self, audio_paths, labels, processor, label_map, target_length=2813):
        self.audio_paths = audio_paths
        self.labels = labels
        self.processor = processor
        self.label_map = label_map
        self.target_length = target_length  # 这里初始化 target_length

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):

        # 提取音频特征
        audio_features = extract_audio_features(self.audio_paths[idx])
        audio_features = torch.tensor(audio_features).float()

        # 填充或裁剪音频特征
        audio_features = pad_or_truncate_audio_features(audio_features, self.target_length)


        # 获取标签文本并进行映射
        label = self.labels[idx]
        # label_idx = self.label_map[label]  # 将标签映射为对应的索引

        return torch.tensor(audio_features).float(), torch.tensor(label).int()
    def splitData(self, rate=0.8):
        train_audio_paths, test_audio_paths, train_labels, test_labels = train_test_split(
            self.audio_paths, self.labels, test_size=(1 - rate), random_state=42
        )

        # 返回新的训练集和测试集
        train_dataset = AudioTextDataset(train_audio_paths, train_labels, self.processor, self.label_map)
        test_dataset = AudioTextDataset(test_audio_paths, test_labels, self.processor, self.label_map)

        return train_dataset, test_dataset


In [35]:
# 创建数据集
dataset = AudioTextDataset(audio_paths, label_texts, processor=None, label_map=None,target_length=2813)

# 划分数据集
train_dataset, test_dataset = dataset.splitData(rate=0.8)

# 打印数据集长度
print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")

训练集大小: 1391
测试集大小: 348


In [36]:
train_dataloader ,test_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True),DataLoader(test_dataset,batch_size=16,shuffle=True)

In [37]:
import torch.optim as optim
# 创建模型实例
model = AudioClassifier(num_classes=4)

# 定义损失函数和优化器
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
epochs = 5
for epoch in range(epochs):
    model.train()  # 设置模型为训练模式
    total_loss = 0.0
    # 在训练过程中
    for batch in train_dataloader:
        audio_data, labels = batch
        # 将标签转换为 Long 类型
        labels = labels.long()

        # 调整输入的维度为 [batch_size, 128, 2813]
        audio_data = audio_data.permute(0, 2, 1)  # 从 [batch_size, 2813, 128] 转为 [batch_size, 128, 2813]

        # 清零梯度
        optimizer.zero_grad()

        # 前向传播
        logits = model(audio_data)

        # 计算损失
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        # 反向传播和优化
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dataloader)}")
# 在测试过程中
model.eval()  # 设置模型为评估模式
correct = 0
total = 0
with torch.no_grad():
    for batch in test_dataloader:
        audio_data, labels = batch
        # 将标签转换为 Long 类型
        labels = labels.long()

        audio_data = audio_data.permute(0, 2, 1)  # 调整输入的维度
        logits = model(audio_data)
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}%")

  return torch.tensor(audio_features).float(), torch.tensor(label).int()


Epoch 1/5, Loss: 18.154671928568654
Epoch 2/5, Loss: 1.0499676575534171
Epoch 3/5, Loss: 4.176999699350355
Epoch 4/5, Loss: 4.377821406953777
Epoch 5/5, Loss: 3.4635480953642737
Accuracy: 99.71264367816092%
