In [1]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

ModuleNotFoundError: No module named 'torch'

In [3]:
from NeuralNetwork.torch_cnn import MEL_CNN, CHR_CNN

ModuleNotFoundError: No module named 'torch'

In [None]:
# Image preprocessing for the mel-spectrogram pictures:
# single channel, resized to 128x128, then converted to a tensor.
mel_transform = transforms.Compose([
    transforms.Grayscale(1),
    transforms.Resize(size=(128, 128)),
    transforms.ToTensor(),
])

# Load the training and test image folders with the same preprocessing.
train_dataset = datasets.ImageFolder("./dataset/mel_format", transform=mel_transform)
test_dataset = datasets.ImageFolder("./dataset/mel_test", transform=mel_transform)

# Batches of 100 images; only the training data is shuffled.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=100)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=100)

In [None]:
# Initialise the model.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MEL_CNN().to(device)

In [None]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Train the model
weight = 5  # the test set is evaluated once every `weight` epochs
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean loss over the batches of this epoch.
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

    # Evaluate on the test set after every `weight`-th epoch. Written as
    # (epoch + 1) % weight so the schedule stays correct when `weight` is
    # changed (a fixed remainder of 4 only matches weight == 5).
    if (epoch + 1) % weight == 0:
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                # Index of the highest score per sample is the predicted class.
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print(f'Validation Accuracy: {100 * correct / total}%')

# Save the model weights; create the target directory first so a missing
# folder does not throw away a finished training run.
os.makedirs('./model', exist_ok=True)
torch.save(model.state_dict(), './model/torch_mel_model.pth')

In [None]:
# Image preprocessing for the chroma pictures:
# single channel, resized to 12x1293, then converted to a tensor.
chr_transform = transforms.Compose([
    transforms.Grayscale(1),
    transforms.Resize(size=(12, 1293)),
    transforms.ToTensor(),
])

# Load the chroma training set. Note that `train_dataset` and `train_loader`
# are rebound here and no longer refer to the mel data.
train_dataset = datasets.ImageFolder("./dataset/chr_format", transform=chr_transform)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=100)

# Chroma model with its own Adam optimizer (`optimizer` is rebound as well).
chr_model = CHR_CNN().to(device)
optimizer = optim.Adam(params=chr_model.parameters(), lr=0.0001)

In [None]:
# Train the chroma model.
# The loss is created here so this cell does not depend on the mel training
# cell having been executed (e.g. when the mel weights are loaded from disk).
criterion = nn.CrossEntropyLoss()
num_epochs = 20
for epoch in range(num_epochs):
    chr_model.train()
    total_loss = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = chr_model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean loss over the batches of this epoch.
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Leave the model in evaluation mode so later inference cells do not run it
# with training-time layer behaviour.
chr_model.eval()

# Save the model weights; create the target directory first so a missing
# folder does not throw away a finished training run.
os.makedirs('./model', exist_ok=True)
torch.save(chr_model.state_dict(), './model/torch_chr_model.pth')

In [None]:
# Restore the trained weights. `map_location=device` remaps the stored tensors
# onto the device in use, so checkpoints saved on a GPU also load on a
# CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load("./model/torch_mel_model.pth", map_location=device))
chr_model.load_state_dict(torch.load("./model/torch_chr_model.pth", map_location=device))

In [None]:
import numpy as np
import librosa
from PIL import Image
from random import randint

# Genre printed for each of the 10 model output indices.
GENRE_LABELS = ["pop", "classical", "pop", "pop", "pop", "jazz", "rock", "pop", "pop", "rock"]
MEL_WINDOW_FRAMES = 128    # width (spectrogram frames) of one crop fed to `model`
MEL_SECONDS_PER_CROP = 5   # one random crop is drawn per ~5 s of audio
CHR_WINDOW_FRAMES = 1293   # width (chroma frames) of one window fed to `chr_model`

# Inference must not use training-time layer behaviour; `chr_model` in
# particular is still in train mode after its training loop.
model.eval()
chr_model.eval()

y, sr = librosa.load("./dataset/test_long/DSM-V.mp3")
# y, sr = librosa.load("./dataset/audio_format/classical/classical.00001.wav")

# Mel spectrogram in dB relative to its maximum (all values are <= 0).
spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
mel_pic = librosa.power_to_db(spectrogram, ref=np.max)

# Chroma features computed on the harmonic component of the signal.
harmonic = librosa.effects.harmonic(y)
harmonic_features = librosa.feature.chroma_cqt(y=harmonic, sr=sr)

duration = len(y) / sr

# NOTE(review): Image.fromarray turns these float arrays into mode-'F' images;
# verify that the Grayscale/ToTensor steps scale them the same way the training
# images under ./dataset were produced -- TODO confirm.

# --- Mel model: one random crop from each equal-length segment of the track ---
mel_predictions = []
# At least one crop, so very short clips still yield a prediction.
times = max(round(duration / MEL_SECONDS_PER_CROP), 1)
# Highest valid crop start; clamped so clips narrower than one window use frame 0.
max_start = max(mel_pic.shape[1] - MEL_WINDOW_FRAMES, 0)
segment = max_start // times
for i in range(times):
    start_pos = randint(segment * i, segment * (i + 1))
    crop = mel_pic[:, start_pos:start_pos + MEL_WINDOW_FRAMES]
    img = mel_transform(Image.fromarray(crop)).unsqueeze(0).to(device)

    with torch.no_grad():
        mel_predictions.append(model(img).cpu().numpy())

# Average the per-crop outputs and zero out negative scores.
# NOTE(review): if the models emit raw logits, confirm that averaging and
# clamping them is the intended way to combine scores.
mel_predictions = np.mean(mel_predictions, axis=0)
mel_predictions[mel_predictions < 0] = 0
print(mel_predictions)
print(GENRE_LABELS[np.argmax(mel_predictions, axis=1)[0]])


# --- Chroma model: consecutive, non-overlapping windows over the whole track ---
chr_predictions = []
# Iterate over window indices (not seconds): window i covers frames
# [i * 1293, (i + 1) * 1293). A clip that is exactly one window long is
# included, and shorter clips fall back to a single (resized) window.
num_windows = max(harmonic_features.shape[1] // CHR_WINDOW_FRAMES, 1)
for i in range(num_windows):
    window = harmonic_features[:, i * CHR_WINDOW_FRAMES:(i + 1) * CHR_WINDOW_FRAMES]
    img = chr_transform(Image.fromarray(window)).unsqueeze(0).to(device)

    with torch.no_grad():
        chr_predictions.append(chr_model(img).cpu().numpy())

chr_predictions = np.mean(chr_predictions, axis=0)
chr_predictions[chr_predictions < 0] = 0
print(chr_predictions)
print(GENRE_LABELS[np.argmax(chr_predictions, axis=1)[0]])


# Ensemble: the chroma scores count twice as much as the mel scores.
class_index = np.argmax((chr_predictions * 2 + mel_predictions) / 3, axis=1)[0]
print(GENRE_LABELS[class_index])