In [67]:
import torch
import torchaudio
import os
import numpy as np
import pandas as pd
from torch import nn
from pydub import AudioSegment

In [70]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('running on', device)

running on cpu


In [71]:
class CNN(nn.Module):
    """Convolutional classifier producing 10 raw scores (logits) per input.

    The feature extractor is four Conv2d -> Tanh -> MaxPool2d(2) stages followed
    by a Flatten; the head is two stacked Linear layers. The head's input size
    (12*12*128) means the network expects single-channel 258x258 inputs:

        258 -conv7-> 252 -pool-> 126 -conv5-> 122 -pool-> 61
            -conv6-> 56  -pool-> 28  -conv5-> 24  -pool-> 12

    The layer order inside both ``nn.Sequential`` containers determines the
    state_dict keys, so it must not be changed if existing checkpoints are to
    keep loading.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels, kernel_size):
        """Return one Conv2d -> Tanh -> MaxPool2d(2) stage as a list of layers."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=1),
            nn.Tanh(),
            nn.MaxPool2d(2),
        ]

    def __init__(self):
        super().__init__()

        feature_layers = (
            self._conv_stage(1, 16, 7)        # 258 -> 252 -> 126, 16 channels
            + self._conv_stage(16, 32, 5)     # 126 -> 122 -> 61,  32 channels
            + self._conv_stage(32, 64, 6)     # 61  -> 56  -> 28,  64 channels
            + self._conv_stage(64, 128, 5)    # 28  -> 24  -> 12,  128 channels
        )
        # Flatten collapses (channels, height, width) into one feature axis.
        self.cnn_layers = nn.Sequential(*feature_layers, nn.Flatten())

        # No activation between the two Linear layers (as in the original design).
        self.linear_layers = nn.Sequential(
            nn.Linear(12*12*128, 2048),
            nn.Linear(2048, 10)
        )

    def forward(self, x):
        """Map a (batch, 1, 258, 258) tensor to (batch, 10) logits."""
        features = self.cnn_layers(x)
        # Already flat after nn.Flatten; the view keeps the (batch, -1) shape explicit.
        flat = features.view(features.size(0), -1)
        return self.linear_layers(flat)

In [107]:
path_model = 'models/nn_classifier.pth'
path_audio = 'test_sounds/rahu.wav'

# SECURITY: torch.load unpickles the file, and unpickling can execute arbitrary
# code. Only load checkpoints that come from a trusted source.
# The result is called like a module further down (model(...)), so the file is
# expected to hold a whole pickled nn.Module rather than a bare state_dict;
# presumably an instance of the CNN class defined in this notebook, which must
# therefore be defined before this cell runs.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules — confirm against the installed version.
model = torch.load(path_model, map_location=device)

# The notebook only runs inference, so put the network in evaluation mode.
model = model.eval()

In [108]:

# Genre label for each network output: genres[i] names output column i when the
# results table is built, so the order here is significant.
# NOTE(review): presumably this matches the label encoding used in training — confirm.
genres = [
    'disco',
    'metal',
    'blues',
    'jazz',
    'country',
    'hiphop',
    'rock',
    'classical',
    'pop',
    'reggae',
]

In [109]:
# INPUT AUDIO PREPARATIONS

input_audio = AudioSegment.from_wav(path_audio)

# transformation to mono, downsampling to 22050 Hz
input_audio = input_audio.set_channels(1).set_frame_rate(22050)

# splitting into chunks of 3 seconds for processing in NN
# (len() of an AudioSegment and its slice indices are in milliseconds)
n_chunks = len(input_audio)//3000

# NOTE(review): only the first n_chunks-1 chunks are exported, and the later cells
# loop over the same range, so the last complete 3 s window is never analysed.
# Confirm whether dropping it is intentional before changing the bound.
if n_chunks < 2:
    # Without this guard no chunk is written and the notebook would go on to
    # average over an empty batch, silently producing NaN percentages.
    raise ValueError(
        'input audio is too short: at least 6 seconds are required, got {} ms'.format(len(input_audio))
    )

# The export below fails if the target directory does not exist yet.
os.makedirs('temp_audio_files', exist_ok=True)

for i in range(n_chunks-1):
    start_ms = i*3000
    chunk = input_audio[start_ms: start_ms + 3000]
    # export() hands back the open output file; close it so the data is flushed
    # and the handle released before the chunk is read again.
    chunk.export('temp_audio_files/file{}.wav'.format(i), format='wav').close()

In [110]:
# GENERATING SPECTROGRAMS
# +++ NOTICE THAT THE FFT MUST BE THE SAME AS FOR THE TRAINING DATA SET (515) TO GET THE SAME TENSOR DIMENSION +++

# The transform does not depend on the chunk, so build it once instead of once
# per file. power=None keeps the complex-valued spectrogram.
# NOTE(review): with the transform's default hop length, a 3 s mono chunk at
# 22050 Hz should give a (1, 258, 258) spectrogram, the shape the batch tensor
# in the next cell expects — confirm for the installed torchaudio version.
to_spectrogram = torchaudio.transforms.Spectrogram(n_fft=515, power=None, return_complex=True)

spectrograms = []

# Same range as the export loop, so exactly the chunk files written there are read.
for i in range(n_chunks-1):
    wf, sr = torchaudio.load('temp_audio_files/file{}.wav'.format(i))
    spec = to_spectrogram(wf)
    spectrograms.append(spec)

In [111]:
# An empty batch would make every mean below NaN; fail loudly instead.
if not spectrograms:
    raise ValueError('no spectrograms to classify: the input audio produced no chunks')

# create empty complex tensor of shape (chunks, 1, 258, 258) ...
x = torch.empty((len(spectrograms), 1, 258, 258), dtype=torch.complex64)

# ... and fill it with the spectrograms of the input audio
for i, s in enumerate(spectrograms):
    x[i] = s

# analyse the tensor with neural network
with torch.no_grad():
    # The model was loaded onto `device`; the input has to live there as well,
    # otherwise the forward pass fails whenever a GPU is selected.
    # NOTE(review): only the real part of the complex spectrogram is fed to the
    # network and the imaginary part is discarded — confirm this matches training.
    result = model(x.real.to(device))
    # turn the logits into per-chunk class probabilities
    result = nn.Softmax(dim=1)(result)
    # bring the result back to host memory before converting to NumPy
    result = result.cpu().numpy()

# export .csv file with percentages of music genre
# (one row per genre: name, mean probability over all chunks in percent, 2 decimals)
table = [
    [genre, round((np.mean(result[:, i])*100), 2)]
    for i, genre in enumerate(genres)
]
pd.DataFrame(table).to_csv('result.csv', index=False, header=False)

In [112]:
# remove audio file chunks
# Every entry found in the temp directory is deleted, not only the chunk files
# written by this notebook.
for leftover in os.listdir('temp_audio_files'):
    os.remove('temp_audio_files/{}'.format(leftover))