In [14]:
import torch
import librosa
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [16]:
# Audio / feature-extraction settings shared by the cells below.
SR = 22050  # sampling rate (Hz)
DURATION = 30  # seconds of audio used per clip
N_MFCC = 40  # MFCC coefficients kept per frame
HOP_LENGTH = 512  # hop between consecutive frames, in samples

# Frames needed to span one clip: ceil(SR * DURATION / HOP_LENGTH), i.e. 1292 for 30 s.
MAX_FRAMES = int(-(-(SR * DURATION) // HOP_LENGTH))

In [17]:
class ConvLSTMGenre(nn.Module):
    """Genre classifier: two Conv1d stages, a bidirectional LSTM and a two-layer head.

    Input is a batch of MFCC sequences shaped (batch, n_mfcc, time); the output
    holds one raw score per class, shaped (batch, n_classes).

    Args:
        n_mfcc: number of input channels (MFCC coefficients per frame).
        hidden_size: LSTM hidden size per direction.
        n_classes: number of output classes.
        n_lstm_layers: number of stacked LSTM layers.
        dropout: dropout probability used before the last linear layer and,
            when more than one LSTM layer is stacked, between LSTM layers.
    """

    def __init__(self, n_mfcc=N_MFCC, hidden_size=128, n_classes=10, n_lstm_layers=2, dropout=0.3):
        super().__init__()

        # Stage 1: MFCC coefficients are the Conv1d channels, convolution runs over time.
        self.conv1 = nn.Conv1d(n_mfcc, 128, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(2)  # time -> time / 2

        # Stage 2: widen to 256 channels and halve the time axis once more.
        self.conv2 = nn.Conv1d(128, 256, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(256)
        self.pool2 = nn.MaxPool1d(2)  # time / 2 -> time / 4

        # Recurrent part runs over the (shortened) time axis, batch first.
        # Inter-layer dropout is only passed when there is more than one layer.
        self.hidden_size = hidden_size
        lstm_dropout = dropout if n_lstm_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=256,
            hidden_size=hidden_size,
            num_layers=n_lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
        )

        # Classification head; the LSTM emits 2 * hidden_size features (both directions).
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(2 * hidden_size, 128)
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Map MFCC input of shape (batch, n_mfcc, time) to class scores (batch, n_classes)."""
        feats = self.pool1(F.relu(self.bn1(self.conv1(x))))      # (batch, 128, time/2)
        feats = self.pool2(F.relu(self.bn2(self.conv2(feats))))  # (batch, 256, time/4)

        # Swap channel and time axes so the LSTM sees (batch, seq_len, 256).
        sequence = feats.transpose(1, 2)

        # Only the per-step outputs are needed; the final hidden/cell states are discarded.
        lstm_out, _ = self.lstm(sequence)  # (batch, seq_len, hidden*2)

        # Average over the time axis to get one vector per clip.
        pooled = lstm_out.mean(dim=1)  # (batch, hidden*2)

        hidden = self.dropout(F.relu(self.fc1(pooled)))
        return self.fc2(hidden)

In [18]:
import torch

# Read the saved checkpoint onto the CPU.
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects, which can run code embedded in the file -- only use this with a
# checkpoint from a trusted source.
checkpoint = torch.load("/content/Music_genre_model.pth", map_location='cpu', weights_only=False)

# Rebuild the network from the hyperparameters stored alongside the weights.
loaded_model = ConvLSTMGenre(
    **{key: checkpoint[key] for key in ('n_mfcc', 'hidden_size', 'n_classes', 'n_lstm_layers')}
)

# Restore the trained parameters and switch to evaluation mode.
loaded_model.load_state_dict(checkpoint['model_state_dict'])
loaded_model.eval()

# Class labels saved with the model; indexed by predicted class id further down.
classes = checkpoint['label_classes']


In [19]:
def _pad_time_axis(arr, target_len):
    """Zero-pad ``arr`` at the end of axis 0 until it has ``target_len`` entries.

    Arrays that already have at least ``target_len`` entries along axis 0 are
    returned unchanged; trimming is left to the caller.
    """
    missing = int(target_len - arr.shape[0])
    if missing <= 0:
        return arr
    # Pad only the first axis; any remaining axes are left untouched.
    pad_spec = [(0, missing)] + [(0, 0)] * (arr.ndim - 1)
    return np.pad(arr, pad_spec, mode='constant')


def load_and_extract_mfcc(path, augment=False):
    """Load an audio clip and return its MFCC matrix with a fixed number of frames.

    The clip is loaded at ``SR`` Hz, limited to the first ``DURATION`` seconds and
    zero-padded to exactly ``SR * DURATION`` samples when shorter. The MFCCs are
    then padded with zeros or truncated along time to ``MAX_FRAMES`` frames.

    Args:
        path: audio file path, passed straight to ``librosa.load``.
        augment: when True the waveform is passed through ``augment_audio(y, sr)``
            before feature extraction and re-fitted to ``SR * DURATION`` samples.
            NOTE(review): ``augment_audio`` is not defined in the cells visible
            here; it must exist before calling with ``augment=True``.

    Returns:
        ``np.float32`` array of shape ``(MAX_FRAMES, N_MFCC)``.

    Raises:
        Any exception raised by ``librosa.load`` propagates unchanged.
    """
    target_samples = SR * DURATION

    # Errors from librosa propagate as-is; the old try/except only re-raised them.
    y, sr = librosa.load(path, sr=SR, duration=DURATION)

    # Short clips are padded with trailing silence before any augmentation.
    y = _pad_time_axis(y, target_samples)

    if augment:
        y = augment_audio(y, sr)
        # Augmentation may change the length, so pad and trim back to the target.
        y = _pad_time_axis(y, target_samples)[:target_samples]

    # librosa returns (n_mfcc, frames); callers expect time-major float32.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, hop_length=HOP_LENGTH)
    mfcc = mfcc.T.astype(np.float32)

    # Fix the frame count so every clip yields the same shape.
    return _pad_time_axis(mfcc, MAX_FRAMES)[:MAX_FRAMES]  # shape: (MAX_FRAMES, N_MFCC)


In [22]:
def predict_genre(file_path):
    """Return the entry of ``classes`` predicted by ``loaded_model`` for one audio file."""
    # Features come back time-major; the model wants (n_mfcc, time), hence the transpose.
    features = load_and_extract_mfcc(file_path).T
    batch = torch.from_numpy(features).unsqueeze(0)  # leading batch dimension of size 1

    loaded_model.eval()
    with torch.no_grad():
        scores = loaded_model(batch)
        best_index = scores.argmax(dim=1).item()

    return classes[best_index]


In [23]:
# Try the classifier on one jazz clip.
song_path = "/content/jazz.00001.wav"
predicted_genre = predict_genre(song_path)
print("Predicted genre:", predicted_genre)

Predicted genre: jazz


In [21]:
# Try the classifier on one pop clip.
song_path = "/content/pop.00003.wav"
predicted_genre = predict_genre(song_path)
print("Predicted genre:", predicted_genre)

Predicted genre: pop
