In [1]:
import pandas as pd
import librosa
import numpy as np
import os
import torch

In [3]:
def pad_or_truncate(cqt, fixed_timesteps):
    """Force a 2-D feature matrix to exactly ``fixed_timesteps`` columns.

    Despite the parameter name, this notebook uses the helper for both the
    MFCC and the CQT matrices.

    Args:
        cqt: 2-D array whose second axis is the one to be resized.
        fixed_timesteps: Required length of the second axis.

    Returns:
        The leading ``fixed_timesteps`` columns when the input is longer
        (a slice of the input), otherwise a new array with zero-valued
        columns appended on the right until the target length is reached.
    """
    n_frames = cqt.shape[1]

    # Short (or exactly sized) input: append the missing columns as zeros.
    if n_frames <= fixed_timesteps:
        missing = fixed_timesteps - n_frames
        return np.pad(cqt, ((0, 0), (0, missing)), mode='constant')

    # Long input: keep only the leading columns.
    return cqt[:, :fixed_timesteps]

In [4]:
def extract_features(audio_path, n_mfcc=13, fixed_timesteps=150):
    """Load an audio file and compute fixed-width MFCC and CQT feature maps.

    The audio is loaded at its native sample rate (``sr=None``). Both feature
    matrices are cut or zero-padded along the frame axis to the same
    ``fixed_timesteps`` so that they always have a fixed number of columns.

    Args:
        audio_path: Path of the audio file to analyse.
        n_mfcc: Number of MFCC coefficients (rows of the MFCC matrix).
        fixed_timesteps: Number of frames (columns) kept for both matrices.

    Returns:
        Tuple ``(mfcc, cqt)``:
          * ``mfcc`` - array of shape ``(n_mfcc, fixed_timesteps)``.
          * ``cqt``  - dB-scaled constant-Q magnitude with
            ``fixed_timesteps`` columns; the row count is whatever
            ``librosa.cqt`` produces with its default settings.

    Note:
        The defaults reproduce the values previously hard-coded here.
        ``AudioSpoofingRes2Net`` in this notebook uses a fixed-size first
        linear layer, so other values will not fit that model.
    """
    y, sr = librosa.load(audio_path, sr=None)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfcc = pad_or_truncate(mfcc, fixed_timesteps)

    # librosa.cqt returns complex coefficients; take the magnitude before
    # converting to decibels.
    cqt = librosa.cqt(y, sr=sr)
    cqt = librosa.amplitude_to_db(np.abs(cqt))
    cqt = pad_or_truncate(cqt, fixed_timesteps)
    return mfcc, cqt

In [5]:
import torch.nn as nn
import torch.nn.functional as F

class Res2NetBlock(nn.Module):
    """Res2Net-inspired block: 1x1 conv, chained 3x3 group convs, 1x1 conv, 2x2 max-pool.

    After the first 1x1 convolution the channels are split into groups of
    ``out_channels // scales``. Group 0 is passed through untouched. Groups
    ``1 .. scales-1`` are processed as a chain: each 3x3 conv receives the
    previous conv's output added to the next group. Only group 0 and the
    output of the *last* conv in the chain are concatenated and fed to the
    final 1x1 convolution (intermediate chain outputs are not kept).

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        scales: Number of channel groups; must be at least 2. If
            ``out_channels`` is not a multiple of ``scales``, the leftover
            channels of the first convolution are ignored.

    Raises:
        ValueError: If ``scales < 2`` or ``out_channels < scales``.

    Shape:
        Input ``(N, in_channels, H, W)`` -> output
        ``(N, out_channels, H // 2, W // 2)``.
    """

    def __init__(self, in_channels, out_channels, scales=4):
        super(Res2NetBlock, self).__init__()
        if scales < 2:
            # With a single group the chain in forward() would never run.
            raise ValueError(f"scales must be >= 2, got {scales}")
        if out_channels // scales < 1:
            raise ValueError(
                f"out_channels ({out_channels}) must be >= scales ({scales})"
            )
        self.scales = scales
        # Channels per group; plain int, so it does not affect the state_dict.
        self.width = out_channels // scales

        # First 1x1 conv layer to project the input to out_channels
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)

        # One 3x3 conv (+ BN) per chained group (groups 1 .. scales-1)
        self.convs = nn.ModuleList([
            nn.Conv2d(self.width, self.width, kernel_size=3, padding=1, bias=False)
            for _ in range(scales - 1)
        ])
        self.bns = nn.ModuleList([nn.BatchNorm2d(self.width) for _ in range(scales - 1)])

        # Last 1x1 conv layer. forward() concatenates exactly two groups
        # (group 0 and the end of the chain), so the input width is
        # 2 * width. This equals the former ``out_channels // 2`` when
        # scales == 4, but is also correct for any other scale count.
        self.conv3 = nn.Conv2d(2 * self.width, out_channels, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)

        # ReLU activation
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        """Apply the block to ``x`` and return the pooled feature map."""
        # Initial 1x1 convolution to transform input channels
        out = self.relu(self.bn1(self.conv1(x)))

        # Split the feature map into channel groups of equal width
        xs = torch.split(out, self.width, dim=1)

        # Chain the groups: each conv sees the previous result plus the next group
        output = None
        for i in range(1, self.scales):
            branch_in = xs[i] if output is None else output + xs[i]
            output = self.relu(self.bns[i - 1](self.convs[i - 1](branch_in)))

        # Concatenate the untouched first group with the end of the chain
        out = torch.cat((xs[0], output), dim=1)

        # Final 1x1 convolution to recombine the features
        out = self.relu(self.bn3(self.conv3(out)))

        # Apply max pooling to halve the spatial size
        out = self.pool(out)
        return out

class AudioSpoofingRes2Net(nn.Module):
    """Two-branch binary classifier over MFCC and CQT feature maps.

    Each input goes through its own ``Res2NetBlock`` (1 -> 32 channels,
    4 scales). The two results are flattened, concatenated and passed through
    three fully connected layers ending in a sigmoid.

    The first linear layer expects exactly 115200 flattened features, so the
    spatial sizes of the two inputs are fixed by that number.
    """

    def __init__(self):
        super().__init__()

        # Independent feature extractors, one per input representation.
        self.mfcc_res2net = Res2NetBlock(in_channels=1, out_channels=32, scales=4)
        self.cqt_res2net = Res2NetBlock(in_channels=1, out_channels=32, scales=4)

        # Classifier head: 115200 -> 4096 -> 128 -> 1.
        self.fc1 = nn.Linear(115200, 4096)
        self.fc2 = nn.Linear(4096, 128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, mfcc, cqt):
        """Return a sigmoid score of shape ``(batch, 1)`` for the given inputs.

        Args:
            mfcc: Single-channel MFCC batch, ``(batch, 1, H, W)``.
            cqt: Single-channel CQT batch, ``(batch, 1, H, W)``.
        """
        # Convolutional branches.
        mfcc_maps = self.mfcc_res2net(mfcc)
        cqt_maps = self.cqt_res2net(cqt)

        # Collapse everything but the batch axis, then fuse the two branches.
        mfcc_flat = mfcc_maps.view(mfcc_maps.size(0), -1)
        cqt_flat = cqt_maps.view(cqt_maps.size(0), -1)
        fused = torch.cat((mfcc_flat, cqt_flat), dim=1)

        # Fully connected head; the sigmoid turns the logit into a score in [0, 1].
        hidden = F.relu(self.fc1(fused))
        hidden = F.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))


In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AudioSpoofingRes2Net().to(device)
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state_dict needs; it avoids executing arbitrary pickled code from the
# checkpoint file and silences the torch.load FutureWarning.
model.load_state_dict(torch.load('25epoch.pth', map_location=device, weights_only=True))
# Inference mode: BatchNorm layers use their stored running statistics.
model.eval()
print(device)

  model.load_state_dict(torch.load('25epoch.pth', map_location=device))


cuda


In [9]:
def predict_single(audio_path):
    """Classify one audio file as ``"Spoof"`` or ``"Bonafide"``.

    Relies on the notebook-level ``model`` and ``device`` objects and on
    ``extract_features`` for the MFCC / CQT inputs.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        ``"Spoof"`` when the model score is at least 0.5, else ``"Bonafide"``.
    """
    mfcc, cqt = extract_features(audio_path)

    # (H, W) -> (1, 1, H, W): add batch and channel axes expected by Conv2d.
    mfcc_batch = mfcc[np.newaxis, np.newaxis, ...]
    cqt_batch = cqt[np.newaxis, np.newaxis, ...]

    mfcc_tensor = torch.tensor(mfcc_batch, dtype=torch.float32).to(device)
    cqt_tensor = torch.tensor(cqt_batch, dtype=torch.float32).to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        score = model(mfcc_tensor, cqt_tensor)
        # Threshold the sigmoid score: 1 for spoof, 0 for bonafide.
        label = (score.squeeze() >= 0.5).float().item()

    if label == 1:
        return "Spoof"
    return "Bonafide"

# Smoke test: classify a single audio file and print the predicted label.
# 'fake.mp3' is a relative path, so it is resolved against the notebook's
# current working directory.
audio_path = 'fake.mp3'
result = predict_single(audio_path)
print(f"The audio file is classified as: {result}")

The audio file is classified as: Spoof
