In [5]:
!pip install torchvision



In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import ParameterGrid
import torchvision
from IPython.display import Audio
from torchvision.models import ResNet18_Weights


In [2]:
from utils import download_asvspoof2019_data

path = download_asvspoof2019_data()



Downloading from https://www.kaggle.com/api/v1/datasets/download/awsaf49/asvpoof-2019-dataset?dataset_version_number=1...


  3%|▎         | 706M/23.6G [02:03<1:08:34, 5.97MB/s] 

KeyboardInterrupt



In [12]:
from pydub import AudioSegment
from pydub.playback import play

# Load the .flac file
audio_file = "/Users/roeeseren/Documents/semester-seven/deep-learning/project/asvspoof/LA/LA/ASVspoof2019_LA_train/flac/LA_T_1004407.flac"
audio = AudioSegment.from_file(audio_file, format="flac")

audio

In [23]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

def audio_to_mel_spectrogram(audio_path, n_fft=400, hop_length=160, n_mels=64):
    """
    Convert an audio file to a mel-spectrogram using librosa.
    Args:
        audio_path (str): Path to the audio file.
        n_fft (int): FFT window size.
        hop_length (int): Number of samples between successive frames.
        n_mels (int): Number of mel filterbanks.
    Returns:
        np.ndarray: Mel-spectrogram as a numpy array.
    """
    # Load audio file
    waveform, sample_rate = librosa.load(audio_path, sr=None)

    # Generate mel-spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels
    )

    # Convert to log scale (dB)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Normalize to [0, 1]
    log_mel_spectrogram = (log_mel_spectrogram - log_mel_spectrogram.min()) / (log_mel_spectrogram.max() - log_mel_spectrogram.min())

    return log_mel_spectrogram

# Example usage
audio_path = audio_file
mel_spectrogram = audio_to_mel_spectrogram(audio_path)

# Visualize the mel-spectrogram
plt.figure(figsize=(10, 4))
librosa.display.specshow(mel_spectrogram, x_axis='time', y_axis='mel', sr=16000, hop_length=160, cmap='viridis')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-Spectrogram')
plt.show()

In [45]:
# Load train data
from utils import load_protocol
data_path = "/Users/roeeseren/Documents/semester-seven/deep-learning/project/asvspoof"
data_info_file_path = f"{data_path}/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"

train_df_info = load_protocol(data_info_file_path, names=['speaker', 'file_name', 'attack', 'label'])

train_df_info.head()

Unnamed: 0,speaker,file_name,attack,label
0,LA_0079,LA_T_1138215,-,bonafide
1,LA_0079,LA_T_1271820,-,bonafide
2,LA_0079,LA_T_1272637,-,bonafide
3,LA_0079,LA_T_1276960,-,bonafide
4,LA_0079,LA_T_1341447,-,bonafide


In [46]:
def process_data_frame(unprocessed_df):
    mapping = {
      'bonafide': 0,
      'spoof': 1
    }
    df_processed = unprocessed_df[['file_name', 'label']]
    df_processed['label'] = df_processed['label'].map(mapping)

    print(df_processed.head())
    return df_processed

train_df_info = process_data_frame(train_df_info)

      file_name  label
0  LA_T_1138215      0
1  LA_T_1271820      0
2  LA_T_1272637      0
3  LA_T_1276960      0
4  LA_T_1341447      0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed['label'] = df_processed['label'].map(mapping)


In [52]:
def standardize_spectrogram(spectrogram, fixed_time_steps=192):
    """
    Truncate or pad the spectrogram to a fixed time dimension.
    Args:
        spectrogram (np.ndarray): Input spectrogram of shape (n_mels, time_steps).
        fixed_time_steps (int): Desired time dimension.
    Returns:
        np.ndarray: Spectrogram with shape (n_mels, fixed_time_steps).
    """
    n_mels, time_steps = spectrogram.shape

    # If the spectrogram is longer than fixed_time_steps, truncate it
    if time_steps > fixed_time_steps:
        spectrogram = spectrogram[:, :fixed_time_steps]
    # If the spectrogram is shorter, pad it with zeros
    elif time_steps < fixed_time_steps:
        pad_width = ((0, 0), (0, fixed_time_steps - time_steps))
        spectrogram = np.pad(spectrogram, pad_width, mode='constant')

    return spectrogram

In [51]:
from scipy.ndimage import zoom

def resize_spectrogram(spectrogram, fixed_time_steps=192):
    """
    Resize the spectrogram to a fixed time dimension using interpolation.
    Args:
        spectrogram (np.ndarray): Input spectrogram of shape (n_mels, time_steps).
        fixed_time_steps (int): Desired time dimension.
    Returns:
        np.ndarray: Resized spectrogram with shape (n_mels, fixed_time_steps).
    """
    n_mels, time_steps = spectrogram.shape

    # Calculate the zoom factor for the time dimension
    zoom_factor = (1, fixed_time_steps / time_steps)

    # Resize the spectrogram using interpolation
    resized_spectrogram = zoom(spectrogram, zoom_factor)

    return resized_spectrogram

In [59]:
from torch.utils.data import Dataset, DataLoader
import os

class ASVSpoofDataset(Dataset):
    def __init__(self, audio_dir, data_info_df, fixed_time_steps=192):
        """
        Args:
            audio_dir (str): Directory with all the audio files.
        """
        self.audio_dir = audio_dir
        self.audio_files = os.listdir(audio_dir)
        self.data_info_df = data_info_df
        self.fixed_time_steps = fixed_time_steps

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        current_audio_path = os.path.join(self.audio_dir, self.audio_files[idx])
        spectrogram = audio_to_mel_spectrogram(current_audio_path)
        
        # Resize the spectrogram
        mel_spectrogram = resize_spectrogram(spectrogram, self.fixed_time_steps)

        # Convert to tensor and add channel dimension
        mel_spectrogram = torch.tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)

        # Get label (0 for real, 1 for spoofed)
        label = self.data_info_df.iloc[idx]['label']

        return mel_spectrogram, label

In [26]:
BATCH_SIZE = 32

In [60]:
# Example usage
train_dir = "/Users/roeeseren/Documents/semester-seven/deep-learning/project/asvspoof/LA/LA/ASVspoof2019_LA_train/flac"
train_dataset = ASVSpoofDataset(train_dir, train_df_info)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [61]:
train_dataset[0]

(tensor([[[0.4005, 0.4670, 0.4422,  ..., 0.3595, 0.4799, 0.5227],
          [0.4059, 0.5477, 0.5402,  ..., 0.3714, 0.5183, 0.5404],
          [0.3663, 0.5290, 0.5378,  ..., 0.3381, 0.4478, 0.5149],
          ...,
          [0.0071, 0.0898, 0.0835,  ..., 0.1652, 0.1071, 0.0807],
          [0.0218, 0.0647, 0.0583,  ..., 0.1173, 0.1187, 0.0871],
          [0.0113, 0.0274, 0.0469,  ..., 0.0916, 0.0492, 0.0439]]]),
 np.int64(0))

In [49]:
import torchvision.models as models
from torchvision.models import ResNet18_Weights

# Load ResNet18 with pretrained weights
resnet18 = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)

# Modify the first layer to accept 1-channel input
resnet18.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

# Modify the final layer for binary classification
resnet18.fc = nn.Linear(resnet18.fc.in_features, 1)

# Add a sigmoid activation for binary output
resnet18 = nn.Sequential(resnet18, nn.Sigmoid())

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet18 = resnet18.to(device)

In [63]:
import torch.optim as optim

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(resnet18.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # Number of epochs
    resnet18.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device).float()

        # Forward pass
        outputs = resnet18(inputs).squeeze()
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Calculate accuracy
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        running_loss += loss.item()

    # Print epoch statistics
    epoch_loss = running_loss / len(train_dataloader)
    epoch_accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/10], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

Epoch [1/10], Loss: 0.3332, Accuracy: 89.81%
Epoch [2/10], Loss: 0.3335, Accuracy: 89.81%



KeyboardInterrupt



In [None]:
eval_info_file_path = f"{data_path}/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt"
eval_df_info = load_protocol(data_info_file_path, names=['speaker', 'file_name', 'attack', 'label'])

print(eval_df_info.head())
eval_df_info = process_data_frame(eval_df_info)