In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found,
# in the same order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multilingual-indian-speech-data/metadata/.kagglekeep
/kaggle/input/multilingual-indian-speech-data/metadata/train.csv
/kaggle/input/multilingual-indian-speech-data/metadata/test.csv
/kaggle/input/multilingual-indian-speech-data/audio/BRX_F_NAMES_01687.wav
/kaggle/input/multilingual-indian-speech-data/audio/MAR_F_SURPRISE_00047.wav
/kaggle/input/multilingual-indian-speech-data/audio/train_gujaratimale_02656.wav
/kaggle/input/multilingual-indian-speech-data/audio/hi_m_general_02964.wav
/kaggle/input/multilingual-indian-speech-data/audio/KAN_M_UMANG_00037.wav
/kaggle/input/multilingual-indian-speech-data/audio/te_f_education_02449.wav
/kaggle/input/multilingual-indian-speech-data/audio/train_gujaratimale_02870.wav
/kaggle/input/multilingual-indian-speech-data/audio/BRX_F_FEAR_00259.wav
/kaggle/input/multilingual-indian-speech-data/audio/en_f_EN_O_449.wav
/kaggle/input/multilingual-indian-speech-data/audio/hi_m_books_03304.wav
/kaggle/input/multilingual-indian-speech-data/aud

In [None]:
import os, glob, random
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
import timm
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [None]:
# Read the train and test metadata tables (one row per audio clip).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/multilingual-indian-speech-data/metadata/train.csv',
        '/kaggle/input/multilingual-indian-speech-data/metadata/test.csv',
    )
)

In [None]:
# Derive each clip's .wav location from its id; the same rule applies to both tables.
for frame in (train_df, test_df):
    frame['audio_path'] = '/kaggle/input/multilingual-indian-speech-data/audio/' + frame['id'] + '.wav'

In [None]:
# Preview the identifier, language, label and derived path columns of the training metadata.
train_df.loc[:, ['id', 'language', 'is_tts', 'audio_path']].head()

Unnamed: 0,id,language,is_tts,audio_path
0,ASM_F_ANGER_00342,Assamese,1,/kaggle/input/multilingual-indian-speech-data/...
1,ASM_F_ANGER_00077,Assamese,0,/kaggle/input/multilingual-indian-speech-data/...
2,ASM_F_ANGER_00216,Assamese,0,/kaggle/input/multilingual-indian-speech-data/...
3,ASM_F_ANGER_00069,Assamese,1,/kaggle/input/multilingual-indian-speech-data/...
4,ASM_F_ANGER_00445,Assamese,0,/kaggle/input/multilingual-indian-speech-data/...


In [None]:
# Preview the first rows of the test metadata, including the derived audio_path column.
test_df.head(n=5)

Unnamed: 0,text,id,language,is_tts,audio_path
0,মই যিমান পাৰো চেষ্টা কৰি আছোঁ আৰু আপোনাৰ পদ্ধত...,ASM_F_ANGER_00109,Assamese,-1,/kaggle/input/multilingual-indian-speech-data/...
1,শুনকচোন ছাৰ! মই ইয়াত দহ মিনিট ধৰি বাট চাই আছো...,ASM_F_ANGER_00127,Assamese,-1,/kaggle/input/multilingual-indian-speech-data/...
2,মই আপোনালোকৰ পৰা বহুত লেপটপ কিনোঁ। মই এয়া অন্...,ASM_F_ANGER_00386,Assamese,-1,/kaggle/input/multilingual-indian-speech-data/...
3,আমি আপোনাৰ বৰ্তমানৰ উপাৰ্জনৰ স্থিতি গ্ৰহণযোগ্য...,ASM_F_ANGER_00103,Assamese,-1,/kaggle/input/multilingual-indian-speech-data/...
4,সি ইমান বেছি চিঞৰ বাখৰ কৰিছিল যে মই দিনটোত কৰি...,ASM_F_ANGER_00434,Assamese,-1,/kaggle/input/multilingual-indian-speech-data/...


In [None]:
class FakeVoiceDataset(Dataset):
    """Labelled audio clips served as 3-channel log-mel spectrogram tensors.

    Each item is loaded with librosa, optionally padded/truncated to a fixed
    duration, converted to a dB-scaled mel spectrogram, min-max scaled to
    [0, 1] and replicated across three channels.
    """

    def __init__(self, file_paths, labels, sr=16000, n_mels=128, duration=None, transform=None):
        """
        Args:
            file_paths (list): List of audio file paths.
            labels (list): Corresponding labels (e.g., 0: real, 1: fake).
            sr (int): Target sample rate.
            n_mels (int): Number of Mel bands.
            duration (int or None): If set, audio is padded/truncated to this many seconds.
                                    If None, the entire audio file is used.
            transform: Optional transform to be applied on the Mel spectrogram.

        Raises:
            ValueError: If file_paths and labels do not have the same length.
        """
        if len(file_paths) != len(labels):
            raise ValueError(
                f"file_paths and labels must have the same length, "
                f"got {len(file_paths)} and {len(labels)}"
            )

        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.n_mels = n_mels
        self.duration = duration
        self.transform = transform

        # Number of waveform samples every clip is forced to, or None to keep full length.
        self.samples = int(sr * duration) if duration is not None else None

    def __len__(self):
        """Return the number of audio clips."""
        return len(self.file_paths)

    @staticmethod
    def _min_max_scale(spec):
        """Scale spec to [0, 1]; a constant spectrogram maps to all zeros instead of NaN."""
        low = spec.min()
        span = spec.max() - low
        if not span > 0:
            # A silent or otherwise constant clip has max == min; dividing by that
            # zero range would turn the whole tensor into NaN.
            return np.zeros_like(spec)
        return (spec - low) / span

    def __getitem__(self, idx):
        """Return (mel_spec, label) for the clip at position idx.

        mel_spec is a float tensor of shape (3, n_mels, time) before the optional
        transform is applied; label is a scalar long tensor.
        """
        file_path = self.file_paths[idx]
        y, _ = librosa.load(file_path, sr=self.sr)

        # Pad with trailing zeros or truncate when a fixed duration was requested.
        if self.samples is not None:
            if len(y) < self.samples:
                y = np.pad(y, (0, self.samples - len(y)), mode='constant')
            else:
                y = y[:self.samples]

        mel_spec = librosa.feature.melspectrogram(y=y, sr=self.sr, n_mels=self.n_mels)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        mel_spec = self._min_max_scale(mel_spec)

        # Replicate the single-channel spectrogram into three identical channels.
        mel_spec = np.stack([mel_spec, mel_spec, mel_spec], axis=0)

        mel_spec = torch.tensor(mel_spec, dtype=torch.float)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        if self.transform:
            mel_spec = self.transform(mel_spec)

        return mel_spec, label

In [None]:
# Plain Python lists of clip paths and their is_tts labels, aligned by row order.
file_paths = train_df['audio_path'].to_list()
labels = train_df['is_tts'].to_list()

In [None]:
# Resize every spectrogram tensor to a fixed 224x224 grid.
resize_transform = transforms.Compose([transforms.Resize(size=(224, 224))])

In [None]:
# duration=None keeps each clip at full length; the resize transform then brings
# every spectrogram to the same spatial size.
dataset = FakeVoiceDataset(
    file_paths=file_paths,
    labels=labels,
    sr=16000,
    n_mels=128,
    duration=None,
    transform=resize_transform,
)

In [None]:
# Hold out 10% of the labelled clips for validation.
num_samples = len(dataset)
num_train = int(0.9 * num_samples)
num_val = num_samples - num_train

# Seed the split so the train/validation partition (and therefore the reported
# validation AUC) is the same on every Restart & Run All.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(dataset, [num_train, num_val], generator=split_generator)

batch_size = 32
# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [None]:
# Pretrained ViT-Base (16x16 patches, 224x224 input) with a fresh 2-class head.
model = timm.create_model(
    'vit_base_patch16_224',
    pretrained=True,
    num_classes=2,
)

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model.to(device)

# With two or more GPUs, replicate the model so each batch is split across them.
if torch.cuda.device_count() >= 2:
    model = nn.DataParallel(model)


In [None]:
# Two-class cross-entropy on the raw logits, optimised with AdamW.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)

In [None]:
num_epochs = 15

for epoch in range(num_epochs):
    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    running_loss = 0.0
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training")
    for batch_x, batch_y in train_bar:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

        # The criterion returns a per-batch mean, so weight it by the batch size
        # to accumulate a sum over samples.
        running_loss += batch_loss.item() * batch_x.size(0)
    epoch_loss = running_loss / num_train

    # ---- Validation pass: collect class-1 probabilities for ROC-AUC ----
    model.eval()
    val_labels, val_scores = [], []
    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation")
        for batch_x, batch_y in val_bar:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            logits = model(batch_x)
            tts_prob = torch.softmax(logits, dim=1)[:, 1]
            val_labels.extend(batch_y.cpu().numpy())
            val_scores.extend(tts_prob.cpu().numpy())
    auc = roc_auc_score(val_labels, val_scores)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Val AUC-ROC: {auc:.4f}")

Epoch 1/15 - Training: 100%|██████████| 875/875 [19:40<00:00,  1.35s/it]
Epoch 1/15 - Validation: 100%|██████████| 98/98 [01:29<00:00,  1.10it/s]


Epoch 1/15, Loss: 0.2281, Val AUC-ROC: 0.9995


Epoch 2/15 - Training: 100%|██████████| 875/875 [15:30<00:00,  1.06s/it]
Epoch 2/15 - Validation: 100%|██████████| 98/98 [01:01<00:00,  1.60it/s]


Epoch 2/15, Loss: 0.0283, Val AUC-ROC: 0.9997


Epoch 3/15 - Training: 100%|██████████| 875/875 [15:23<00:00,  1.06s/it]
Epoch 3/15 - Validation: 100%|██████████| 98/98 [00:59<00:00,  1.64it/s]


Epoch 3/15, Loss: 0.0285, Val AUC-ROC: 0.9999


Epoch 4/15 - Training: 100%|██████████| 875/875 [15:18<00:00,  1.05s/it]
Epoch 4/15 - Validation: 100%|██████████| 98/98 [00:59<00:00,  1.66it/s]


Epoch 4/15, Loss: 0.0193, Val AUC-ROC: 1.0000


Epoch 5/15 - Training: 100%|██████████| 875/875 [15:33<00:00,  1.07s/it]
Epoch 5/15 - Validation: 100%|██████████| 98/98 [01:00<00:00,  1.61it/s]


Epoch 5/15, Loss: 0.0157, Val AUC-ROC: 1.0000


Epoch 6/15 - Training: 100%|██████████| 875/875 [15:17<00:00,  1.05s/it]
Epoch 6/15 - Validation: 100%|██████████| 98/98 [00:58<00:00,  1.68it/s]


Epoch 6/15, Loss: 0.0160, Val AUC-ROC: 0.9999


Epoch 7/15 - Training: 100%|██████████| 875/875 [15:10<00:00,  1.04s/it]
Epoch 7/15 - Validation: 100%|██████████| 98/98 [00:59<00:00,  1.65it/s]


Epoch 7/15, Loss: 0.0119, Val AUC-ROC: 1.0000


Epoch 8/15 - Training: 100%|██████████| 875/875 [15:18<00:00,  1.05s/it]
Epoch 8/15 - Validation: 100%|██████████| 98/98 [00:58<00:00,  1.67it/s]


Epoch 8/15, Loss: 0.0112, Val AUC-ROC: 0.9996


Epoch 9/15 - Training: 100%|██████████| 875/875 [15:15<00:00,  1.05s/it]
Epoch 9/15 - Validation: 100%|██████████| 98/98 [00:56<00:00,  1.73it/s]


Epoch 9/15, Loss: 0.0165, Val AUC-ROC: 1.0000


Epoch 10/15 - Training: 100%|██████████| 875/875 [15:01<00:00,  1.03s/it]
Epoch 10/15 - Validation: 100%|██████████| 98/98 [00:58<00:00,  1.67it/s]


Epoch 10/15, Loss: 0.0096, Val AUC-ROC: 1.0000


Epoch 11/15 - Training: 100%|██████████| 875/875 [15:15<00:00,  1.05s/it]
Epoch 11/15 - Validation: 100%|██████████| 98/98 [00:59<00:00,  1.65it/s]


Epoch 11/15, Loss: 0.0054, Val AUC-ROC: 0.9999


Epoch 12/15 - Training: 100%|██████████| 875/875 [15:22<00:00,  1.05s/it]
Epoch 12/15 - Validation: 100%|██████████| 98/98 [00:58<00:00,  1.67it/s]


Epoch 12/15, Loss: 0.0100, Val AUC-ROC: 1.0000


Epoch 13/15 - Training: 100%|██████████| 875/875 [15:21<00:00,  1.05s/it]
Epoch 13/15 - Validation: 100%|██████████| 98/98 [01:00<00:00,  1.63it/s]


Epoch 13/15, Loss: 0.0105, Val AUC-ROC: 0.9995


Epoch 14/15 - Training: 100%|██████████| 875/875 [15:26<00:00,  1.06s/it]
Epoch 14/15 - Validation: 100%|██████████| 98/98 [01:00<00:00,  1.62it/s]


Epoch 14/15, Loss: 0.0083, Val AUC-ROC: 1.0000


Epoch 15/15 - Training: 100%|██████████| 875/875 [15:25<00:00,  1.06s/it]
Epoch 15/15 - Validation: 100%|██████████| 98/98 [01:00<00:00,  1.63it/s]

Epoch 15/15, Loss: 0.0058, Val AUC-ROC: 1.0000





In [None]:
class TestVoiceDataset(Dataset):
    """Unlabelled audio clips served as 3-channel log-mel spectrogram tensors.

    Each item is loaded with librosa, optionally padded/truncated to a fixed
    duration, converted to a dB-scaled mel spectrogram, min-max scaled to
    [0, 1] and replicated across three channels. Items are returned together
    with the clip id instead of a label.
    """

    def __init__(self, df, sr=16000, n_mels=128, duration=3, transform=None):
        """
        Args:
            df (pd.DataFrame): DataFrame containing at least 'id' and 'audio_path' columns.
            sr (int): Sample rate.
            n_mels (int): Number of mel bands.
            duration (int or None): Fixed duration (in seconds) to pad/truncate audio. Set to None to use full audio.
            transform: Optional transform to apply on the mel spectrogram.
        """
        self.df = df
        self.sr = sr
        self.n_mels = n_mels
        self.duration = duration
        self.transform = transform
        # Number of waveform samples every clip is forced to, or None to keep full length.
        self.samples = int(sr * duration) if duration is not None else None

    def __len__(self):
        """Return the number of rows (clips) in the DataFrame."""
        return len(self.df)

    @staticmethod
    def _min_max_scale(spec):
        """Scale spec to [0, 1]; a constant spectrogram maps to all zeros instead of NaN."""
        low = spec.min()
        span = spec.max() - low
        if not span > 0:
            # A silent or otherwise constant clip has max == min; dividing by that
            # zero range would turn the whole tensor into NaN.
            return np.zeros_like(spec)
        return (spec - low) / span

    def __getitem__(self, idx):
        """Return (mel_spec, sample_id) for the row at position idx.

        mel_spec is a float tensor of shape (3, n_mels, time) before the optional
        transform is applied; sample_id is the row's 'id' value.
        """
        row = self.df.iloc[idx]
        audio_path = row['audio_path']
        sample_id = row['id']

        # Load audio
        y, _ = librosa.load(audio_path, sr=self.sr)

        # Pad or truncate audio if a fixed duration is specified
        if self.samples is not None:
            if len(y) < self.samples:
                y = np.pad(y, (0, self.samples - len(y)), mode='constant')
            else:
                y = y[:self.samples]

        # Compute Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=self.sr, n_mels=self.n_mels)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to [0, 1] (all zeros when the spectrogram is constant)
        mel_spec = self._min_max_scale(mel_spec)

        # Stack to create 3 channels
        mel_spec = np.stack([mel_spec, mel_spec, mel_spec], axis=0)  # Shape: (3, n_mels, time)
        mel_spec = torch.tensor(mel_spec, dtype=torch.float)

        if self.transform:
            mel_spec = self.transform(mel_spec)

        return mel_spec, sample_id


In [None]:
# Save the trained weights. When the model is wrapped in nn.DataParallel, save the
# wrapped module's state_dict so the keys carry no "module." prefix and the
# checkpoint loads directly into a plain timm model.
model_to_save = model.module if isinstance(model, nn.DataParallel) else model
torch.save(model_to_save.state_dict(), "fake_voice_vit.pth")

# TestVoiceDataset is defined in the preceding cell; this cell previously repeated
# that definition verbatim, which silently shadowed it.

# duration=None matches the training dataset: full-length clips, resized to 224x224.
test_dataset = TestVoiceDataset(test_df, sr=16000, n_mels=128, duration=None, transform=resize_transform)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0)

all_ids = []
all_preds = []

# Put the model in eval mode explicitly so this cell does not depend on the mode
# the training cell happened to leave it in.
model.eval()
with torch.no_grad():
    for inputs, ids in tqdm(test_loader, desc="Inference"):
        inputs = inputs.to(device)
        outputs = model(inputs)
        # For binary classification, we take the probability of class 1 (is_tts)
        probs = torch.softmax(outputs, dim=1)[:, 1]
        all_preds.extend(probs.cpu().numpy())
        all_ids.extend(ids)

# ------------------------------------------
# Create Submission DataFrame and Save to CSV
# ------------------------------------------
submission_df = pd.DataFrame({'id': all_ids, 'is_tts': all_preds})
# Rows are written sorted by id.
submission_df = submission_df.sort_values('id')
submission_df.to_csv("submission.csv", index=False)

print("Inference complete and submission.csv saved!")

Inference: 100%|██████████| 165/165 [01:12<00:00,  2.27it/s]

Inference complete and submission.csv saved!



