In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

!pip install -q torch torchaudio pandas numpy tqdm scikit-learn

import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/indic-tts-deepfake-challenge/sample.csv


In [2]:
# Central run configuration; every tunable used by later cells lives here.
CONFIG = {
    'DATASET_NAME': "SherryT997/IndicTTS-Deepfake-Challenge-Data",
    'BATCH_SIZE': 64,  # Larger batch size since we're using CNN
    'NUM_EPOCHS': 10,
    'LEARNING_RATE': 3e-4,
    'N_MELS': 128,
    'MAX_LENGTH': 160000,  # samples kept per clip (pad/trim target)
    'SEED': 42,  # seed for torch / numpy RNGs so runs are repeatable
}

# Seed the RNGs early so weight initialisation and shuffling are reproducible
# under Restart Kernel -> Run All.
torch.manual_seed(CONFIG['SEED'])
np.random.seed(CONFIG['SEED'])

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


In [3]:
class SpectrogramCNN(nn.Module):
    """CNN binary classifier operating on log-mel spectrograms of raw audio.

    The waveform is converted to a mel spectrogram and then to a dB scale
    inside ``forward`` (without gradient tracking), passed through four
    Conv-BatchNorm-ReLU-MaxPool-Dropout blocks, pooled to a fixed 4x4 grid and
    classified by a small MLP ending in a sigmoid.
    """

    def __init__(self):
        super().__init__()

        # Mel Spectrogram transformer
        self.mel_spec = T.MelSpectrogram(
            sample_rate=16000,
            n_fft=2048,
            hop_length=512,
            n_mels=CONFIG['N_MELS']
        )
        # Built once here instead of on every forward pass. AmplitudeToDB is
        # expected to hold no parameters or buffers, so checkpoints saved
        # before this change should still load (see torchaudio docs).
        self.amplitude_to_db = T.AmplitudeToDB()

        # CNN layers (layer order is kept fixed so state_dict keys stay stable)
        self.features = nn.Sequential(
            # First block
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.2),

            # Second block
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3),

            # Third block
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.4),

            # Fourth block
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.5),
        )

        # Adaptive pooling to handle variable length inputs
        self.adaptive_pool = nn.AdaptiveAvgPool2d((4, 4))

        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(256 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return sigmoid scores for a batch of waveforms.

        Args:
            x: Batch of waveforms; assumed to be ``(batch, samples)`` so that
                the channel axis added below yields the 4-D input the first
                Conv2d expects.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in [0, 1].
        """
        # Feature extraction is fixed (not learned), so skip gradient tracking.
        with torch.no_grad():
            # Convert to mel spectrogram
            x = self.mel_spec(x)
            # Convert to dB scale
            x = self.amplitude_to_db(x)
            # Add channel dimension
            x = x.unsqueeze(1)
        # CNN feature extraction
        x = self.features(x)
        # Adaptive pooling
        x = self.adaptive_pool(x)
        # Flatten everything except the batch axis
        x = torch.flatten(x, 1)
        # Classification
        x = self.classifier(x)
        return x

In [4]:
class AudioDataset(Dataset):
    """Map-style dataset yielding fixed-length, peak-normalised mono waveforms.

    Each item is a dict with keys ``'input'`` (1-D float tensor of length
    ``max_length``), ``'label'`` (float; ``-1.0`` when the source label is
    ``-1``) and ``'id'`` (passed through from the underlying record).

    Args:
        split: Name of the dataset split to load.
        max_length: Number of samples every clip is padded/trimmed to.
            Defaults to ``CONFIG['MAX_LENGTH']``.
        target_sr: Sampling rate clips are resampled to when they differ.
    """

    def __init__(self, split="train", max_length=None, target_sr=16000):
        # Non-streaming load so the dataset supports len() and random access.
        self.dataset = load_dataset(CONFIG['DATASET_NAME'], split=split)
        self.max_length = CONFIG['MAX_LENGTH'] if max_length is None else max_length
        self.target_sr = target_sr
        # Cache of resamplers keyed by source rate; building one per item
        # would recompute the resampling kernel every time.
        self._resamplers = {}
        print(f"Loading {split} dataset with {len(self.dataset)} samples...")

    def __len__(self):
        """Return the number of records in the loaded split."""
        return len(self.dataset)

    def _get_resampler(self, source_sr):
        """Return a cached ``T.Resample`` from ``source_sr`` to the target rate."""
        resampler = self._resamplers.get(source_sr)
        if resampler is None:
            resampler = T.Resample(source_sr, self.target_sr)
            self._resamplers[source_sr] = resampler
        return resampler

    def __getitem__(self, idx):
        """Load, mono-mix, resample, normalise and pad/trim one record."""
        item = self.dataset[idx]

        # Load and process audio
        audio = torch.from_numpy(item['audio']['array']).float()

        # Convert to mono if stereo (averages over the first axis)
        if audio.dim() > 1:
            audio = audio.mean(dim=0)

        # Resample if needed
        source_sr = item['audio']['sampling_rate']
        if source_sr != self.target_sr:
            audio = self._get_resampler(source_sr)(audio)

        # Peak-normalise; skip empty clips because max() of an empty tensor raises.
        if audio.numel() > 0:
            audio = audio / (audio.abs().max() + 1e-8)

        # Pad/trim to a fixed length so samples can be batched
        num_samples = audio.shape[0]
        if num_samples < self.max_length:
            audio = torch.nn.functional.pad(audio, (0, self.max_length - num_samples))
        else:
            audio = audio[:self.max_length]

        return {
            'input': audio,
            # float(-1) is already -1.0, so the -1 sentinel needs no special case.
            'label': float(item['is_tts']),
            'id': item['id']
        }

In [5]:
# Materialise both splits, train first and then test, matching the original load order.
train_dataset, test_dataset = [AudioDataset(split=split_name) for split_name in ("train", "test")]

README.md:   0%|          | 0.00/2.81k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/35 [00:00<?, ?files/s]

train-00000-of-00035.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00001-of-00035.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

train-00002-of-00035.parquet:   0%|          | 0.00/464M [00:00<?, ?B/s]

train-00003-of-00035.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

train-00004-of-00035.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

train-00005-of-00035.parquet:   0%|          | 0.00/475M [00:00<?, ?B/s]

train-00006-of-00035.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00007-of-00035.parquet:   0%|          | 0.00/516M [00:00<?, ?B/s]

train-00008-of-00035.parquet:   0%|          | 0.00/557M [00:00<?, ?B/s]

train-00009-of-00035.parquet:   0%|          | 0.00/521M [00:00<?, ?B/s]

train-00010-of-00035.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00011-of-00035.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

train-00012-of-00035.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00013-of-00035.parquet:   0%|          | 0.00/473M [00:00<?, ?B/s]

train-00014-of-00035.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

train-00015-of-00035.parquet:   0%|          | 0.00/467M [00:00<?, ?B/s]

train-00016-of-00035.parquet:   0%|          | 0.00/532M [00:00<?, ?B/s]

train-00017-of-00035.parquet:   0%|          | 0.00/510M [00:00<?, ?B/s]

train-00018-of-00035.parquet:   0%|          | 0.00/471M [00:00<?, ?B/s]

train-00019-of-00035.parquet:   0%|          | 0.00/501M [00:00<?, ?B/s]

train-00020-of-00035.parquet:   0%|          | 0.00/559M [00:00<?, ?B/s]

train-00021-of-00035.parquet:   0%|          | 0.00/541M [00:00<?, ?B/s]

train-00022-of-00035.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00023-of-00035.parquet:   0%|          | 0.00/599M [00:00<?, ?B/s]

train-00024-of-00035.parquet:   0%|          | 0.00/576M [00:00<?, ?B/s]

train-00025-of-00035.parquet:   0%|          | 0.00/547M [00:00<?, ?B/s]

train-00026-of-00035.parquet:   0%|          | 0.00/537M [00:00<?, ?B/s]

train-00027-of-00035.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

train-00028-of-00035.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00029-of-00035.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00030-of-00035.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00031-of-00035.parquet:   0%|          | 0.00/688M [00:00<?, ?B/s]

train-00032-of-00035.parquet:   0%|          | 0.00/613M [00:00<?, ?B/s]

train-00033-of-00035.parquet:   0%|          | 0.00/309M [00:00<?, ?B/s]

train-00034-of-00035.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

test-00000-of-00004.parquet:   0%|          | 0.00/356M [00:00<?, ?B/s]

test-00001-of-00004.parquet:   0%|          | 0.00/364M [00:00<?, ?B/s]

test-00002-of-00004.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

test-00003-of-00004.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31102 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2635 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/35 [00:00<?, ?it/s]

Loading train dataset with 31102 samples...


Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Loading test dataset with 2635 samples...


In [6]:
# Create a reproducible 80/20 train/val split.
# An explicit seeded generator keeps the split identical across kernel restarts,
# so a checkpoint saved in one run is never validated on samples it trained on
# in another.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
split_generator = torch.Generator().manual_seed(CONFIG.get('SEED', 42))
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size], generator=split_generator
)

print(f"Total training samples: {len(train_subset)}")
print(f"Total validation samples: {len(val_subset)}")
print(f"Total test samples: {len(test_dataset)}")

Total training samples: 24881
Total validation samples: 6221
Total test samples: 2635


In [7]:
# Options shared by every loader; only the training loader shuffles.
_loader_options = dict(batch_size=CONFIG['BATCH_SIZE'], num_workers=2)

train_loader = DataLoader(train_subset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_subset, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

In [8]:
# Build the network on the selected device, then its loss and optimiser.
model = SpectrogramCNN()
model = model.to(DEVICE)

# The model already ends in a sigmoid, hence plain BCE on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=CONFIG['LEARNING_RATE'])

In [9]:
# Train for NUM_EPOCHS epochs; after each epoch evaluate on the validation
# subset and checkpoint the weights whenever validation AUC improves.
best_val_auc = 0
for epoch in range(CONFIG['NUM_EPOCHS']):
    print(f"\nEpoch {epoch+1}/{CONFIG['NUM_EPOCHS']}")

    # Training
    model.train()
    train_loss = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(train_loader, desc="Training"):
        inputs = batch['input'].to(DEVICE)
        labels = batch['label'].float().to(DEVICE)

        optimizer.zero_grad()
        # squeeze(1) drops only the trailing output axis. A bare squeeze()
        # would also collapse a final batch of size 1 to a 0-d tensor, which
        # breaks the loss shape check and list.extend below.
        outputs = model(inputs).squeeze(1)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        all_preds.extend(outputs.detach().cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Note: training AUC is computed from predictions made in train mode
    # (dropout active, batch statistics), accumulated over the epoch.
    train_auc = roc_auc_score(all_labels, all_preds)

    # Validation
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            inputs = batch['input'].to(DEVICE)
            labels = batch['label'].float().to(DEVICE)

            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            all_preds.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_auc = roc_auc_score(all_labels, all_preds)

    # Losses are averaged over batches, not samples.
    print(f"Train Loss: {train_loss/len(train_loader):.4f}, Train AUC: {train_auc:.4f}")
    print(f"Val Loss: {val_loss/len(val_loader):.4f}, Val AUC: {val_auc:.4f}")

    # Keep only the weights with the best validation AUC seen so far.
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Saved new best model with Val AUC: {val_auc:.4f}")


Epoch 1/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.3358, Train AUC: 0.9255
Val Loss: 0.5905, Val AUC: 0.9919
Saved new best model with Val AUC: 0.9919

Epoch 2/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0740, Train AUC: 0.9967
Val Loss: 0.9533, Val AUC: 0.9932
Saved new best model with Val AUC: 0.9932

Epoch 3/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0407, Train AUC: 0.9989
Val Loss: 0.6982, Val AUC: 0.9974
Saved new best model with Val AUC: 0.9974

Epoch 4/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0265, Train AUC: 0.9996
Val Loss: 2.2162, Val AUC: 0.9956

Epoch 5/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0189, Train AUC: 0.9997
Val Loss: 1.3740, Val AUC: 0.9991
Saved new best model with Val AUC: 0.9991

Epoch 6/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0181, Train AUC: 0.9998
Val Loss: 0.3182, Val AUC: 0.9998
Saved new best model with Val AUC: 0.9998

Epoch 7/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0140, Train AUC: 0.9999
Val Loss: 0.8021, Val AUC: 0.9982

Epoch 8/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0146, Train AUC: 0.9999
Val Loss: 1.4722, Val AUC: 0.9993

Epoch 9/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0128, Train AUC: 0.9999
Val Loss: 0.0584, Val AUC: 1.0000
Saved new best model with Val AUC: 1.0000

Epoch 10/10


Training:   0%|          | 0/389 [00:00<?, ?it/s]

Validating:   0%|          | 0/98 [00:00<?, ?it/s]

Train Loss: 0.0142, Train AUC: 0.9998
Val Loss: 0.3184, Val AUC: 1.0000


In [10]:
# Restore the best checkpoint and score the test set.
# map_location lets the checkpoint load even when the current device differs
# from the one it was saved on (e.g. saved on GPU, reloaded on CPU).
model.load_state_dict(torch.load('best_model.pth', map_location=DEVICE))
model.eval()

predictions = []
ids = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating predictions"):
        inputs = batch['input'].to(DEVICE)
        # squeeze(1) keeps the batch axis even when the last batch holds one sample.
        outputs = model(inputs).squeeze(1)
        predictions.extend(outputs.cpu().numpy())

        # Default collation turns numeric ids into a tensor and leaves string
        # ids as a list; convert tensors to plain Python values so the
        # submission does not end up holding tensor objects.
        batch_ids = batch['id']
        if torch.is_tensor(batch_ids):
            batch_ids = batch_ids.tolist()
        ids.extend(batch_ids)

Generating predictions:   0%|          | 0/42 [00:00<?, ?it/s]

In [11]:
# Assemble the id/score pairs and write them out as the competition submission file.
submission_columns = {'id': ids, 'is_tts': predictions}
submission_df = pd.DataFrame(data=submission_columns)
submission_df.to_csv(path_or_buf='submission.csv', index=False)
print("Done! Submission file created.")

Done! Submission file created.
