In [1]:
import os

print(os.listdir("/kaggle/input"))

['depressionintamil']


In [2]:
import os

def print_tree(start_path, indent=""):
    for item in sorted(os.listdir(start_path)):
        path = os.path.join(start_path, item)
        print(indent + "|-- " + item)
        if os.path.isdir(path):
            print_tree(path, indent + "    ")

DATASET_PATH = "/kaggle/input/depressionintamil"  # change if needed
#print_tree(DATASET_PATH)


In [3]:
BASE_PATH = "/kaggle/input/depressionintamil"
import os
from glob import glob

TAMIL_TRAIN_PATH = os.path.join(BASE_PATH, "Tamil")
TEST_PATH = os.path.join(BASE_PATH, "Test-set-tamil", "Test-set-tamil")


In [4]:
train_files = []
train_labels = []

# Depressed
depressed_files = glob(
    os.path.join(TAMIL_TRAIN_PATH, "Depressed", "Train_set", "*.wav")
)
train_files.extend(depressed_files)
train_labels.extend([1] * len(depressed_files))

# Non-depressed
non_depressed_files = glob(
    os.path.join(TAMIL_TRAIN_PATH, "Non-depressed", "Train_set", "*.wav")
)
train_files.extend(non_depressed_files)
train_labels.extend([0] * len(non_depressed_files))

print("Training samples:", len(train_files))
print("Depressed samples:", len(depressed_files))
print("Non-depressed samples:", len(non_depressed_files))


Training samples: 1374
Depressed samples: 454
Non-depressed samples: 920


In [5]:
test_files = sorted(glob(os.path.join(TEST_PATH, "*.wav")))

print("Test samples:", len(test_files))
print("Example test file:", test_files[0])
print("\nSample training files:")
for i in range(3):
    print(train_files[i], "→ label:", train_labels[i])


Test samples: 160
Example test file: /kaggle/input/depressionintamil/Test-set-tamil/Test-set-tamil/t1.wav

Sample training files:
/kaggle/input/depressionintamil/Tamil/Depressed/Train_set/D_S00_22-2.wav → label: 1
/kaggle/input/depressionintamil/Tamil/Depressed/Train_set/D_S00_10-4.wav → label: 1
/kaggle/input/depressionintamil/Tamil/Depressed/Train_set/D_S00_38-2.wav → label: 1


In [6]:
import pandas as pd

train_df = pd.DataFrame({
    "path": train_files,
    "label": train_labels
})

test_df = pd.DataFrame({
    "path": test_files
})

train_df.head()

Unnamed: 0,path,label
0,/kaggle/input/depressionintamil/Tamil/Depresse...,1
1,/kaggle/input/depressionintamil/Tamil/Depresse...,1
2,/kaggle/input/depressionintamil/Tamil/Depresse...,1
3,/kaggle/input/depressionintamil/Tamil/Depresse...,1
4,/kaggle/input/depressionintamil/Tamil/Depresse...,1


**First Model**

In [7]:
import os
import re
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import librosa
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Model
)

from sklearn.metrics import f1_score
from sklearn.model_selection import GroupShuffleSplit
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


2026-01-28 08:13:25.106295: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769588005.301551      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769588005.357874      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769588005.844445      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769588005.844485      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769588005.844488      55 computation_placer.cc:177] computation placer alr

Device: cuda


In [8]:
DATA_ROOT = "/kaggle/input/depressionintamil"

DEPRESSED_DIR = os.path.join(DATA_ROOT, "Tamil/Depressed/Train_set")
NON_DEPRESSED_DIR = os.path.join(DATA_ROOT, "Tamil/Non-depressed/Train_set")

train_files = []
train_labels = []

for f in os.listdir(DEPRESSED_DIR):
    if f.endswith(".wav"):
        train_files.append(os.path.join(DEPRESSED_DIR, f))
        train_labels.append(1)

for f in os.listdir(NON_DEPRESSED_DIR):
    if f.endswith(".wav"):
        train_files.append(os.path.join(NON_DEPRESSED_DIR, f))
        train_labels.append(0)

print("Total files:", len(train_files))


Total files: 1374


In [9]:
def get_speaker_id(path):
    fname = os.path.basename(path)
    match = re.search(r'(A\d+|F\d+|S\d+|ND\d+)', fname)
    return match.group(1) if match else fname
def safe_speaker_split(files, labels, test_size=0.2):
    groups = [get_speaker_id(f) for f in files]

    for seed in range(100):
        gss = GroupShuffleSplit(
            n_splits=1, test_size=test_size, random_state=seed
        )
        train_idx, val_idx = next(gss.split(files, labels, groups))

        y_tr = [labels[i] for i in train_idx]
        y_va = [labels[i] for i in val_idx]

        if len(set(y_tr)) == 2 and len(set(y_va)) == 2:
            print(f"✅ Valid split found (seed={seed})")
            print("Train:", Counter(y_tr))
            print("Val:", Counter(y_va))
            return train_idx, val_idx

    raise RuntimeError("❌ Could not find valid speaker split")


In [10]:
train_idx, val_idx = safe_speaker_split(train_files, train_labels)

X_train = [train_files[i] for i in train_idx]
y_train = [train_labels[i] for i in train_idx]

X_val = [train_files[i] for i in val_idx]
y_val = [train_labels[i] for i in val_idx]


✅ Valid split found (seed=0)
Train: Counter({0: 736, 1: 423})
Val: Counter({0: 184, 1: 31})


In [11]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "facebook/wav2vec2-xls-r-300m"
)


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [12]:
class AudioDataset(Dataset):
    def __init__(self, files, labels):
        self.files = files
        self.labels = labels

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        audio, _ = librosa.load(self.files[idx], sr=16000)

        return {
            "audio": audio,
            "label": self.labels[idx]
        }
def collate_fn(batch):
    audios = [item["audio"] for item in batch]
    labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)

    inputs = feature_extractor(
        audios,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    return {
        "input_values": inputs.input_values,
        "labels": labels
    }
train_loader = DataLoader(
    AudioDataset(X_train, y_train),
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    AudioDataset(X_val, y_val),
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn
)



In [13]:
class XLSRClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = Wav2Vec2Model.from_pretrained(
            "facebook/wav2vec2-xls-r-300m"
        )
        self.classifier = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 2)
        )

    def forward(self, input_values):
        outputs = self.encoder(input_values)
        pooled = outputs.last_hidden_state.mean(dim=1)
        return self.classifier(pooled)


In [14]:
model = XLSRClassifier().to(device)

for param in model.encoder.parameters():
    param.requires_grad = False


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

In [15]:
class_counts = np.bincount(y_train, minlength=2)

weights = torch.tensor(
    class_counts.sum() / (2 * class_counts),
    device=device,
    dtype=torch.float
)

criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=1e-3)


In [16]:
from sklearn.metrics import accuracy_score, f1_score

EPOCHS = 6

for epoch in range(EPOCHS):
    # ---------- TRAIN ----------
    model.train()
    total_loss = 0
    train_preds, train_trues = [], []

    for batch in train_loader:
        optimizer.zero_grad()

        inputs = batch["input_values"].to(device)
        labels = batch["labels"].to(device)

        logits = model(inputs)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_trues.extend(labels.cpu().numpy())

    train_loss = total_loss / len(train_loader)
    train_acc = accuracy_score(train_trues, train_preds)

    # ---------- VALIDATION ----------
    model.eval()
    val_preds, val_trues = [], []

    with torch.no_grad():
        for batch in val_loader:
            inputs = batch["input_values"].to(device)
            labels = batch["labels"].to(device)

            logits = model(inputs)
            preds = torch.argmax(logits, dim=1)

            val_preds.extend(preds.cpu().numpy())
            val_trues.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_trues, val_preds)
    val_f1 = f1_score(val_trues, val_preds, average="macro")

    # ---------- LOG ----------
    print(
        f"Epoch {epoch+1}/{EPOCHS} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Train Acc: {train_acc:.4f} | "
        f"Val Acc: {val_acc:.4f} | "
        f"Val Macro-F1: {val_f1:.4f}"
    )


model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Epoch 1/6 | Train Loss: 0.4721 | Train Acc: 0.7748 | Val Acc: 0.8093 | Val Macro-F1: 0.7383
Epoch 2/6 | Train Loss: 0.2065 | Train Acc: 0.9180 | Val Acc: 0.9116 | Val Macro-F1: 0.8555
Epoch 3/6 | Train Loss: 0.2123 | Train Acc: 0.9137 | Val Acc: 0.8884 | Val Macro-F1: 0.8256
Epoch 4/6 | Train Loss: 0.1344 | Train Acc: 0.9491 | Val Acc: 0.8605 | Val Macro-F1: 0.7926
Epoch 5/6 | Train Loss: 0.1053 | Train Acc: 0.9620 | Val Acc: 0.9721 | Val Macro-F1: 0.9384
Epoch 6/6 | Train Loss: 0.1700 | Train Acc: 0.9370 | Val Acc: 0.9907 | Val Macro-F1: 0.9812


In [17]:
SAVE_PATH = "/kaggle/working/xlsr_tamil_final.pt"

torch.save(
    {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epochs": EPOCHS
    },
    SAVE_PATH
)

print(f"✅ Model saved after {EPOCHS} epochs at {SAVE_PATH}")


✅ Model saved after 6 epochs at /kaggle/working/xlsr_tamil_final.pt


In [18]:
checkpoint = torch.load("/kaggle/working/xlsr_tamil_final.pt", map_location=device)

model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

print("✅ Model loaded successfully")


✅ Model loaded successfully


In [20]:
TEST_DIR = "/kaggle/input/depressionintamil/Test-set-tamil/Test-set-tamil"

test_files = sorted([
    os.path.join(TEST_DIR, f)
    for f in os.listdir(TEST_DIR)
    if f.endswith(".wav")
])

print("Total test files:", len(test_files))


Total test files: 160


In [21]:
class TestAudioDataset(Dataset):
    def __init__(self, files):
        self.files = files

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        audio, _ = librosa.load(self.files[idx], sr=16000)
        return {
            "audio": audio,
            "file": os.path.basename(self.files[idx])
        }
def test_collate_fn(batch):
    audios = [item["audio"] for item in batch]
    files = [item["file"] for item in batch]

    inputs = feature_extractor(
        audios,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    return {
        "input_values": inputs.input_values,
        "files": files
    }
test_loader = DataLoader(
    TestAudioDataset(test_files),
    batch_size=4,
    shuffle=False,
    collate_fn=test_collate_fn
)


In [22]:
test_results = []

with torch.no_grad():
    for batch in test_loader:
        inputs = batch["input_values"].to(device)
        files = batch["files"]

        logits = model(inputs)
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        for f, p in zip(files, preds):
            label = "Depressed" if p == 1 else "Non-depressed"
            test_results.append((f, label))


In [23]:
for i in range(10):
    print(test_results[i])


('t1.wav', 'Depressed')
('t10.wav', 'Non-depressed')
('t100.wav', 'Depressed')
('t101.wav', 'Depressed')
('t102.wav', 'Depressed')
('t103.wav', 'Depressed')
('t104.wav', 'Depressed')
('t105.wav', 'Non-depressed')
('t106.wav', 'Depressed')
('t107.wav', 'Non-depressed')


In [None]:
import pandas as pd

submission = pd.DataFrame(
    test_results,
    columns=["file", "prediction"]
)

submission_path = "/kaggle/working/tamil_test_predictions.csv"
submission.to_csv(submission_path, index=False)

print("✅ Predictions saved at:", submission_path)