In [1]:
!pip uninstall torch torchvision torchaudio transformers -y

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
Found existing installation: transformers 4.47.0
Uninstalling transformers-4.47.0:
  Successfully uninstalled transformers-4.47.0


In [33]:
!pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121 --extra-index-url https://download.pytorch.org/whl/cu121
!pip install transformers==4.47.0

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu121


In [27]:
!pip install --upgrade huggingface_hub




In [25]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import WhisperFeatureExtractor, WhisperModel, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np


In [5]:
torch.cuda.empty_cache()

In [6]:
dataset = load_dataset("DynamicSuperb/StutteringDetection_SEP28k", split="test")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")


README.md:   0%|          | 0.00/431 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/91.2M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [7]:
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

def preprocess_function(batch):
    audio = batch["audio"]["array"]
    features = feature_extractor(audio, sampling_rate=16000, return_tensors="np").input_features[0]
    label = 1 if batch["label"] == "yes" else 0
    return {"input_features": features, "label": label}

dataset = dataset.map(preprocess_function, num_proc=1)


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
dataset_list = list(dataset)
train_list, test_list = train_test_split(dataset_list, test_size=0.2, random_state=42)
from datasets import Dataset
train_dataset = Dataset.from_list(train_list)
test_dataset = Dataset.from_list(test_list)


In [9]:
from torch.utils.data import Dataset

class StutteringDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return {
            "input_features": torch.tensor(self.dataset[idx]["input_features"], dtype=torch.float32),
            "label": torch.tensor(self.dataset[idx]["label"], dtype=torch.long),
        }

train_data = StutteringDataset(train_dataset)
test_data = StutteringDataset(test_dataset)


In [10]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    input_features = torch.stack([b["input_features"] for b in batch])
    labels = torch.tensor([b["label"] for b in batch], dtype=torch.long)
    return input_features, labels

train_dataloader = DataLoader(train_data, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=2, collate_fn=collate_fn)


In [11]:
import torch.nn as nn
from transformers import WhisperModel

class WhisperForStutteringClassification(nn.Module):
    def __init__(self, model_name="openai/whisper-small", num_labels=2):
        super().__init__()
        self.whisper = WhisperModel.from_pretrained(model_name)
        self.whisper.encoder.gradient_checkpointing = False
        self.classifier = nn.Linear(self.whisper.config.d_model, num_labels)

    def forward(self, input_features):
        encoder_outputs = self.whisper.encoder(input_features).last_hidden_state
        pooled_output = encoder_outputs.mean(dim=1)
        logits = self.classifier(pooled_output)
        return logits


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = WhisperForStutteringClassification().to(device)
model.whisper.encoder.gradient_checkpointing = False

if torch.cuda.is_available():
    print(f"Total GPUs Available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU found!")


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Total GPUs Available: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


In [13]:
import torch
torch.cuda.empty_cache()


In [14]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()


In [20]:
import torch
from torch.cuda.amp import autocast, GradScaler
import os
save_path = "/kaggle/working/"
os.makedirs(save_path, exist_ok=True)

gradient_accumulation_steps = 4
scaler = GradScaler()
num_epochs = 12

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for i, batch in enumerate(train_dataloader):
        input_features, labels = batch
        input_features, labels = input_features.to(device), labels.to(device)

        with autocast():
            logits = model(input_features)
            loss = criterion(logits, labels)

        scaler.scale(loss / gradient_accumulation_steps).backward()

        if (i + 1) % gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    model_save_path = os.path.join(save_path, "last_model.pth")
    torch.save(model.state_dict(), model_save_path)
    print(f"✅ Model saved at: {model_save_path}")


  scaler = GradScaler()
  with autocast():


Epoch 1, Loss: 248.4799
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 2, Loss: 117.9714
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 3, Loss: 31.6812
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 4, Loss: 15.2362
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 5, Loss: 1.0560
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 6, Loss: 0.0674
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 7, Loss: 0.0351
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 8, Loss: 0.0256
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 9, Loss: 0.0199
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 10, Loss: 0.0160
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 11, Loss: 0.0131
✅ Model saved at: /kaggle/working/last_model.pth
Epoch 12, Loss: 0.0109
✅ Model saved at: /kaggle/working/last_model.pth


In [17]:
import torch
from torch.cuda.amp import autocast, GradScaler
import os

save_path = "/kaggle/working/"
os.makedirs(save_path, exist_ok=True)

gradient_accumulation_steps = 4
scaler = GradScaler()
num_epochs = 12

for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for i, batch in enumerate(train_dataloader):
        input_features, labels = batch
        input_features, labels = input_features.to(device), labels.to(device)

        with autocast():
            logits = model(input_features)
            loss = criterion(logits, labels)

        scaler.scale(loss / gradient_accumulation_steps).backward()

        if (i + 1) % gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    model_save_path = os.path.join(save_path, f"model_epoch_{epoch}.pth")
    torch.save(model.state_dict(), model_save_path)
    print(f"✅ Model saved at: {model_save_path}")


  scaler = GradScaler()
  with autocast():


Epoch 1, Loss: 8.2591
✅ Model saved at: /kaggle/working/model_epoch_1.pth
Epoch 2, Loss: 8.9469
✅ Model saved at: /kaggle/working/model_epoch_2.pth
Epoch 3, Loss: 2.1061
✅ Model saved at: /kaggle/working/model_epoch_3.pth
Epoch 4, Loss: 13.4174
✅ Model saved at: /kaggle/working/model_epoch_4.pth
Epoch 5, Loss: 17.9407
✅ Model saved at: /kaggle/working/model_epoch_5.pth
Epoch 6, Loss: 10.5987
✅ Model saved at: /kaggle/working/model_epoch_6.pth
Epoch 7, Loss: 16.3906
✅ Model saved at: /kaggle/working/model_epoch_7.pth
Epoch 8, Loss: 13.6853
✅ Model saved at: /kaggle/working/model_epoch_8.pth
Epoch 9, Loss: 3.9187
✅ Model saved at: /kaggle/working/model_epoch_9.pth
Epoch 10, Loss: 10.3845
✅ Model saved at: /kaggle/working/model_epoch_10.pth
Epoch 11, Loss: 1.2564
✅ Model saved at: /kaggle/working/model_epoch_11.pth
Epoch 12, Loss: 0.0550
✅ Model saved at: /kaggle/working/model_epoch_12.pth


In [None]:
from transformers import AutoModel
from huggingface_hub import notebook_login
notebook_login()
model.push_to_hub("your_username/whisper-stuttering-classification")


In [19]:
model_path = "/kaggle/working/model_epoch_12.pth"
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f" Model loaded from {model_path}")
else:
    raise FileNotFoundError(f" Model file not found: {model_path}")
model.eval()
criterion = nn.CrossEntropyLoss()

def evaluate(model, test_dataloader):
    correct = 0
    total = 0
    total_loss = 0

    with torch.no_grad():
        for batch in test_dataloader:
            input_features, labels = batch
            input_features, labels = input_features.to(device), labels.to(device)
            logits = model(input_features)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(test_dataloader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy

test_loss, test_accuracy = evaluate(model, test_dataloader)

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

  model.load_state_dict(torch.load(model_path, map_location=device))


 Model loaded from /kaggle/working/model_epoch_12.pth
Test Loss: 1.3588, Test Accuracy: 74.50%


In [37]:
import torch
import torchaudio
from transformers import WhisperFeatureExtractor, WhisperModel
import torch.nn as nn

# Define Model
class WhisperForStutteringClassification(nn.Module):
    def __init__(self, model_name="openai/whisper-small", num_labels=2):
        super().__init__()
        self.whisper = WhisperModel.from_pretrained(model_name)
        self.whisper.encoder.gradient_checkpointing = False
        self.classifier = nn.Linear(self.whisper.config.d_model, num_labels)

    def forward(self, input_features):
        encoder_outputs = self.whisper.encoder(input_features).last_hidden_state
        pooled_output = encoder_outputs.mean(dim=1)
        logits = self.classifier(pooled_output)
        return logits

# Load Model (Fix applied)
model_path = "/kaggle/working/model_epoch_12.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = WhisperForStutteringClassification()
model.load_state_dict(torch.load(model_path, map_location=device))  # Load only weights
model.to(device)
model.eval()
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
def preprocess_audio(audio_path):
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = transform(waveform)
    input_features = feature_extractor(
        waveform.numpy(),
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features

    return input_features.to(device)

def predict_stuttering(audio_path):
    input_features = preprocess_audio(audio_path)

    with torch.no_grad():
        logits = model(input_features)

    probabilities = torch.softmax(logits, dim=1)
    prediction = torch.argmax(probabilities, dim=1).cpu().numpy()[0]

    return "Stuttering Detected" if prediction == 1 else "No Stuttering"
audio_file = "/kaggle/input/no-stutter/test.wav"
result = predict_stuttering(audio_file)
print(f"Prediction: {result}")


  model.load_state_dict(torch.load(model_path, map_location=device))  # Load only weights


Prediction: No Stuttering
