In [21]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

# Load the processor and model for Whisper
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to pad the mel-spectrogram features to a fixed length
def pad_mel_features(input_features, target_length=3000):
    current_length = input_features.shape[-1]
    # Pad if shorter than the target length
    if current_length < target_length:
        padding = torch.zeros((input_features.shape[0], input_features.shape[1], target_length - current_length), device=input_features.device)  # Ensure padding is on the same device
        padded_features = torch.cat((input_features, padding), dim=-1)
    else:
        padded_features = input_features
    return padded_features

# Function to preprocess the audio files and extract features
def preprocess_function(examples):
    audio = examples['audio']
    
    if isinstance(audio, dict) and 'array' in audio:
        audio_array = audio['array']
    else:
        audio_array = audio
    
    # Extract input features from audio using the processor
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
    
    # Store input features as numpy for compatibility with the dataset
    examples["input_features"] = inputs.input_features.squeeze(0).cpu().numpy()
    
    return examples

# Apply the preprocessing to both the training and validation datasets
train_dataset = train_dataset.map(preprocess_function)
val_dataset = val_dataset.map(preprocess_function)

# Function to transcribe the dataset
def transcribe_audio(dataset):
    def generate_transcriptions(batch):
        # Convert input features back to torch tensor and move to GPU if available
        input_features = torch.tensor(batch["input_features"], device=device)  # Ensure it's 3D: [1, num_features, sequence_length]
        
        # Check if input_features needs unsqueeze
        if input_features.dim() == 2:  # If it has only 2 dimensions, add a batch dimension
            input_features = input_features.unsqueeze(0)
        
        # Pad the input features
        input_features = pad_mel_features(input_features)
        
        # Generate the transcription
        generated_ids = model.generate(input_features)
        
        # Decode the transcription and store in the batch
        batch["transcription"] = processor.batch_decode(generated_ids, skip_special_tokens=True)
        return batch
    
    return dataset.map(generate_transcriptions)

# Transcribe both the train and validation datasets
train_dataset = transcribe_audio(train_dataset)
val_dataset = transcribe_audio(val_dataset)

# Example: Print the transcription of the first sample
print(train_dataset["transcription"][0])



Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

[" Okay, and there's the picture. Just go ahead and tell you. You just say it. The mother is drying a plate. The water's sink is clogged, so there's more water going over it, and it's splashing onto the floor. And the boy's falling off the stool getting cookies from the cookie jar."]


In [35]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW, DataCollatorWithPadding
from tqdm import tqdm
import torch

# Load a text classification model (e.g., BERT or DistilBERT)
text_model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(text_model_name)
text_model = AutoModelForSequenceClassification.from_pretrained(text_model_name, num_labels=2)

# Check the structure of your datasets
def print_dataset_info(dataset):
    print("Dataset columns:", dataset.column_names)
    print("Sample data:", dataset[0])

print("Training Dataset Info:")
print_dataset_info(train_dataset)
print("\nValidation Dataset Info:")
print_dataset_info(val_dataset)

def tokenize_function(examples):
    # Ensure transcription column is present
    if "transcription" not in examples:
        print("No transcription column found in examples.")
        return {}

    # Debug print to check the structure of the transcriptions
    print("Original transcriptions:", examples["transcription"][:5])  # Display first few transcriptions

    # Check if transcriptions are nested
    if isinstance(examples["transcription"][0], list):
        # Flatten the transcriptions if they are nested lists
        flattened_transcriptions = [item for sublist in examples["transcription"] for item in sublist]
    else:
        # Ensure that transcriptions are a flat list
        flattened_transcriptions = examples["transcription"]

    # Tokenization with padding and truncation
    tokenized_output = tokenizer(
        flattened_transcriptions,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Debug print to check the output structure
    print("Tokenized output keys:", tokenized_output.keys())
    
    return tokenized_output


# Apply the tokenization to both train and validation datasets
if 'transcription' in train_dataset.column_names:
    train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["transcription"])
else:
    print("Transcription column not found in train dataset.")

if 'transcription' in val_dataset.column_names:
    val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["transcription"])
else:
    print("Transcription column not found in validation dataset.")

# Convert labels to tensors and set format for PyTorch
train_dataset = train_dataset.with_format("torch")
val_dataset = val_dataset.with_format("torch")

# Define a data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the data loaders with the data collator
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=data_collator)

# Move the text classification model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
text_model.to(device)

# Training and evaluation functions
def train_model(text_model, train_loader, optimizer, loss_fn, device, num_epochs=1):
    text_model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        print(f"Epoch {epoch + 1}/{num_epochs}")

        for batch in tqdm(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()

            outputs = text_model(input_ids, attention_mask=attention_mask).logits
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted_labels = torch.max(outputs, dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = correct_predictions / total_predictions
        print(f"Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

def evaluate_model(text_model, val_loader, loss_fn, device):
    text_model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = text_model(input_ids, attention_mask=attention_mask).logits
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()

            _, predicted_labels = torch.max(outputs, dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct_predictions / total_predictions
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Train and evaluate the text classification model
optimizer = AdamW(text_model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

train_model(text_model, train_loader, optimizer, loss_fn, device, num_epochs=10)
evaluate_model(text_model, val_loader, loss_fn, device)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Dataset Info:
Dataset columns: ['file_name', 'label', 'audio', 'input_features', 'input_ids', 'attention_mask']
Sample data: {'file_name': 'ADReSSo21/diagnosis/train/audio/cn\\adrso157.wav', 'label': tensor(0), 'audio': tensor([0.0003, 0.0004, 0.0004,  ..., 0.0012, 0.0014, 0.0000]), 'input_features': tensor([[-0.1475, -0.4895, -0.5567,  ...,  0.0348, -0.0207,  0.0439],
        [-0.0500, -0.3919, -0.4596,  ...,  0.1323,  0.0769,  0.1415],
        [-0.1546, -0.5026, -0.5567,  ...,  0.1928,  0.1851,  0.1927],
        ...,
        [-0.5567, -0.5567, -0.5567,  ..., -0.5567, -0.5567, -0.5509],
        [-0.5567, -0.5567, -0.5567,  ..., -0.5567, -0.5567, -0.5567],
        [-0.5567, -0.5567, -0.5567,  ..., -0.5567, -0.5567, -0.5567]]), 'input_ids': tensor([  101,  3100,  1010,  1998,  2045,  1005,  1055,  1996,  3861,  1012,
         2074,  2175,  3805,  1998,  2425,  2017,  1012,  2017,  2074,  2360,
         2009,  1012,  1996,  2388,  2003, 17462,  1037,  5127,  1012,  1996,
       

  0%|          | 0/17 [00:00<?, ?it/s]


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`file_name` in this case) have excessive nesting (inputs type `list` where type `int` is expected).