In [57]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import torchaudio
from torchaudio.transforms import Spectrogram
import torch.nn.functional as F

In [58]:
# Paths to your dataset
CONFIDENT_FILE = os.path.join('confident_wav', 'Confident (1).wav')
UNCONFIDENT_FILE = os.path.join('unconfident_wav', 'Unconfident (1).wav')

In [59]:
# Data loading and preprocessing
def load_wav_16k_mono(filename):
    # Load encoded wav file
    waveform, sample_rate = torchaudio.load(filename)
    # Resample to 16 kHz and convert to mono
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform.mean(dim=0).unsqueeze(0)  # convert to mono and add batch dimension

def preprocess(filename):
    waveform = load_wav_16k_mono(filename)
    spectrogram = Spectrogram(n_fft=320, hop_length=32)(waveform)
    return spectrogram

In [60]:
import torch.nn.functional as F

class AudioDataset(Dataset):
    def __init__(self, file_paths, labels, max_length=1500):
        """
        Args:
            file_paths: List of file paths for the audio files.
            labels: List of corresponding labels (1 or 0).
            max_length: Fixed length for spectrograms. Spectrograms longer than this will be truncated,
                        and shorter ones will be padded.
        """
        self.file_paths = file_paths
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        spectrogram = self.load_spectrogram(self.file_paths[idx])
        label = self.labels[idx]

        # Truncate or pad spectrogram to max_length
        if spectrogram.shape[2] > self.max_length:
            spectrogram = spectrogram[:, :, :self.max_length]  # Truncate
        elif spectrogram.shape[2] < self.max_length:
            padding = self.max_length - spectrogram.shape[2]
            spectrogram = F.pad(spectrogram, (0, padding), mode='constant', value=0)  # Pad

        return spectrogram, label

    def load_spectrogram(self, path):
        """
        Loads the spectrogram from a file.
        """
        waveform, sample_rate = torchaudio.load(path)
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
        spectrogram = Spectrogram(n_fft=320, hop_length=32)(waveform.mean(dim=0).unsqueeze(0))
        return spectrogram


In [61]:
# Load Dataset
confident_dir = os.path.join('confident_wav')
unconfident_dir = os.path.join('unconfident_wav')

confident_files = [os.path.join(confident_dir, file) for file in os.listdir(confident_dir) if file.endswith('.wav')]
unconfident_files = [os.path.join(unconfident_dir, file) for file in os.listdir(unconfident_dir) if file.endswith('.wav')]

file_paths = confident_files + unconfident_files
labels = [1] * len(confident_files) + [0] * len(unconfident_files)

dataset = AudioDataset(file_paths, labels, max_length=1500)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [62]:
class AudioModel(nn.Module):
    def __init__(self):
        super(AudioModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3)
        self.flatten = nn.Flatten()
        self.fc1 = None  # Placeholder for dynamic initialization
        self.fc2 = nn.Linear(128, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.flatten(x)

        if self.fc1 is None:
            self.fc1 = nn.Linear(x.shape[1], 128).to(x.device)  # Dynamically initialize fc1
        
        x = self.fc1(x)
        x = self.fc2(x)
        return torch.sigmoid(x)

In [63]:
def process_audio(audio_path):
    waveform, sample_rate = torchaudio.load(audio_path)
    
    # Convert stereo to mono by averaging channels if there are 2 channels
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    
    # Create a Mel spectrogram
    spectrogram = torchaudio.transforms.MelSpectrogram()(waveform)
    spectrogram = spectrogram.unsqueeze(0)  # Add batch dimension
    return spectrogram


In [49]:
# Training with Validation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AudioModel().to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)  # Add weight decay

epochs = 30
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    
    for spectrogram, labels in train_loader:
        spectrogram, labels = spectrogram.to(device), labels.to(device).float().view(-1, 1)
        
        optimizer.zero_grad()
        outputs = model(spectrogram)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        predicted = (outputs > 0.5).float()
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)
    
    accuracy = (correct_predictions / total_predictions) * 100
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    model.eval()
    with torch.no_grad():
        for spectrogram, labels in val_loader:
            spectrogram, labels = spectrogram.to(device), labels.to(device).float().view(-1, 1)
            outputs = model(spectrogram)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = (outputs > 0.5).float()
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)
    
    val_accuracy = (val_correct / val_total) * 100
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%, Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {val_accuracy:.2f}%")

torch.save(model.state_dict(), 'D:/audio_model3.pth')
print("Model saved as 'audio_model3.pth'")

Epoch [1/30], Loss: 0.8765, Accuracy: 44.19%, Val Loss: 0.9110, Val Accuracy: 36.36%
Epoch [2/30], Loss: 0.6701, Accuracy: 58.91%, Val Loss: 0.8522, Val Accuracy: 24.24%
Epoch [3/30], Loss: 0.6287, Accuracy: 62.79%, Val Loss: 0.8261, Val Accuracy: 39.39%
Epoch [4/30], Loss: 0.5823, Accuracy: 61.24%, Val Loss: 0.8430, Val Accuracy: 42.42%
Epoch [5/30], Loss: 0.5696, Accuracy: 63.57%, Val Loss: 0.8776, Val Accuracy: 48.48%
Epoch [6/30], Loss: 0.5600, Accuracy: 70.54%, Val Loss: 0.9236, Val Accuracy: 48.48%
Epoch [7/30], Loss: 0.5119, Accuracy: 75.19%, Val Loss: 0.9581, Val Accuracy: 48.48%
Epoch [8/30], Loss: 0.4993, Accuracy: 72.09%, Val Loss: 1.0526, Val Accuracy: 48.48%
Epoch [9/30], Loss: 0.4525, Accuracy: 72.87%, Val Loss: 1.0565, Val Accuracy: 45.45%
Epoch [10/30], Loss: 0.4284, Accuracy: 75.97%, Val Loss: 1.0681, Val Accuracy: 45.45%
Epoch [11/30], Loss: 0.4061, Accuracy: 76.74%, Val Loss: 1.0751, Val Accuracy: 45.45%
Epoch [12/30], Loss: 0.3911, Accuracy: 77.52%, Val Loss: 1.0924

In [56]:
# Load the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AudioModel().to(device)

# Manually adjust the state dictionary
state_dict = torch.load('D:/audio_model3.pth', map_location=device)

# Remove unexpected keys related to fc1
filtered_state_dict = {k: v for k, v in state_dict.items() if k in model.state_dict()}

# Load the filtered state dictionary
model.load_state_dict(filtered_state_dict, strict=False)
model.eval()

# Process and predict
audio_path = 'selinf(2).wav'  # Replace with the path to your audio file
spectrogram = process_audio(audio_path).to(device)

with torch.no_grad():
    output = model(spectrogram)
    prediction = output.item()

if prediction >= 0.5:
    print("Prediction: Confident")
else:
    print("Prediction: Unconfident")


  state_dict = torch.load('D:/audio_model3.pth', map_location=device)


Prediction: Unconfident


In [64]:
import torch
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Step 1: Load the trained model
class MyModel(AudioModel):  # Replace with your model class
    def __init__(self):
        super(MyModel, self).__init__()
        # Define layers here

    def forward(self, x):
        # Define forward pass
        return x

# Load the model architecture and weights
model = MyModel()  # Replace MyModel with your actual model class
model.load_state_dict(torch.load("D:/audio_model3.pth"))
model.eval()  # Set model to evaluation mode

# Step 2: Prepare the test dataset and DataLoader
transform = transforms.Compose([
    transforms.ToTensor(),  # Adjust based on your data preprocessing
    transforms.Normalize((0.5,), (0.5,))  # Example normalization
])

test_dataset = datasets.ImageFolder(root="testr", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 3: Make predictions and collect labels
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, preds = torch.max(outputs, 1)  # Get the class with highest score
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Step 4: Calculate metrics
f1 = f1_score(all_labels, all_preds, average="weighted")  # Adjust average for multi-class
recall = recall_score(all_labels, all_preds, average="weighted")
precision = precision_score(all_labels, all_preds, average="weighted")
conf_matrix = confusion_matrix(all_labels, all_preds)

# Print the metrics
print(f"F1 Score: {f1:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")
print("Confusion Matrix:")
print(conf_matrix)


  model.load_state_dict(torch.load("D:/audio_model3.pth"))


RuntimeError: Error(s) in loading state_dict for MyModel:
	Unexpected key(s) in state_dict: "fc1.weight", "fc1.bias". 