In [1]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

### Data Preperation

In [2]:


# Custom Dataset to load the .npy files
class AccentDataset(Dataset):
    def __init__(self, feature_dir):
        self.feature_dir = feature_dir
        self.files = []
        self.labels = []

        # Get all .npy files and their corresponding folder names (targets)
        for folder in os.listdir(feature_dir):
            folder_path = os.path.join(feature_dir, folder)
            if os.path.isdir(folder_path):
                for file in os.listdir(folder_path):
                    if file.endswith('.npy'):
                        self.files.append(os.path.join(folder_path, file))
                        self.labels.append(folder)
        
        # Encode the folder names (categories) to numeric labels
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(self.labels)
    
    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, idx):
        file_path = self.files[idx]
        features = np.load(file_path)  # Load the .npy file
        features = torch.tensor(features).float()  # Convert to tensor
        label = torch.tensor(self.labels[idx]).long()  # Get label
        return features, label


### Model Setup (Conformer)

In [3]:
import torch.nn as nn
import torch.optim as optim

In [16]:


class SimpleConformer(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SimpleConformer, self).__init__()
        self.conformer = nn.Sequential(
            nn.Conv1d(768, 32, kernel_size=3, stride=1, padding=1),  # Example Conv layer
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )
        self.flattened_size = 64 * 162
        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(self.flattened_size, 128),
            nn.ReLU(),
            nn.Linear(128, 3),  # Output layer for 3 categories
        )
    def forward(self, x):
        x = self.conformer(x)
        x = x.view(x.size(0), -1)  # Flatten
        out = self.fc(x)
        return out
# Example usage
model = SimpleConformer(208002,3)
input_tensor = torch.randn(16, 768, 649)  # Batch of 16, 768 channels, sequence length 649
output = model(input_tensor)
print(output.shape) 

def compute_accuracy(preds, labels):
    _, predicted = torch.max(preds, 1)  # Get the index of the max log-probability
    correct = (predicted == labels).sum().item()  # Count correct predictions
    accuracy = correct / labels.size(0)
    return accuracy


torch.Size([16, 3])


### Training the Model

In [18]:
dataset = AccentDataset(feature_dir='trailDataset')
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Define the model, loss function, and optimizer
input_dim = 208002  # Your input dimension (sequence length)
num_classes = 3  # Number of accent categories
model = SimpleConformer(input_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    
    for features, labels in dataloader:
        features, labels = features.to(device), labels.to(device)
        
        
        features = features.squeeze(1)
        features = features.transpose(1, 2)  
        
        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
            
            # Compute accuracy
        accuracy = compute_accuracy(outputs, labels)
        correct_predictions += accuracy * labels.size(0)
        total_predictions += labels.size(0)
    
    epoch_loss = running_loss / total_predictions
    epoch_accuracy = correct_predictions / total_predictions
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')
        
print('Training completed!')


Epoch 1/50, Loss: 0.0689, Accuracy: 0.3603
Epoch 2/50, Loss: 0.0664, Accuracy: 0.4246
Epoch 3/50, Loss: 0.0637, Accuracy: 0.4777
Epoch 4/50, Loss: 0.0590, Accuracy: 0.5393
Epoch 5/50, Loss: 0.0510, Accuracy: 0.6275
Epoch 6/50, Loss: 0.0370, Accuracy: 0.7505
Epoch 7/50, Loss: 0.0231, Accuracy: 0.8626
Epoch 8/50, Loss: 0.0121, Accuracy: 0.9338
Epoch 9/50, Loss: 0.0054, Accuracy: 0.9754
Epoch 10/50, Loss: 0.0029, Accuracy: 0.9885
Epoch 11/50, Loss: 0.0026, Accuracy: 0.9918
Epoch 12/50, Loss: 0.0006, Accuracy: 0.9993
Epoch 13/50, Loss: 0.0001, Accuracy: 1.0000
Epoch 14/50, Loss: 0.0003, Accuracy: 0.9987
Epoch 15/50, Loss: 0.0001, Accuracy: 1.0000
Epoch 16/50, Loss: 0.0000, Accuracy: 1.0000
Epoch 17/50, Loss: 0.0000, Accuracy: 1.0000
Epoch 18/50, Loss: 0.0000, Accuracy: 1.0000
Epoch 19/50, Loss: 0.0000, Accuracy: 1.0000
Epoch 20/50, Loss: 0.0000, Accuracy: 1.0000
Epoch 21/50, Loss: 0.0000, Accuracy: 1.0000
Epoch 22/50, Loss: 0.0000, Accuracy: 1.0000
Epoch 23/50, Loss: 0.0000, Accuracy: 1.00

KeyboardInterrupt: 