In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [2]:
feature_names = pd.read_csv(r"MLPC data/idx_to_feature_name.csv"); feature_names

Unnamed: 0,index,feature_name
0,0,bandwidth_0
1,1,centroid_0
2,2,contrast_0
3,3,contrast_1
4,4,contrast_2
...,...,...
170,170,mfcc_d2_30
171,171,mfcc_d2_31
172,172,power_0
173,173,yin_0


In [3]:
# Load the .npy file
features = np.load('MLPC data/development.npy') # Shape of the data: (45296, 175, 44)
# Print the shape of the array to understand its dimensions
print("Shape of the data:", features.shape)

Shape of the data: (45296, 175, 44)


In [4]:
# Load development metadata
metadata = pd.read_csv('MLPC data/development.csv')
features_name = pd.read_csv('MLPC data/idx_to_feature_name.csv')

# Display data to understand structure
print("Development Data Preview:")
print(metadata.head())
print("\nFeature Names Preview:")
print(features_name.head())

Development Data Preview:
   id                 filename  speaker_id         word
0   0     words/Brötchen/1.wav           1     Brötchen
1   1         words/kann/1.wav           1         kann
2   2  words/Staubsauger/1.wav           1  Staubsauger
3   3      words/Spiegel/1.wav           1      Spiegel
4   4        words/Alarm/1.wav           1        Alarm

Feature Names Preview:
   index feature_name
0      0  bandwidth_0
1      1   centroid_0
2      2   contrast_0
3      3   contrast_1
4      4   contrast_2


## Data Preprocessing
### Merging the data together

In [5]:
# Check that the number of entries in metadata matches the number of feature sets
assert len(metadata) == features.shape[0], "Mismatch in number of metadata entries and feature sets"

# Flatten the feature data (from 3D to 2D)
n_samples, n_timesteps, n_features = features.shape
features_flattened = features.reshape(n_samples, n_timesteps * n_features)

# Adding an identifier column in metadata for easy reference
metadata['snippet_id'] = range(len(metadata))

# Create DataFrame from features for easier manipulation
features_df = pd.DataFrame(features_flattened)
features_df['snippet_id'] = metadata['snippet_id']

# Merge features with metadata
full_data = pd.merge(features_df, metadata, on='snippet_id')

In [6]:
full_data.shape # 45,296 samples, with each sample having 7,704 features plus an additional column for labels

(45296, 7705)

In [7]:
full_data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7695,7696,7697,7698,7699,snippet_id,id,filename,speaker_id,word
0,3215.288398,3390.327986,3458.654191,3566.31469,3798.595277,3560.958263,3854.339049,3768.389186,3319.838604,1098.674297,...,0.3275,0.415,0.3225,0.2825,0.3225,0,0,words/Brötchen/1.wav,1,Brötchen
1,3382.723848,3284.346188,3306.78888,3447.606124,3405.036954,3156.256496,3408.131222,3189.25104,3448.428437,3276.407036,...,0.18,0.125,0.0775,0.11,0.1275,1,1,words/kann/1.wav,1,kann
2,2840.292802,2731.571008,2583.823511,2653.939408,2758.208824,2986.179947,3101.84108,3176.543972,3016.110035,2623.352952,...,0.045,0.0725,0.06,0.0325,0.0275,2,2,words/Staubsauger/1.wav,1,Staubsauger


### Splitting the Data

In [8]:
# Prepare feature matrix X and labels y
X = full_data.drop(columns=['word', 'filename', 'speaker_id', 'snippet_id', 'id'])
y = full_data['word']

### Data Leakage
To prevent data leakage we want to insure that when we spllit our data no information about speakers appear in both training and testing datasets.

In [9]:
# Split unique speaker IDs into training and testing groups
train_speakers, test_speakers = train_test_split(metadata['speaker_id'].unique(), test_size=0.2, random_state=42)

# Create masks for selecting rows based on speaker IDs
train_mask = full_data['speaker_id'].isin(train_speakers)
test_mask = full_data['speaker_id'].isin(test_speakers)

# Apply masks to split X and y
X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

In [10]:
# Use RandomForest to determine feature importance
"""This is time-consuming bc it samples the whole data, instead we will sample 25%
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Select features based on importance
selector = SelectFromModel(rf, prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)"""

'This is time-consuming bc it samples the whole data, instead we will sample 25%\nrf = RandomForestClassifier(n_estimators=100, random_state=42)\nrf.fit(X_train, y_train)\n\n# Select features based on importance\nselector = SelectFromModel(rf, prefit=True)\nX_train_selected = selector.transform(X_train)\nX_test_selected = selector.transform(X_test)'

In [11]:
# Randomly sample 25% of training data
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=0.75, random_state=42)

# Initialize RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Fit RandomForest on the sampled training data
rf.fit(X_train_sample, y_train_sample)

In [12]:
# Use SelectFromModel to choose significant features based on the Random Forest trained on the sampled data
selector = SelectFromModel(rf, prefit=True) # model
# Transform both the complete training and test datasets using the selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

### Continue with Model Trainig
Now that we have reduced the feature set, we can train the final model on these selected features to verify the improvements and continue with further validations or hyperparameter tuning

In [13]:
# Training a new RandomForest with selected features to check performance
rf_final = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_final.fit(X_train_selected, y_train)

# Evaluate model on the test set with selected features
y_pred = rf_final.predict(X_test_selected)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with selected features:", accuracy)

Accuracy with selected features: 0.8584137191854234


## Model Training: Neural Network

In [14]:
# Check if CUDA is available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


In [15]:
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.layer1 = nn.Linear(1916, 512)  # Ensure the input feature size matches
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 128)
        self.output_layer = nn.Linear(128, 21)

    def forward(self, x):
        print("Entering Layer 1")
        x = torch.relu(self.layer1(x))
        print("Exiting Layer 1, Entering Layer 2")
        x = torch.relu(self.layer2(x))
        print("Exiting Layer 2, Entering Layer 3")
        x = torch.relu(self.layer3(x))
        print("Exiting Layer 3, Entering Output Layer")
        x = self.output_layer(x)
        print("Exiting Output Layer")
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
model = NeuralNet().to(device)
model = NeuralNet().to(device) # error bc of ['CUDA_LAUNCH_BLOCKING'] = "1" don't use it!!!

#### Label Encoding
To implment the model, we need ot convert our categorical labels (like Licht) into integers.

In [17]:
"""
unique_labels_before = np.unique(y_train)
unique_labels_after = np.unique(y_train_encoded)

print("Unique labels before encoding:", unique_labels_before)
print("Unique labels after encoding:", unique_labels_after)"""

'\nunique_labels_before = np.unique(y_train)\nunique_labels_after = np.unique(y_train_encoded)\n\nprint("Unique labels before encoding:", unique_labels_before)\nprint("Unique labels after encoding:", unique_labels_after)'

In [18]:
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import TensorDataset, DataLoader

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the label encoder and transform labels to integers
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)  # Only transform on the test set

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_selected.astype(np.float32))
y_train_tensor = torch.tensor(y_train_encoded.astype(np.int64))  # Convert encoded labels to tensor
X_test_tensor = torch.tensor(X_test_selected.astype(np.float32))
y_test_tensor = torch.tensor(y_test_encoded.astype(np.int64))  # Convert encoded labels to tensor

# Create tensor datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [19]:
#print('Min label:', min(y_train_encoded))
#print('Max label:', max(y_train_encoded))

In [20]:
#print("Train tensor shape:", X_train_tensor.shape)
#print("Train labels shape:", y_train_tensor.shape)

#print("Data type:", X_train_tensor.dtype)
#print("Label type:", y_train_tensor.dtype)

#### Train and Evaluate CNN

In [21]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 30
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        try:
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        except RuntimeError as e:
            print(f"Runtime error: {e}")
            print(f"Labels: {labels}")
            print(f"Predictions: {outputs}")
            break  # Break out of the loop to fix issues

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the test images: {} %'.format(100 * correct / total))

Entering Layer 1
Exiting Layer 1, Entering Layer 2
Exiting Layer 2, Entering Layer 3
Exiting Layer 3, Entering Output Layer
Exiting Output Layer
Entering Layer 1
Exiting Layer 1, Entering Layer 2
Exiting Layer 2, Entering Layer 3
Exiting Layer 3, Entering Output Layer
Exiting Output Layer
Entering Layer 1
Exiting Layer 1, Entering Layer 2
Exiting Layer 2, Entering Layer 3
Exiting Layer 3, Entering Output Layer
Exiting Output Layer
Entering Layer 1
Exiting Layer 1, Entering Layer 2
Exiting Layer 2, Entering Layer 3
Exiting Layer 3, Entering Output Layer
Exiting Output Layer
Entering Layer 1
Exiting Layer 1, Entering Layer 2
Exiting Layer 2, Entering Layer 3
Exiting Layer 3, Entering Output Layer
Exiting Output Layer
Entering Layer 1
Exiting Layer 1, Entering Layer 2
Exiting Layer 2, Entering Layer 3
Exiting Layer 3, Entering Output Layer
Exiting Output Layer
Entering Layer 1
Exiting Layer 1, Entering Layer 2
Exiting Layer 2, Entering Layer 3
Exiting Layer 3, Entering Output Layer
Exitin

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_selected, y_train)
knn_accuracy = accuracy_score(y_test, knn.predict(X_test_selected))

print(f"KNN Accuracy: {knn_accuracy}")

found 0 physical cores < 1
  File "C:\Users\plani\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


KNN Accuracy: 0.5227224008574491
