In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

# Paths to the audio files (adjust based on your directory)
audio_folder = 'C:/Users/Laptop/Documents/AI/zindi/ewe_1/files/audio_files/'
train_csv_path = 'Train.csv'
test_csv_path = 'Test_1.csv'
submission_csv_path = 'SampleSubmission_1.csv'

In [2]:
# Load CSV files
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)
sample_submission = pd.read_csv(submission_csv_path)

In [3]:
len(train_data), len(test_data)

(5334, 2946)

In [4]:
# Method 2: Using StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.17, random_state=58)

for train_index, val_index in sss.split(train_data.drop('class', axis=1), train_data['class']):
    train_df, val_df = train_data.iloc[train_index], train_data.iloc[val_index]

In [6]:
# train_df = train_df.reset_index()
# train_df.pop('index')
# # train_df

In [8]:
# val_df = val_df.reset_index()
# val_df.pop('index')
# # val_df

In [9]:
# One-hot encode labels (if necessary) 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_labels = le.fit_transform(train_df['class'])
train_df['labels'] = train_labels

le_val = LabelEncoder()
val_labels = le_val.fit_transform(val_df['class'])
val_df['labels'] = val_labels

In [11]:
import librosa
import numpy as np
import torch
import torch.nn.functional as F
from scipy.signal import butter, lfilter

# Bandpass filter (removes irrelevant frequency bands)
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    return b, a

def bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

def extract_mfcc(file_path, n_mfcc=40, max_len=100):
    """Extract MFCC features from an audio file."""
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

# Extract features (time-domain and frequency-domain)
def extract_features(file_path, n_mels=40, max_length=100):  # audio_file => file_path
    # Load the audio file
    audio, sr = librosa.load(file_path)
    
    # Apply bandpass filter (e.g., remove frequencies outside 20-4000 Hz)
    filtered_audio = bandpass_filter(audio, lowcut=20, highcut=4000, fs=sr)
    
    # Time-domain features
    rms = librosa.feature.rms(y=filtered_audio)  # Root Mean Square energy
    zcr = librosa.feature.zero_crossing_rate(filtered_audio)  # Zero-crossing rate
    
    # Frequency-domain features
    spectral_centroid = librosa.feature.spectral_centroid(y=filtered_audio, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=filtered_audio, sr=sr)
    spectral_flux = np.diff(librosa.feature.spectral_bandwidth(y=filtered_audio, sr=sr), axis=1)
    
    # Time-frequency representation: Mel-spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=filtered_audio, sr=sr, n_mels=n_mels)
    
    # Convert to decibels for better representation
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # mfcc
    mfcc = librosa.feature.mfcc(y=filtered_audio, sr=sr, n_mfcc=n_mels)
   
    # Ensure all features have the same length (pad/truncate)
    def pad_or_truncate(feature, target_length=max_length):
        if feature.shape[1] < target_length:
            padding = target_length - feature.shape[1]
            feature = np.pad(feature, ((0, 0), (0, padding)), 'constant')
        else:
            feature = feature[:, :target_length]
        return feature

    # rms = pad_or_truncate(rms)
    # zcr = pad_or_truncate(zcr)
    # spectral_centroid = pad_or_truncate(spectral_centroid)
    # spectral_bandwidth = pad_or_truncate(spectral_bandwidth)
    # spectral_flux = pad_or_truncate(spectral_flux)
    mel_spectrogram_db = pad_or_truncate(mel_spectrogram_db)
    mfcc = pad_or_truncate(mfcc)

    # print(f'rms: {rms.shape}, zcr: {zcr.shape}, spectral_centroid: {spectral_centroid.shape}')
    # print(f'spectral_bandwidth: {spectral_bandwidth.shape}, spectral_flux: {spectral_flux.shape}')
    # print(f'mel_spectrogram_db: {mel_spectrogram_db.shape}')
    # print(f'mfcc {mfcc.shape}')

    # features = np.vstack([mel_spectrogram_db, mfcc])
    features =  (mel_spectrogram_db + mfcc) / 2
    # print(f'features: {features.shape}')

    return features


    # return mfcc
    # return mel_spectrogram_db
    
    # # Stack features together (optional, but makes use of all features)
    # features = np.vstack([rms, zcr, spectral_centroid, spectral_bandwidth, spectral_flux, mel_spectrogram_db])
    
    # # Convert to PyTorch tensor
    # features_tensor = torch.tensor(features, dtype=torch.float32)
    
    # return features_tensor

# Preprocess multiple audio files
def preprocess_audio_files(audio_file_list, sample_rate=100, n_mels=40, max_length=100):
    feature_tensors = []
    for audio_file in audio_file_list:
        features = extract_features(audio_file, sample_rate, n_mels, max_length)
        feature_tensors.append(features)
    
    # Stack the tensors along a new batch dimension
    batched_features = torch.stack(feature_tensors)

    return batched_features

# ========================================================================================== 

    # # Convert the tensor to NumPy array
    # numpy_array = batched_features.numpy()

    # # Reshape to 2D array where each row is a feature vector
    # reshaped_array = numpy_array.reshape(numpy_array.shape[0], -1)  # (batch_size, num_features * height * width)

    # # Create a DataFrame
    # df = pd.DataFrame(reshaped_array)

    # df['features'] = df.apply(lambda row: row.tolist(), axis=1)


    # return df[['features']]


    # return feature_tensors
    
    return batched_features


In [12]:
# Helper function to extract MFCCs
def extract_mfcc(file_path, n_mfcc=40, max_len=100):
    """Extract MFCC features from an audio file."""
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    
    # Pad or truncate the mfccs to the same length
    if mfccs.shape[1] < max_len:
        pad_width = max_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfccs = mfccs[:, :max_len]
    
    return mfccs

In [13]:
# PyTorch Dataset class for loading data
class AudioDataset(Dataset):
    def __init__(self, data, audio_folder, is_train=True, transform=None):
        self.data = data
        self.audio_folder = audio_folder
        self.is_train = is_train
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_name = self.data.loc[idx, 'audio_filepath']
        file_path = self.audio_folder + file_name
        if self.is_train:
            label = self.data.loc[idx, 'labels']
        else:
            label = -1  # No label for test set

        # Extract features
        features = extract_features(file_path)
        # features = extract_mfcc(file_path)

        if self.transform:
            features = self.transform(features)

        features = torch.tensor(features, dtype=torch.float32)
        return features, label

In [30]:
# Define a simple CNN for classification
class SpeechClassifier(nn.Module):
    def __init__(self, num_classes):
        super(SpeechClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(32 * 10 * 25, 128)  # Adjust based on MFCC feature size
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 32 * 10 * 25)  # Flatten the tensor
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [31]:
# Load the training and test datasets
train_dataset = AudioDataset(train_df, audio_folder, is_train=True)
val_dataset = AudioDataset(val_df, audio_folder, is_train=False)
# test_dataset = AudioDataset(test_data, audio_folder, is_train=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [32]:
# Initialize the model, loss function, and optimizer
num_classes = len(train_df['labels'].unique())  # Assuming labels are integers
model = SpeechClassifier(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [33]:
# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for mfccs, labels in train_loader:
        mfccs = mfccs.float()
        labels = labels.long()

        optimizer.zero_grad()
        # print(f'mfccs.shape: {mfccs.shape}')
        outputs = model(mfccs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Predict on the val set
model.eval()
predictions = []

with torch.no_grad():
    for mfccs, _ in val_loader:
        outputs = model(mfccs)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())


Epoch [1/10], Loss: 1.6916
Epoch [2/10], Loss: 0.1074
Epoch [3/10], Loss: 0.0401
Epoch [4/10], Loss: 0.0241
Epoch [5/10], Loss: 0.0278
Epoch [6/10], Loss: 0.0235
Epoch [7/10], Loss: 0.0073
Epoch [8/10], Loss: 0.0089
Epoch [9/10], Loss: 0.0111
Epoch [10/10], Loss: 0.0132


In [34]:
new_df = pd.DataFrame(data={'predictions':predictions, 'labels': val_df['labels']})
new_df['accuracy'] = new_df['predictions'] == new_df['labels']
new_df['accuracy'] = new_df['accuracy'].apply(lambda x: int(x))
new_df['accuracy'].sum()/len(new_df)

0.9977949283351709

In [35]:
print(f'Accuracy Score: {accuracy_score(new_df.predictions, new_df.labels)}')
print(f'Mean Squared Error: {mean_squared_error(new_df.predictions, new_df.labels)}')

Accuracy Score: 0.9977949283351709
Mean Squared Error: 0.037486218302094816


In [36]:
len(train_df[train_df['labels'] == 6]), len(train_df[train_df['labels'] == 3])

(540, 540)

In [37]:
new_df[new_df.accuracy == 0]

Unnamed: 0,predictions,labels,accuracy
153,2,7,0
793,3,6,0


In [38]:
len(val_df), len(train_df)

(907, 4427)

In [39]:
# encode the prediction and the value labels
from sklearn.preprocessing import LabelBinarizer

df = pd.DataFrame(new_df.labels)

lb = LabelBinarizer()
encoded = lb.fit_transform(df['labels'])

preds_df = pd.DataFrame(new_df.predictions)
preds_encoded = lb.fit_transform(preds_df['predictions'])

# cross check two data frames for accuracy 

In [40]:
roc_auc = roc_auc_score(encoded, preds_encoded )
roc_auc

0.9987168386979945

In [41]:
# Assuming 'y_pred' and 'y_true' are your predicted and true labels
def calculate_metrics(y_true, y_pred):

    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    # roc_auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovo')

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", cm)

In [42]:
print(f'Metrics: {calculate_metrics(new_df.labels, new_df.predictions)}')

Precision: 0.9977678571428572
Recall: 0.9977477477477478
F1-score: 0.9977477020473591
Confusion Matrix:
 [[113   0   0   0   0   0   0   0]
 [  0 129   0   0   0   0   0   0]
 [  0   0 111   0   0   0   0   0]
 [  0   0   0 111   0   0   0   0]
 [  0   0   0   0 110   0   0   0]
 [  0   0   0   0   0 111   0   0]
 [  0   0   0   1   0   0 110   0]
 [  0   0   1   0   0   0   0 110]]
Metrics: None


In [43]:
# For overfitting, evaluate on the train data 

# Predict on the train set
model.eval()
preds_train = []

with torch.no_grad():
    for mfccs, _ in train_loader:
        outputs = model(mfccs)
        _, predicted = torch.max(outputs, 1)
        preds_train.extend(predicted.cpu().numpy())

In [45]:
accuracy_score(preds_train, train_df['labels'])
# 0.12536706573300202

0.1348543031398238

In [46]:
calculate_metrics(preds_train, train_df['labels'])

Precision: 0.13447782231669847
Recall: 0.13449287711118654
F1-score: 0.13448480095958637
Confusion Matrix:
 [[71 73 61 65 68 68 80 66]
 [72 97 84 78 68 72 86 73]
 [80 81 70 62 58 70 60 61]
 [56 81 64 70 72 65 53 75]
 [60 83 65 70 76 65 66 56]
 [75 65 63 70 67 70 76 58]
 [65 72 65 70 66 66 66 74]
 [73 78 68 55 66 68 53 77]]


In [47]:
test_dataset = AudioDataset(test_data, audio_folder, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [48]:
# Predict on the test set
model.eval()
predictions = []

with torch.no_grad():
    for mfccs, _ in test_loader:
        outputs = model(mfccs)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())

In [49]:
pred_df = pd.DataFrame(test_data['id']).join(pd.DataFrame(data={'predictions':predictions}))

In [50]:
label_class_map = dict(zip(val_df['labels'], val_df['class']))

In [51]:
pred_df['class'] = pred_df['predictions'].map(label_class_map)
pred_df[['id', 'class']].to_csv('fourteenth_submission.csv', index=False)

In [52]:
# Save the model
torch.save(model, 'models/fourteenth_model.pth')

======================================================================================================

In [None]:
# Experimentation 

In [46]:
val_df[val_df.index.isin([125, 200, 291, 294, 439, 589, 655, 705, 793])]['audio_filepath'].to_list()

['id_zoexdm8ija.wav',
 'id_ty9r27zg3q.wav',
 'id_tgb8l0c0wg.wav',
 'id_xn0fsugajr.wav',
 'id_t0wr768br5.wav',
 'id_side3oquzx.wav',
 'id_g9hjzgqvlq.wav',
 'id_9kd9836zh9.wav',
 'id_usnj1u0zi4.wav']

In [39]:
train_df[train_df.labels == 3]

Unnamed: 0,id,audio_filepath,duration,class,labels
3,id_o3kjklck0a,id_o3kjklck0a.wav,1.494,no,3
14,id_o421uutlu1,id_o421uutlu1.wav,1.298,no,3
19,id_zwbl8n96xc,id_zwbl8n96xc.wav,1.298,no,3
20,id_m1nmzfgqgj,id_m1nmzfgqgj.wav,2.074,no,3
29,id_2l1qkon9ck,id_2l1qkon9ck.wav,2.074,no,3
...,...,...,...,...,...
4383,id_caqol2mn49,id_caqol2mn49.wav,1.494,no,3
4389,id_duyvkvx3wd,id_duyvkvx3wd.wav,2.074,no,3
4396,id_1h5bxl3mc9,id_1h5bxl3mc9.wav,1.494,no,3
4400,id_0v0n87ljbv,id_0v0n87ljbv.wav,2.069,no,3


In [43]:
train_df[train_df.labels == 6]

Unnamed: 0,id,audio_filepath,duration,class,labels
1,id_i8zxrhlz5y,id_i8zxrhlz5y.wav,1.554,up,6
13,id_4pfrsk1p8s,id_4pfrsk1p8s.wav,1.834,up,6
27,id_deks1yg3bu,id_deks1yg3bu.wav,2.347,up,6
31,id_pzx2qxduq5,id_pzx2qxduq5.wav,1.578,up,6
38,id_09v8mt62hp,id_09v8mt62hp.wav,1.554,up,6
...,...,...,...,...,...
4380,id_40tl7wnmzd,id_40tl7wnmzd.wav,1.984,up,6
4391,id_vt6ozjrcat,id_vt6ozjrcat.wav,1.834,up,6
4419,id_a2so0g6x1e,id_a2so0g6x1e.wav,1.078,up,6
4423,id_jalx0hwv6y,id_jalx0hwv6y.wav,2.347,up,6


In [None]:
'id_deks1yg3bu.wav'

In [None]:
['id_zoexdm8ija.wav',
 'id_ty9r27zg3q.wav',
 'id_tgb8l0c0wg.wav',
 'id_xn0fsugajr.wav',
 'id_t0wr768br5.wav',
 'id_side3oquzx.wav',
 'id_g9hjzgqvlq.wav',
 'id_9kd9836zh9.wav',
 'id_usnj1u0zi4.wav']

In [55]:
sub_13 = pd.read_csv('thirteenth_submission.csv')

In [56]:
sub_14 = pd.read_csv('fourteenth_submission.csv')

In [58]:
sub_df = pd.DataFrame(data={'class_14':sub_14['class'], 'class_13': sub_13['class']})
sub_df['accuracy'] = sub_df['class_14'] == sub_df['class_13']
sub_df['accuracy'] = sub_df['accuracy'].apply(lambda x: int(x))
sub_df['accuracy'].sum()/len(sub_df)

0.8757637474541752

In [68]:
sub_df[sub_df.accuracy == 0].head(20)

Unnamed: 0,class_14,class_13,accuracy
0,up,yes,0
4,up,right,0
6,stop,go,0
7,left,yes,0
14,left,go,0
15,left,yes,0
16,stop,yes,0
17,stop,yes,0
20,yes,left,0
21,no,go,0


In [69]:
calculate_metrics(sub_df['class_13'], sub_df['class_14'])

Precision: 0.8835569867437438
Recall: 0.9120965432497925
F1-score: 0.8882471824084415
Confusion Matrix:
 [[325   0   0   0   0   0   0   0]
 [ 10 341  10  10   0  20   0   0]
 [  0   0 279   0   0  10   5   5]
 [  0   0   0 297   0   0  24   0]
 [  0   0  10   0 279   0  10   0]
 [  0   0   0   0   0 295   0   0]
 [  0   0   0   0   0  10 277   0]
 [  0  80  27   0   0 110  25 487]]


In [72]:
audio_folder + 'id_0a9fu87h9r.wav'

'C:/Users/Laptop/Documents/AI/zindi/ewe_1/files/audio_files/'

In [74]:
import wave
import IPython.display as ipd
import os

fname = os.path.join(audio_folder, 'id_0c5hyifav8.wav')   # Applause
# Open using wave library
wav = wave.open(fname)
print("Sampling (frame) rate = ", wav.getframerate())
print("Total samples (frames) = ", wav.getnframes())
print("Duration = ", wav.getnframes()/wav.getframerate())

ipd.Audio(fname)

Sampling (frame) rate =  48000
Total samples (frames) =  128016
Duration =  2.667
