In [9]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import librosa
import os
from tqdm import tqdm
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from preprocessing.identity import Identity
from feature_extraction.mfcc import MFCC
from models.svm import SVM

original_metadata_path = os.path.join(".", "data", "original_data_labeled.tsv")
filtered_metadata_path = os.path.join(".", "data", "filtered_data_labeled.tsv")
audio_dir = os.path.join(".", "data", "filtered_clips")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Remove records of erroneous data (e.g. missing or corrupted audio files) from the dataset.

In [10]:
if not os.path.exists(filtered_metadata_path):
  df = pd.read_csv(original_metadata_path, sep='\t')

  valid_indices = []
  for idx, row in tqdm(df.iterrows(), total=len(df), desc="Checking files"):
    file_path = os.path.join(audio_dir, row['path'])
    if os.path.exists(file_path):
      valid_indices.append(idx)

  # Use only records with existing files
  df = df.loc[valid_indices]

  # Save the filtered DataFrame to a new TSV file
  df.to_csv(filtered_metadata_path, sep='\t', index=False)

In [11]:
# Load metadata
df = pd.read_csv(filtered_metadata_path, sep='\t')
# Drop unnecessary columns
df.drop(columns=['client_id', 'sentence', 'age', 'gender', 'accent'], inplace=True, errors='ignore')
df.head()

Unnamed: 0,path,up_votes,down_votes,label
0,common_voice_en_19687170.mp3,2,1,3
1,common_voice_en_19687171.mp3,2,1,3
2,common_voice_en_19687172.mp3,2,0,3
3,common_voice_en_19687173.mp3,2,0,3
4,common_voice_en_19687174.mp3,2,0,3


Loading Training data

In [12]:
samples = 2000 # df['label'].value_counts().min()
balanced_samples = pd.DataFrame()
for cls in range(4):
    cls_df = df[df['label'] == cls]
    sampled = cls_df.sample(n=samples, random_state=42)  # Random sampling
    balanced_samples = pd.concat([balanced_samples, sampled])

def load_audio_file(file_path):
    try:
        return librosa.load(file_path)
    except Exception as e:
        return None, None

X, Y = [], []
error = 0
for row in tqdm(balanced_samples.itertuples(), desc="Loading audio files"):
    audio, sr = load_audio_file(os.path.join(audio_dir, row.path))
    if audio is None:
        error += 1
    else:
        X.append(audio)
        Y.append(row.label)

  return librosa.load(file_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Loading audio files: 8000it [01:22, 97.37it/s] 


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

### Pipeline

In [14]:
identity = Identity()
mfcc = MFCC()
svm = SVM()

pipe = make_pipeline(identity, mfcc, svm, verbose=True)
pipe

In [16]:
pipe = pipe.fit(X_train, np.array(y_train))
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[Pipeline] .......... (step 1 of 3) Processing identity, total=   0.0s


Extracting MFCCs: 100%|██████████| 6051/6051 [01:08<00:00, 87.79it/s] 


[Pipeline] .............. (step 2 of 3) Processing mfcc, total= 1.1min
[Pipeline] ............... (step 3 of 3) Processing svm, total=   2.2s


Extracting MFCCs: 100%|██████████| 1513/1513 [00:16<00:00, 90.08it/s]


Accuracy: 0.5677461996034369

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.61      0.57       377
           1       0.60      0.66      0.63       384
           2       0.59      0.50      0.54       374
           3       0.56      0.51      0.53       378

    accuracy                           0.57      1513
   macro avg       0.57      0.57      0.57      1513
weighted avg       0.57      0.57      0.57      1513


Confusion Matrix:
[[229  20  90  38]
 [ 28 253  14  89]
 [138  25 186  25]
 [ 38 126  23 191]]


