In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import librosa
import os
from tqdm import tqdm
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from preprocessing import silence_removal, noise_reduction, volume_normalization, quality_enhancer
from feature_extraction.mfcc import MFCC
from models.svm import SVM
from audio import Audio

# Dataset locations, all relative to the notebook's working directory.
data_dir = os.path.join(".", "data")
original_metadata_path = os.path.join(data_dir, "original_data_labeled.tsv")  # raw metadata
filtered_metadata_path = os.path.join(data_dir, "filtered_data_labeled.tsv")  # metadata after the file check
audio_dir = os.path.join(data_dir, "filtered_clips")  # folder holding the audio clips

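Remove records whose audio file is missing from the clips folder, and save the filtered metadata so this check only runs once. Files that exist but cannot be decoded are not detected here; they are skipped later, when the audio is loaded.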

In [None]:
if not os.path.exists(filtered_metadata_path):
  df = pd.read_csv(original_metadata_path, sep='\t')

  # One flag per record: is the clip it points to actually present on disk?
  file_found = np.array(
    [
      os.path.exists(os.path.join(audio_dir, clip_path))
      for clip_path in tqdm(df['path'], total=len(df), desc="Checking files")
    ],
    dtype=bool,
  )

  # Keep only the records whose audio file was found.
  df = df.loc[file_found]

  # Write the surviving records to the filtered TSV; the guard above then
  # skips this whole check on later runs.
  df.to_csv(filtered_metadata_path, sep='\t', index=False)

In [None]:
# Read the filtered metadata and discard the columns that are not needed.
# errors='ignore' tolerates any of these columns being absent from the file.
unused_columns = ['client_id', 'sentence', 'age', 'gender', 'accent']
df = pd.read_csv(filtered_metadata_path, sep='\t').drop(columns=unused_columns, errors='ignore')
df.head()

In [None]:
import matplotlib.pyplot as plt

# Heatmap (2D histogram) of up-votes against down-votes, limited to the 0-3 range
# on both axes.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.hist2d(
    df['up_votes'],
    df['down_votes'],
    bins=(10, 10),
    range=[(0, 3), (0, 3)],
    cmap='viridis',
)[3]  # hist2d returns (counts, x_edges, y_edges, image); the image drives the colorbar
fig.colorbar(heatmap, ax=ax, label='Frequency')

ax.set_title("2D Histogram of Upvotes vs Downvotes")
ax.set_xlabel("Upvotes")
ax.set_ylabel("Downvotes")
fig.tight_layout()

plt.show()


Loading training data: draw a class-balanced sample of the metadata and load the corresponding audio files.

In [None]:
# Target number of clips per class. The target is capped at the size of the
# rarest class, because DataFrame.sample draws without replacement and raises
# when asked for more rows than a class contains.
samples = 2000
class_counts = df['label'].value_counts().reindex(range(4), fill_value=0)
if class_counts.min() == 0:
    raise ValueError(
        f"Every class in range(4) needs at least one clip; counts: {class_counts.to_dict()}"
    )
samples = min(samples, int(class_counts.min()))

# Draw the same number of rows from every class with a fixed seed, then
# concatenate once instead of growing a DataFrame inside the loop.
balanced_samples = pd.concat(
    [df[df['label'] == cls].sample(n=samples, random_state=42) for cls in range(4)]
)

def load_audio_file(file_path):
    """Load one audio file with librosa.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The ``(audio, sample_rate)`` pair produced by ``librosa.load``, or
        ``(None, None)`` when loading raises, so the caller can skip the file.
    """
    try:
        return librosa.load(file_path)
    except Exception:
        # Best effort: an unreadable clip is skipped (and counted by the caller)
        # instead of aborting the whole run.
        return None, None

X, Y = [], []
error = 0  # number of clips that could not be loaded
for row in tqdm(
    balanced_samples.itertuples(),
    total=len(balanced_samples),  # itertuples() has no length, so give tqdm the total
    desc="Loading audio files",
):
    audio, sr = load_audio_file(os.path.join(audio_dir, row.path))

    if audio is None:
        error += 1
    else:
        X.append(Audio(audio, sr))
        Y.append(row.label)

# Report skipped files: they silently shrink the dataset and can unbalance it.
if error:
    print(f"Skipped {error} of {len(balanced_samples)} audio files that could not be loaded")


In [None]:
# Stratified 80/20 split with a fixed seed, so the label proportions are kept
# and the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)

### Pipeline

In [None]:
# Build each stage, then chain them in this order:
# noise reduction -> silence removal -> MFCC features -> SVM classifier.
noise_reducer = noise_reduction.NoiseReducer()
silence_remover = silence_removal.SilenceRemover()
mfcc = MFCC()
svm = SVM()

stages = (noise_reducer, silence_remover, mfcc, svm)
pipe = make_pipeline(*stages, verbose=True)  # verbose=True prints the time taken to fit each step
pipe

In [None]:
# Fit on the training split, then score the predictions for the test split.
pipe = pipe.fit(X_train, np.array(y_train))
y_pred = pipe.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
for heading, report in (
    ("\nClassification Report:", classification_report(y_test, y_pred)),
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
):
    print(heading)
    print(report)

In [None]:
# Display the pipeline's notebook representation (this cell follows the fit cell).
pipe

In [None]:
from joblib import dump
import time
import os
import json

# Each run gets its own folder, named after the current Unix time in whole seconds.
model_folder = f'trials/model_{int(time.time())}'
os.makedirs(model_folder, exist_ok=True)

# Serialise the pipeline object into that folder.
model_path = os.path.join(model_folder, 'model.joblib')
dump(pipe, model_path)

# Gather the test-split metrics as plain Python types so they can be written as JSON.
evaluation = dict(
  accuracy=float(accuracy_score(y_test, y_pred)),
  classification_report=classification_report(y_test, y_pred, output_dict=True),
  confusion_matrix=confusion_matrix(y_test, y_pred).tolist(),
)

# Store the metrics next to the model file.
eval_path = os.path.join(model_folder, 'evaluation.json')
with open(eval_path, 'w') as f:
  json.dump(evaluation, f, indent=4)

print(f"Model and evaluation saved in folder: {model_folder}")

In [None]:
from joblib import load

# Reload the pipeline from the path the save cell just wrote. The previous
# hard-coded file name did not match the `trials/model_<timestamp>/model.joblib`
# layout the save cell produces, so this cell failed on a fresh run.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
loaded_pipeline = load(model_path)