In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from preprocessing.noise_reduction import NoiseReducer
from preprocessing.silence_removal import SilenceRemover
from feature_extraction.mfcc import MFCC
from feature_extraction.fundamental_frequency import FundamentalFrequency
from feature_extraction.jitter import Jitter
from feature_extraction.hfcc import HFCC
from models.svm import SVM
from audio import Audio

# Dataset locations, all resolved relative to the notebook's working directory.
_data_root = os.path.join(".", "data")
original_metadata_path = os.path.join(_data_root, "original_data_labeled.tsv")  # raw labeled metadata
filtered_metadata_path = os.path.join(_data_root, "filtered_data_labeled.tsv")  # cleaned metadata (cache)
audio_dir = os.path.join(_data_root, "filtered_clips")  # directory containing the audio clips

### Clean and Load Dataset

Remove records of erroneous data (e.g. missing or corrupted audio files) from the dataset.

In [None]:
if os.path.exists(filtered_metadata_path):
    # A filtered metadata file was written by an earlier run: reuse it.
    df = pd.read_csv(filtered_metadata_path, sep='\t')
else:
    df = pd.read_csv(original_metadata_path, sep='\t')

    # Remove unnecessary columns (columns that are absent are silently skipped)
    unused_columns = ['client_id', 'sentence', 'age', 'gender', 'accent']
    df = df.drop(columns=unused_columns, errors='ignore')

    # Keep only the rows that carry a label
    df = df[df['label'].notna()]

    # error_file_paths.txt holds one path per line; only the filename portion
    # is compared against the 'path' column.
    with open('error_file_paths.txt', 'r') as f:
        error_file_paths = [os.path.basename(line) for line in f.read().splitlines()]

    # Exclude every record whose file is listed as erroneous
    df = df[~df['path'].isin(error_file_paths)]

    # Keep only the records whose audio file actually exists in audio_dir
    valid_indices = [
        idx
        for idx, filename in tqdm(df['path'].items(), total=len(df), desc="Checking files")
        if os.path.exists(os.path.join(audio_dir, filename))
    ]
    df = df.loc[valid_indices]

    # Persist the filtered metadata so later runs take the fast path above
    df.to_csv(filtered_metadata_path, sep='\t', index=False)

### Visualizations

In [None]:
# Distinct label values in ascending order; they are also used as the x-axis ticks.
labels = sorted(df['label'].unique())

# Unit-wide bins whose edges are offset by 0.5, so each bar is centered on a label value.
lowest, highest = min(labels), max(labels)
bin_edges = np.arange(lowest - 0.5, highest + 1.5, 1)

fig, ax = plt.subplots()
ax.hist(df['label'], bins=bin_edges, rwidth=0.8)
ax.set_xlabel('Labels')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Labels')
ax.set_xticks(labels)  # Tick only at the labels that actually occur
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Heatmap (2D histogram) of up-votes against down-votes, restricted to the
# 0-100 range on both axes with 100 bins per axis.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.hist2d(
    df['up_votes'],
    df['down_votes'],
    bins=(100, 100),
    range=[(0, 100), (0, 100)],
    cmap='viridis',
    cmin=1,      # bins with a count below cmin are left blank
    cmax=2000,   # bins with a count above cmax are left blank
)[-1]  # hist2d returns (counts, xedges, yedges, image); the image drives the colorbar
fig.colorbar(heatmap, ax=ax, label='Frequency')

ax.set_title("2D Histogram of Upvotes vs Downvotes")
ax.set_xlabel("Upvotes")
ax.set_ylabel("Downvotes")
fig.tight_layout()

plt.show()

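### Loading Training Data

Draw the same number of randomly sampled records from every label so that the classes are balanced.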

In [None]:
# Number of records drawn per class. Every class must contain at least this
# many rows, otherwise DataFrame.sample raises ValueError (sampling is done
# without replacement). df['label'].value_counts().min() is the largest value
# that is guaranteed to work for the current dataset.
samples = 100

# Sample each class independently with a fixed seed, then concatenate ONCE.
# Growing a DataFrame with pd.concat inside the loop re-copies all previously
# collected rows on every iteration, and seeding it with an empty
# pd.DataFrame() relies on pandas' special handling of empty concat entries.
per_class_samples = [
    df[df['label'] == cls].sample(n=samples, random_state=42)  # Random sampling
    for cls in df['label'].unique()
]

# pd.concat rejects an empty list; fall back to an empty frame in that case,
# which is what the loop-based version produced when there were no labels.
balanced_samples = pd.concat(per_class_samples) if per_class_samples else pd.DataFrame()

### Pipeline

In [None]:
# Preprocessing stages
noise_reducer = NoiseReducer()
silence_remover = SilenceRemover()

# Feature extractors
fundamental_freq = FundamentalFrequency()
jitter = Jitter()
hfcc = HFCC()
mfcc = MFCC()

# Classifier
svm = SVM()

# (name, transformer) pairs handed to FeatureUnion, which concatenates the
# transformers' outputs in exactly this order.
feature_extractors = [
    ('mfcc', mfcc),
    ('fundamental_freq', fundamental_freq),
    ('jitter', jitter),
    ('hfcc', hfcc),
]
feature_union = FeatureUnion(feature_extractors)

In [None]:
# Feature-extraction pipeline: the two preprocessing stages followed by the
# union of feature extractors. verbose=True is passed through to the Pipeline.
preprocessing_steps = (noise_reducer, silence_remover)
feature_pipe = make_pipeline(*preprocessing_steps, feature_union, verbose=True)
feature_pipe

def process_in_chunks(df, pipeline, chunk_size=100):
    """Extract features for the audio files listed in ``df``, one chunk at a time.

    For every run of ``chunk_size`` consecutive rows, the files named in the
    'path' column are loaded from the module-level ``audio_dir`` (``sr=None`` is
    passed to ``librosa.load``), wrapped in ``Audio`` objects and handed to
    ``pipeline.transform``. The pipeline is only transformed with, never fitted.

    Args:
        df: DataFrame with a 'path' column (file names relative to
            ``audio_dir``) and a 'label' column.
        pipeline: Object exposing ``transform(list_of_Audio)``.
        chunk_size: Number of rows loaded and transformed per iteration.

    Returns:
        Tuple ``(X, y)``: the per-chunk feature arrays and the per-chunk label
        arrays, each concatenated along axis 0.

    Note:
        With an empty ``df`` no chunk is produced, so the final
        ``np.concatenate`` receives an empty list and raises ``ValueError``.
    """
    feature_blocks, label_blocks = [], []

    chunk_starts = range(0, len(df), chunk_size)
    for start in tqdm(chunk_starts, desc="Processing chunks"):
        batch = df.iloc[start:start + chunk_size]

        # librosa.load returns (samples, sampling_rate), which is exactly the
        # positional argument order Audio is called with.
        clips = [
            Audio(*librosa.load(os.path.join(audio_dir, name), sr=None))
            for name in batch['path'].tolist()
        ]
        batch_labels = batch['label'].values

        feature_blocks.append(np.array(pipeline.transform(clips)))
        label_blocks.append(batch_labels)

    return np.concatenate(feature_blocks, axis=0), np.concatenate(label_blocks, axis=0)

# Hold out 20% of the balanced records for testing (fixed seed for repeatability).
# NOTE(review): the split is not stratified, so the per-class counts in each
# split may differ slightly even though balanced_samples is balanced.
df_train, df_test = train_test_split(balanced_samples, test_size=0.2, random_state=42)

# Extract features for the training split first, then for the test split.
(X_train, y_train), (X_test, y_test) = (
    process_in_chunks(split_df, feature_pipe) for split_df in (df_train, df_test)
)

In [None]:
# Fit the classifier on the extracted training features, then predict the test split.
model_pipe = make_pipeline(svm, verbose=True).fit(X_train, np.array(y_train))
y_pred = model_pipe.predict(X_test)

# Report accuracy, then the per-class report and the confusion matrix,
# each of the latter two under its own heading.
print("Accuracy:", accuracy_score(y_test, y_pred))
report_sections = (
    ("\nClassification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
)
for heading, metric in report_sections:
    print(heading)
    print(metric(y_test, y_pred))


In [None]:
from joblib import dump
import time
import os
import json

# Create a folder for this trial, named after the current local time
timestamp = time.strftime("%d_%m_%Y_T%H_%M_%S")
model_folder = f'trials/model_{timestamp}'
os.makedirs(model_folder, exist_ok=True)

# Save the fitted model pipeline.
# The notebook never defines a variable called `pipe`; the fitted estimator is
# `model_pipe`. It was trained on the output of `feature_pipe`, so inputs must
# be passed through `feature_pipe.transform` before calling the loaded model.
model_path = os.path.join(model_folder, 'model.joblib')
dump(model_pipe, model_path)

# Save evaluation metrics
evaluation = {
  # Step names of the feature-extraction pipeline followed by the model pipeline,
  # i.e. everything that produced the metrics below.
  "architecture": str([step[0] for step in feature_pipe.steps + model_pipe.steps]),
  "samples_per_class": samples,
  "accuracy": float(accuracy_score(y_test, y_pred)),
  "classification_report": classification_report(y_test, y_pred, output_dict=True),
  "confusion_matrix": confusion_matrix(y_test, y_pred).tolist()
}

# Save evaluation as JSON
eval_path = os.path.join(model_folder, 'evaluation.json')
with open(eval_path, 'w') as f:
  json.dump(evaluation, f, indent=4)

print(f"Model and evaluation saved in folder: {model_folder}")