In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import os
import joblib
from tqdm import tqdm
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from preprocessing.noise_reduction import NoiseReducer
from preprocessing.silence_removal import SilenceRemover
from preprocessing.speech_filter import SpeechFilter
from preprocessing.identity import Identity


from audio import Audio

# Everything this notebook reads lives under ./data (relative to the working directory).
data_root = os.path.join(".", "data")

# Labelled metadata tables (before / after filtering) and the folder of filtered clips.
original_metadata_path = os.path.join(data_root, "original_data_labeled.tsv")
filtered_metadata_path = os.path.join(data_root, "filtered_data_labeled.tsv")
audio_dir = os.path.join(data_root, "filtered_clips")

### Pipeline

In [2]:

# folder_path = 'trials/features/all_classes_155'
# X_train_loaded = joblib.load(os.path.join(folder_path, 'X_train.joblib'))
# X_test_loaded = joblib.load(os.path.join(folder_path, 'X_test.joblib'))
# y_train_loaded = joblib.load(os.path.join(folder_path, 'y_train.joblib'))
# y_test_loaded = joblib.load(os.path.join(folder_path, 'y_test.joblib'))

# print(X_train_loaded.shape, X_test_loaded.shape, y_train_loaded.shape, y_test_loaded.shape)

In [3]:
def load_and_concatenate_features(features, main_folder='extracted_features'):
    """
    Load per-feature train/test matrices from one folder and join them column-wise.

    For every name in ``features`` the files ``<name>_train.joblib`` and
    ``<name>_test.joblib`` are read from ``main_folder``. The labels are read
    from ``y_train.joblib`` and ``y_test.joblib`` in the same folder.

    Args:
        features (list): Feature names. Their order fixes the column order of
            the combined matrices.
        main_folder (str): Directory containing all feature and label files.

    Returns:
        tuple: (X_train_combined, X_test_combined, y_train, y_test)

    Raises:
        ValueError: If ``features`` is empty, or if the train or test file of
            any requested feature does not exist.
    """
    feature_names = list(features)
    if not feature_names:
        # Fail with a clear message instead of numpy's
        # "need at least one array to concatenate".
        raise ValueError("No features requested: pass at least one feature name")

    # Check every file up front so a typo in the last feature name is reported
    # before any (potentially large) array has been loaded.
    file_pairs = []
    for feature in feature_names:
        train_path = os.path.join(main_folder, f"{feature}_train.joblib")
        test_path = os.path.join(main_folder, f"{feature}_test.joblib")
        if not os.path.exists(train_path) or not os.path.exists(test_path):
            raise ValueError(f"Missing train or test file for feature: {feature}")
        file_pairs.append((feature, train_path, test_path))

    # NOTE: joblib.load unpickles its input; only point this at trusted files.
    # Labels are loaded once since they are shared by all features.
    y_train = joblib.load(os.path.join(main_folder, 'y_train.joblib'))
    y_test = joblib.load(os.path.join(main_folder, 'y_test.joblib'))

    X_train_arrays = []
    X_test_arrays = []
    for feature, train_path, test_path in file_pairs:
        X_train = joblib.load(train_path)
        X_test = joblib.load(test_path)

        # Scalar-per-sample features are stored 1-D; promote them to a single
        # column so they can be concatenated along axis 1. Train and test are
        # checked independently so a mismatch in how they were saved cannot
        # break the concatenation.
        if X_train.ndim == 1:
            X_train = X_train.reshape(X_train.shape[0], -1)
            print(f"Reshaped X_train for {feature}: {X_train.shape}")
        if X_test.ndim == 1:
            X_test = X_test.reshape(X_test.shape[0], -1)

        X_train_arrays.append(X_train)
        X_test_arrays.append(X_test)

    # Concatenate along the feature axis (axis=1).
    X_train_combined = np.concatenate(X_train_arrays, axis=1)
    X_test_combined = np.concatenate(X_test_arrays, axis=1)

    return X_train_combined, X_test_combined, y_train, y_test

In [4]:
# Feature groups to load; their order determines the column layout of the
# combined matrices.
feature_names = [
    'mfcc150', 'hfcc150', 'cpps', 'fO_1', 'jitter', 'pitch_range_1',
    'pitch_mean_min_max_freq_3', 'alpha_ratio1', 'spectral5',
]
x_train_combined, x_test_combined, y_train, y_test = load_and_concatenate_features(
    feature_names,
    main_folder='trials/features/all_classes_7_ft_splitted',
)

# Report the shape of each loaded array as a quick sanity check.
for label, arr in (
    ("x_train_combined", x_train_combined),
    ("x_test_combined", x_test_combined),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {arr.shape}")

Reshaped X_train for alpha_ratio1: (145504, 1)
x_train_combined shape: (145504, 313)
x_test_combined shape: (36376, 313)
y_train shape: (145504,)
y_test shape: (36376,)


In [5]:
# The feature matrices are used unchanged; only the labels are transformed.
X_train_loaded, X_test_loaded = x_train_combined, x_test_combined

# Reduce every label to its value modulo 2, giving a two-class target.
# NOTE(review): presumably this maps the multi-class ids onto a binary gender
# label (the report written later is named "gender_only") - confirm the encoding.
y_train_loaded, y_test_loaded = y_train % 2, y_test % 2

print(*(arr.shape for arr in (X_train_loaded, X_test_loaded, y_train_loaded, y_test_loaded)))

(145504, 313) (36376, 313) (145504,) (36376,)


In [6]:
from itertools import combinations
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

def find_best_feature_combination(X, y, feature_indices, test_size=0.2, random_state=42, n_jobs=-1):
    """
    Exhaustively score every non-empty subset of feature groups and report the best.

    Each feature group is a ``(start, end)`` column slice of ``X``. For every
    subset (consecutive or not) the selected columns are stacked, an
    ExtraTreesClassifier is fitted on a train split and scored on the held-out
    split, and the accuracy is recorded.

    The train/test row split is computed once and reused for every subset: with
    a fixed ``random_state`` it depends only on the number of rows, so this is
    equivalent to re-splitting each stacked matrix but avoids repeated work.

    Args:
        X (numpy.ndarray): Feature matrix, one row per sample.
        y (numpy.ndarray): Labels, one per row of ``X``.
        feature_indices (list of tuples): (start, end) column range of each feature group.
        test_size (float): Fraction of rows held out for scoring.
        random_state (int): Seed used for both the split and the classifier.
        n_jobs (int or None): Passed to ExtraTreesClassifier; -1 uses all cores.
            The cost grows as 2**len(feature_indices) fits, so parallel tree
            building matters here.

    Returns:
        dict: ``best_combination`` (tuple of (start, end) pairs, or None when
        ``feature_indices`` is empty), ``best_accuracy``, ``best_features``
        (the columns of ``X`` for the best combination, or None) and
        ``all_accuracies`` (list of {"combination", "accuracy"} dicts in
        evaluation order).
    """
    print("Generating all possible feature combinations...")
    print(f"Feature indices: {feature_indices}")
    n_groups = len(feature_indices)
    # The number of non-empty subsets of n groups is 2**n - 1; there is no need
    # to materialise every combination just to count them.
    total_combinations = 2 ** n_groups - 1
    print(f"Total combinations to evaluate: {total_combinations}")

    best_accuracy = 0
    best_combination = None
    all_accuracies = []

    if n_groups:
        # Split row indices once; every combination is scored on the same rows.
        row_ids = np.arange(X.shape[0])
        train_rows, test_rows, y_train, y_test = train_test_split(
            row_ids, y, test_size=test_size, random_state=random_state
        )
        X_train_full = X[train_rows]
        X_test_full = X[test_rows]

    with tqdm(total=total_combinations, desc="Evaluating feature combinations") as pbar:
        for r in range(1, n_groups + 1):
            for combination in combinations(feature_indices, r):
                # Stack the column ranges of the current combination.
                X_train = np.hstack([X_train_full[:, start:end] for start, end in combination])
                X_test = np.hstack([X_test_full[:, start:end] for start, end in combination])

                clf = ExtraTreesClassifier(random_state=random_state, n_jobs=n_jobs)
                clf.fit(X_train, y_train)
                accuracy = accuracy_score(y_test, clf.predict(X_test))

                all_accuracies.append({"combination": combination, "accuracy": accuracy})

                # The first evaluated combination always becomes the incumbent,
                # so a run where every score is 0 still reports a winner.
                # Ties keep the earlier (smaller) combination.
                if best_combination is None or accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_combination = combination

                pbar.update(1)

    # Build the winning matrix once at the end instead of holding a full copy
    # every time the incumbent changes.
    best_features = None
    if best_combination is not None:
        best_features = np.hstack([X[:, start:end] for start, end in best_combination])

    return {
        "best_combination": best_combination,
        "best_accuracy": best_accuracy,
        "best_features": best_features,
        "all_accuracies": all_accuracies
    }

# Column range (start, end) of each feature group inside the combined matrix,
# keyed by the short name used when reporting results.
mapping = {
    "mfcc150": (0, 150),
    "hfcc150": (150, 300),
    "cpps": (300, 301),
    "fO": (301, 302),
    "jitter": (302, 303),
    "pitch_range_1": (303, 304),
    "mean_min_max_freq": (304, 307),
    "alpha_ratio1": (307, 308),
    "spectral5": (308, 313)
}

# The search only needs the bare column ranges, in the same order as the mapping.
feature_indices = list(mapping.values())

result = find_best_feature_combination(X_train_loaded, y_train_loaded, feature_indices)

# Summarise the search: the winner, its score, then every evaluated combination.
for title, key in (
    ("Best Combination:", "best_combination"),
    ("Best Accuracy:", "best_accuracy"),
    ("All Accuracies:", "all_accuracies"),
):
    print(title, result[key])


Generating all possible feature combinations...
Feature indices: [(0, 150), (150, 300), (300, 301), (301, 302), (302, 303), (303, 304), (304, 307), (307, 308), (308, 313)]
Total combinations to evaluate: 511


Evaluating feature combinations: 100%|██████████| 511/511 [5:33:25<00:00, 39.15s/it]  

Best Combination: ((0, 150), (150, 300), (302, 303), (304, 307), (308, 313))
Best Accuracy: 0.9588330297927906
All Accuracies: [{'combination': ((0, 150),), 'accuracy': 0.9469090409264287}, {'combination': ((150, 300),), 'accuracy': 0.9341947012130167}, {'combination': ((300, 301),), 'accuracy': 0.6683619119617883}, {'combination': ((301, 302),), 'accuracy': 0.684581285866465}, {'combination': ((302, 303),), 'accuracy': 0.7067798357444761}, {'combination': ((303, 304),), 'accuracy': 0.717569842960723}, {'combination': ((304, 307),), 'accuracy': 0.8091130888972887}, {'combination': ((307, 308),), 'accuracy': 0.6713171368681489}, {'combination': ((308, 313),), 'accuracy': 0.7934435242775162}, {'combination': ((0, 150), (150, 300)), 'accuracy': 0.957595958901756}, {'combination': ((0, 150), (300, 301)), 'accuracy': 0.9467715886052026}, {'combination': ((0, 150), (301, 302)), 'accuracy': 0.9466341362839765}, {'combination': ((0, 150), (302, 303)), 'accuracy': 0.9464279578021374}, {'combina




In [None]:
# Rank every evaluated combination from most to least accurate.
sorted_accuracies = sorted(result["all_accuracies"], key=lambda x: x["accuracy"], reverse=True)


def _combination_label(combination):
    """Return the comma-separated feature names whose column ranges occur in ``combination``."""
    return ", ".join(name for name, indices in mapping.items() if indices in combination)


# Resolve the readable label once per entry; both the report file and the
# on-screen listing use the same (label, accuracy) pairs.
labelled_accuracies = [
    (_combination_label(entry["combination"]), entry["accuracy"])
    for entry in sorted_accuracies
]

report_path = "statistics/sorted_accuracies_gender_only2.txt"
# Create the output folder if needed so a missing directory cannot discard the
# results of the (very long) search above.
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Horizontal border shared by the header and the footer of the table.
border = "|" + "=" * 100 + "|" + "=" * 12 + "|\n"

with open(report_path, "w", encoding="utf-8") as f:
    # Header row framed by borders.
    f.write(border)
    f.write(f"| {'Combination':<98} | {'Accuracy':<10} |\n")
    f.write(border)

    # One row per combination, best first.
    for label, accuracy in labelled_accuracies:
        f.write(f"| {label:<98} | {accuracy:<10.6f} |\n")
    f.write(border)

# Display the sorted combinations with their accuracies.
for label, accuracy in labelled_accuracies:
    print(f"Combination: {label}, Accuracy: {accuracy}")

Combination: mfcc150, hfcc150, jitter, mean_min_max_freq, spectral5, Accuracy: 0.9588330297927906
Combination: mfcc150, hfcc150, fO, pitch_range_1, mean_min_max_freq, alpha_ratio1, spectral5, Accuracy: 0.9588330297927906
Combination: mfcc150, hfcc150, cpps, jitter, spectral5, Accuracy: 0.9587643036321776
Combination: mfcc150, hfcc150, jitter, pitch_range_1, mean_min_max_freq, spectral5, Accuracy: 0.9587643036321776
Combination: mfcc150, hfcc150, cpps, fO, pitch_range_1, alpha_ratio1, Accuracy: 0.9586955774715645
Combination: mfcc150, hfcc150, cpps, fO, jitter, pitch_range_1, mean_min_max_freq, Accuracy: 0.9586612143912581
Combination: mfcc150, hfcc150, cpps, pitch_range_1, alpha_ratio1, Accuracy: 0.9586268513109515
Combination: mfcc150, hfcc150, cpps, jitter, pitch_range_1, mean_min_max_freq, alpha_ratio1, Accuracy: 0.9586268513109515
Combination: mfcc150, hfcc150, cpps, pitch_range_1, mean_min_max_freq, alpha_ratio1, spectral5, Accuracy: 0.9586268513109515
Combination: mfcc150, hfcc15

In [8]:
from playsound import playsound

def play_sound(file_path, repeat=2):
    """
    Play a sound file one or more times in a row.

    Playback is best-effort: any error raised while playing is caught and
    reported with a printed message instead of propagating.

    Args:
        file_path (str): Path to the sound file.
        repeat (int): How many times to play the file back to back. Defaults
            to 2, which matches the original behaviour of playing it twice.
    """
    try:
        for _ in range(repeat):
            playsound(file_path)
    except Exception as e:
        print(f"Error playing sound: {e}")

# Audible notification; presumably signals that the long-running cells above
# have finished (the clip is named "done") - confirm the relative path resolves
# from the notebook's working directory. play_sound only prints a message if
# playback fails, so this line cannot abort the notebook.
play_sound("../sound/done.mp3")