In [12]:
import os
import numpy as np
import pandas as pd
from pydub import AudioSegment
import noisereduce as nr
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
import time
from joblib import dump, load

# Step 1: Convert MP3 to WAV
def mp3_to_wav(input_path, output_path):
    audio = AudioSegment.from_mp3(input_path)
    audio.export(output_path, format="wav")

# Step 2: Preprocessing with Silence Removal
def preprocess_audio(file_path, sr=48000):
    # Load audio
    y, sr = librosa.load(file_path, sr=sr)
    
    # Remove silence
    reduced_noise = nr.reduce_noise(y=y, sr=sr, stationary=True)
    
    # Trim silent parts
    y_trimmed, _ = librosa.effects.trim(reduced_noise, top_db=20)
    
    return y_trimmed, sr

# Step 3: Feature Extraction (46 features)
def extract_features(y, sr=48000):
    features = []
    
    # MFCCs (40 coefficients)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    features.extend(np.mean(mfccs, axis=1))
    
    # Spectral Features
    S = np.abs(librosa.stft(y))
    
    # Spectral Centroid
    centroid = librosa.feature.spectral_centroid(S=S)
    features.append(np.mean(centroid))
    
    # Spectral Bandwidth
    bandwidth = librosa.feature.spectral_bandwidth(S=S)
    features.append(np.mean(bandwidth))
    
    # Spectral Contrast
    contrast = librosa.feature.spectral_contrast(S=S)
    features.extend(np.mean(contrast, axis=1))
    
    # Spectral Roll-Off (85%)
    rolloff = librosa.feature.spectral_rolloff(S=S, roll_percent=0.85)
    features.append(np.mean(rolloff))
    
    # Spectral Flatness
    flatness = librosa.feature.spectral_flatness(y=y)
    features.append(np.mean(flatness))
    
    # Zero Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y)
    features.append(np.mean(zcr))
    
    return np.array(features)

def get_records_and_labels(metadata_path, audio_dir, num_per_label=-1):
    df = pd.read_csv(metadata_path, sep='\t')
    df = df[df['age'].notna() & df['gender'].notna() & df['label'].notna()]

    balanced_df = df

    num_files = sum(1 for entry in os.scandir(audio_dir) if entry.is_file())
    print(f"Number of files in directory: {num_files}")
    print(f"Number of records in DataFrame: {len(df)}")

    with open("error_file_paths.txt", "r") as f:
        error_file_paths = f.read().splitlines()
    valid_indices = []
    for idx, row in tqdm(balanced_df.iterrows(), total=len(balanced_df), desc="Checking files"):
        file_path = os.path.join(audio_dir, row['path'])
        if os.path.exists(file_path) and file_path not in error_file_paths and row['down_votes'] == 0: 
            valid_indices.append(idx)

    balanced_df = balanced_df.loc[valid_indices]
    print(f"Records with existing files: {len(balanced_df)}")

    if num_per_label == -1:
        return balanced_df

    return balanced_df.groupby('label').apply(lambda x: x.sample(n=num_per_label, random_state=42) if len(x) >= num_per_label else x).reset_index(drop=True)

# Main Processing Pipeline
def process_dataset(df, input_audio_dir, output_audio_dir, output_csv):
    features_list = []
    labels = []
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        filename = row['path']

        mp3_path = os.path.join(input_audio_dir, filename)
        wav_path = os.path.join(output_audio_dir, filename.replace(".mp3", ".wav"))
        mp3_to_wav(mp3_path, wav_path)
        
        # Preprocess
        y, sr = preprocess_audio(wav_path)
        
        # Extract features
        features = extract_features(y, sr)
        
        # Get label
        age_label = row['label']  
        
        features_list.append(features)
        labels.append(age_label)
    
    # Normalization
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features_list)
    save_model(scaler, filename="scaler.joblib")
    
    # Save to CSV
    df = pd.DataFrame(scaled_features)
    df['label'] = labels
    df.to_csv(output_csv, index=False)

def save_model(model, save_dir="models", filename="model.joblib"):
    # Create directory if needed
    os.makedirs(save_dir, exist_ok=True)
    
    dump(model, os.path.join(save_dir, filename))

    print(f"Models saved to {save_dir} directory")

def load_models(model_dir="models", model_names=["knn_model.joblib", "lp_model.joblib", "scaler.joblib"]):
    models = {}
    for model_name in model_names:
        model_path = os.path.join(model_dir, model_name)
        if os.path.exists(model_path):
            models[model_name] = load(model_path)
        else:
            print(f"Model {model_name} not found in {model_dir}.")
    return models

def predict_age(audio_path, models_dir="models", model_names=["knn_model.joblib", "lp_model.joblib", "scaler.joblib"]):
    # Load models and scaler
    knn_model, lp_model, scaler = load_models(models_dir, model_names)
    
    # Preprocess audio
    y, sr = preprocess_audio(audio_path)
    
    # Extract features
    features = extract_features(y, sr)
    
    # Scale features
    scaled_features = scaler.transform([features])
    
    # Make predictions
    knn_pred = knn_model.predict(scaled_features)
    lp_pred = lp_model.predict(scaled_features)
    
    return {
        "knn_prediction": knn_pred[0],
        "lp_prediction": lp_pred[0]
    }

# Classification
def train_and_evaluate(csv_path): 
    # Load dataset
    data = pd.read_csv(csv_path)
    X = data.drop('label', axis=1).values
    y = data['label'].values  

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42)
    
    # Initialize classifiers
    knn = KNeighborsClassifier(n_neighbors=5)
    lp = LabelPropagation(kernel='knn', gamma=0.1)
    
    # Train and evaluate
    for clf, name in [(knn, 'KNN'), (lp, 'Label Propagation')]:
        print(f"Training {name}...")
        start_train = time.perf_counter()
        with tqdm(total=1, desc=f"Training {name}", leave=False) as pbar:
            clf.fit(X_train, y_train)
            pbar.update(1)
        train_duration = time.perf_counter() - start_train

        start_pred = time.perf_counter()
        with tqdm(total=1, desc=f"Predicting {name}", leave=False) as pbar:
            y_pred = clf.predict(X_test)
            pbar.update(1)
        pred_duration = time.perf_counter() - start_pred

        acc = accuracy_score(y_test, y_pred)
        print(f"{name} Accuracy: {acc}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print(f"- Training time: {train_duration:.2f} seconds")
        print(f"- Prediction time: {pred_duration:.4f} seconds")

        save_model(clf, filename=f"{name}_model.joblib")

In [14]:
metadata_path = "..\\filtered_data_labeled.tsv"
audio_dir = "..\\filtered_clips"
output_audio_dir = "..\\wav_audio_dataset8000"
features_csv = "features8000.csv"

In [16]:
df = get_records_and_labels(metadata_path, audio_dir, num_per_label=2000)

Number of files in directory: 192727
Number of records in DataFrame: 222104


Checking files: 100%|██████████| 222104/222104 [01:02<00:00, 3538.01it/s]

Records with existing files: 152422



  return balanced_df.groupby('label').apply(lambda x: x.sample(n=num_per_label, random_state=42) if len(x) >= num_per_label else x).reset_index(drop=True)


In [None]:
process_dataset(df, audio_dir, output_audio_dir, features_csv)

Extracting features:   0%|          | 0/8000 [00:00<?, ?it/s]

In [None]:
train_and_evaluate(features_csv)

Training KNN...


                                                             

KNN Accuracy: 0.8282362246243079

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.72      0.76      1964
           1       0.89      0.85      0.87      2011
           2       0.80      0.87      0.83      1978
           3       0.81      0.88      0.85      1633

    accuracy                           0.83      7586
   macro avg       0.83      0.83      0.83      7586
weighted avg       0.83      0.83      0.83      7586

Confusion Matrix:
[[1409   60  397   98]
 [  62 1715   20  214]
 [ 218   19 1715   26]
 [  37  133   19 1444]]
- Training time: 0.01 seconds
- Prediction time: 1.0868 seconds
Models saved to models directory
Training Label Propagation...


                                                                           

Label Propagation Accuracy: 0.8253361455312418

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.72      0.76      1964
           1       0.89      0.85      0.87      2011
           2       0.80      0.86      0.83      1978
           3       0.80      0.88      0.84      1633

    accuracy                           0.83      7586
   macro avg       0.83      0.83      0.82      7586
weighted avg       0.83      0.83      0.82      7586

Confusion Matrix:
[[1410   53  397  104]
 [  61 1714   23  213]
 [ 227   14 1701   36]
 [  36  144   17 1436]]
- Training time: 8.47 seconds
- Prediction time: 1.0511 seconds
Models saved to models directory


