In [39]:
import os
import numpy as np
import pandas as pd
from pydub import AudioSegment
import noisereduce as nr
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
import time
from joblib import dump, load

# Step 1: Convert MP3 to WAV
def mp3_to_wav(input_path, output_path):
    audio = AudioSegment.from_mp3(input_path)
    audio.export(output_path, format="wav")

# Step 2: Preprocessing with Silence Removal
def preprocess_audio(file_path, sr=48000):
    # Load audio
    y, sr = librosa.load(file_path, sr=sr)
    
    # Remove silence
    reduced_noise = nr.reduce_noise(y=y, sr=sr, stationary=True)
    
    # Trim silent parts
    y_trimmed, _ = librosa.effects.trim(reduced_noise, top_db=20)
    
    return y_trimmed, sr

# Step 3: Feature Extraction (46 features)
def extract_features(y, sr=48000):
    features = []
    
    # MFCCs (40 coefficients)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    features.extend(np.mean(mfccs, axis=1))
    
    # Spectral Features
    S = np.abs(librosa.stft(y))
    
    # Spectral Centroid
    centroid = librosa.feature.spectral_centroid(S=S)
    features.append(np.mean(centroid))
    
    # Spectral Bandwidth
    bandwidth = librosa.feature.spectral_bandwidth(S=S)
    features.append(np.mean(bandwidth))
    
    # Spectral Contrast
    contrast = librosa.feature.spectral_contrast(S=S)
    features.append(np.mean(contrast))
    
    # Spectral Roll-Off (85%)
    rolloff = librosa.feature.spectral_rolloff(S=S, roll_percent=0.85)
    features.append(np.mean(rolloff))
    
    # Spectral Flatness
    flatness = librosa.feature.spectral_flatness(y=y)
    features.append(np.mean(flatness))
    
    # Zero Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y)
    features.append(np.mean(zcr))
    
    return np.array(features)

def get_records_and_labels(metadata_path, audio_dir, num_per_label=-1):
    df = pd.read_csv(metadata_path, sep='\t')
    df = df[df['age'].notna() & df['gender'].notna() & df['label'].notna()]

    balanced_df = df

    num_files = sum(1 for entry in os.scandir(audio_dir) if entry.is_file())
    print(f"Number of files in directory: {num_files}")
    print(f"Number of records in DataFrame: {len(df)}")

    with open("error_file_paths.txt", "r") as f:
        error_file_paths = f.read().splitlines()
    valid_indices = []
    for idx, row in tqdm(balanced_df.iterrows(), total=len(balanced_df), desc="Checking files"):
        file_path = os.path.join(audio_dir, row['path'])
        if os.path.exists(file_path) and file_path not in error_file_paths and row['down_votes'] == 0:
            valid_indices.append(idx)

    balanced_df = balanced_df.loc[valid_indices]
    print(f"Records with existing files: {len(balanced_df)}")

    if num_per_label == -1:
        return balanced_df

    return balanced_df.groupby('label').apply(lambda x: x.sample(n=num_per_label, random_state=42) if len(x) >= num_per_label else x).reset_index(drop=True)

# Main Processing Pipeline
def process_dataset(df, input_audio_dir, output_audio_dir, output_csv):
    features_list = []
    labels = []
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
        filename = row['path']

        mp3_path = os.path.join(input_audio_dir, filename)
        wav_path = os.path.join(output_audio_dir, filename.replace(".mp3", ".wav"))
        mp3_to_wav(mp3_path, wav_path)
        
        # Preprocess
        y, sr = preprocess_audio(wav_path)
        
        # Extract features
        features = extract_features(y, sr)
        
        # Get label
        age_label = row['label']  
        
        features_list.append(features)
        labels.append(age_label)

        os.remove(wav_path)  # Remove WAV file after processing
    
    # Normalization
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features_list)
    save_model(scaler, filename="scaler.joblib")
    
    # Save to CSV
    df = pd.DataFrame(scaled_features)
    df['label'] = labels
    df.to_csv(output_csv, index=False)

def save_model(model, save_dir="models", filename="model.joblib"):
    # Create directory if needed
    os.makedirs(save_dir, exist_ok=True)
    
    dump(model, os.path.join(save_dir, filename))

    print(f"Models saved to {save_dir} directory")

def load_models(model_dir="models", model_names=["knn_model.joblib", "lp_model.joblib", "scaler.joblib"]):
    models = {}
    for model_name in model_names:
        model_path = os.path.join(model_dir, model_name)
        if os.path.exists(model_path):
            models[model_name] = load(model_path)
        else:
            print(f"Model {model_name} not found in {model_dir}.")
    return models

def predict_age(audio_path, models_dir="models", model_names=["knn_model.joblib", "lp_model.joblib", "scaler.joblib"]):
    # Load models and scaler
    knn_model, lp_model, scaler = load_models(models_dir, model_names)
    
    # Preprocess audio
    y, sr = preprocess_audio(audio_path)
    
    # Extract features
    features = extract_features(y, sr)
    
    # Scale features
    scaled_features = scaler.transform([features])
    
    # Make predictions
    knn_pred = knn_model.predict(scaled_features)
    lp_pred = lp_model.predict(scaled_features)
    
    return {
        "knn_prediction": knn_pred[0],
        "lp_prediction": lp_pred[0]
    }

# Classification
def train_and_evaluate(csv_path): 
    # Load dataset
    data = pd.read_csv(csv_path)
    X = data.drop('label', axis=1).values
    y = data['label'].values  

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42)
    
    # Initialize classifiers
    knn = KNeighborsClassifier(n_neighbors=5)
    lp = LabelPropagation(kernel='knn', gamma=0.1)
    # svm = SVC(kernel='rbf', C=1000, gamma=0.0001, random_state=42)
    
    # Train and evaluate
    for clf, name in [(knn, 'KNN'), (lp, 'Label Propagation')]:
        print(f"Training {name}...")
        start_train = time.perf_counter()
        clf.fit(X_train, y_train)
        train_duration = time.perf_counter() - start_train

        start_pred = time.perf_counter()
        y_pred = clf.predict(X_test)
        pred_duration = time.perf_counter() - start_pred

        acc = accuracy_score(y_test, y_pred)
        print(f"{name} Accuracy: {acc}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print(f"- Training time: {train_duration:.2f} seconds")
        print(f"- Prediction time: {pred_duration:.4f} seconds")

        save_model(clf, filename=f"{name}_model.joblib")

In [40]:
metadata_path = "..\\filtered_data_labeled.tsv"
audio_dir = "..\\filtered_clips"
output_audio_dir = "..\\wav_audio_dataset180000"
features_csv = "features80000_46_0downVotes.csv"

In [41]:
df = get_records_and_labels(metadata_path, audio_dir, num_per_label=20000)

Number of files in directory: 192727
Number of records in DataFrame: 222104


Checking files: 100%|██████████| 222104/222104 [00:59<00:00, 3727.26it/s]

Records with existing files: 152422



  return balanced_df.groupby('label').apply(lambda x: x.sample(n=num_per_label, random_state=42) if len(x) >= num_per_label else x).reset_index(drop=True)


In [29]:
all_df = get_records_and_labels(metadata_path, audio_dir)

Number of files in directory: 192727
Number of records in DataFrame: 222104


Checking files: 100%|██████████| 222104/222104 [01:01<00:00, 3601.04it/s]

Records with existing files: 181879





In [42]:
df.groupby('label').count()

Unnamed: 0_level_0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,20000,20000,20000,20000,20000,20000,20000,13942
1,18406,18406,18406,18406,18406,18406,18406,14948
2,17028,17028,17028,17028,17028,17028,17028,14459
3,13894,13894,13894,13894,13894,13894,13894,11357


In [38]:
# Filter records where up_votes > down_votes
filtered_df = all_df[all_df['down_votes'] == 1]

# Group by label and count
filtered_df.groupby('up_votes').count()

Unnamed: 0_level_0,client_id,path,sentence,down_votes,age,gender,accent,label
up_votes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,26055,26055,26055,26055,26055,26055,17144,26055
3,1574,1574,1574,1574,1574,1574,1170,1574
4,347,347,347,347,347,347,270,347
5,115,115,115,115,115,115,102,115
6,83,83,83,83,83,83,67,83
7,46,46,46,46,46,46,38,46
8,46,46,46,46,46,46,40,46
9,21,21,21,21,21,21,20,21
10,18,18,18,18,18,18,18,18
11,11,11,11,11,11,11,10,11


In [43]:
process_dataset(df, audio_dir, output_audio_dir, features_csv)

Extracting features:  61%|██████    | 42267/69328 [2:51:43<1:49:56,  4.10it/s]


KeyboardInterrupt: 

In [7]:
train_and_evaluate(features_csv)

Training KNN...
KNN Accuracy: 0.9093358258192215

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     12350
           1       0.88      0.83      0.85      2204
           2       0.87      0.70      0.78      1971
           3       0.84      0.82      0.83      1663

    accuracy                           0.91     18188
   macro avg       0.88      0.83      0.85     18188
weighted avg       0.91      0.91      0.91     18188

Confusion Matrix:
[[11964    84   204    98]
 [  224  1824     1   155]
 [  559    10  1389    13]
 [  150   149     2  1362]]
- Training time: 0.06 seconds
- Prediction time: 6.5666 seconds
Models saved to models directory
Training Label Propagation...
Label Propagation Accuracy: 0.9081812183857488

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95     12350
           1       0.89      0.82      0.85      2204
    