In [4]:
import os
import librosa
import numpy as np
import pandas as pd

# === Settings ===
filler_dir = "E:\\FineTuned Filler Model\\DSet\\Labled\\filler"
non_filler_dir = "E:\\FineTuned Filler Model\\DSet\\Labled\\non_filler"

def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=16000)

    # Skip clips shorter than 0.3s or silent
    if len(y) < int(0.3 * sr) or np.all(y == 0):
        raise ValueError("Audio too short or silent")

    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        delta = librosa.feature.delta(mfcc)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        rms = librosa.feature.rms(y=y)

        # Check all shapes before proceeding
        if any(f.shape[1] == 0 for f in [mfcc, delta, chroma]) or spec_centroid.shape[1] == 0:
            raise ValueError("One or more features are empty")

        feature_vector = np.concatenate([
            np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
            np.mean(delta, axis=1), np.std(delta, axis=1),
            np.mean(chroma, axis=1), np.std(chroma, axis=1),
            [np.mean(spec_centroid), np.std(spec_centroid)],
            [np.mean(zcr), np.std(zcr)],
            [np.mean(rms), np.std(rms)]
        ])
        return feature_vector

    except Exception as e:
        raise ValueError(f"Feature extraction failed: {e}")



# === Prepare data ===
data = []
labels = []

# Filler
for file in os.listdir(filler_dir):
    if file.endswith(".wav"):
        path = os.path.join(filler_dir, file)
        try:
            features = extract_features(path)
            data.append(features)
            labels.append(1)
        except Exception as e:
            print(f"❌ Error processing {file}: {e}")

# Non-Filler
for file in os.listdir(non_filler_dir):
    if file.endswith(".wav"):
        path = os.path.join(non_filler_dir, file)
        try:
            features = extract_features(path)
            data.append(features)
            labels.append(0)
        except Exception as e:
            print(f"❌ Error processing {file}: {e}")

# === Save as DataFrame ===
df = pd.DataFrame(data)
df["label"] = labels
df.to_csv("filler_features.csv", index=False)
print("✅ Saved features to filler_features.csv")

❌ Error processing audio1_filler_2.wav: Audio too short or silent
❌ Error processing audio1_filler_3.wav: Audio too short or silent
❌ Error processing audio1_filler_9.wav: Audio too short or silent
❌ Error processing audio38_filler_12.wav: Audio too short or silent
❌ Error processing audio38_filler_13.wav: Audio too short or silent
❌ Error processing audio38_filler_14.wav: Audio too short or silent
❌ Error processing audio38_filler_15.wav: Audio too short or silent
❌ Error processing audio38_filler_19.wav: Audio too short or silent
❌ Error processing audio38_filler_20.wav: Audio too short or silent
❌ Error processing audio38_filler_24.wav: Audio too short or silent
❌ Error processing audio39_filler_4.wav: Audio too short or silent
❌ Error processing audio43_filler_16.wav: Audio too short or silent
❌ Error processing audio43_filler_21.wav: Audio too short or silent
❌ Error processing audio43_filler_22.wav: Audio too short or silent
❌ Error processing audio43_filler_6.wav: Audio too shor

  return pitch_tuning(


❌ Error processing audio38_nonfiller_134.wav: Audio too short or silent
❌ Error processing audio38_nonfiller_177.wav: Audio too short or silent
❌ Error processing audio38_nonfiller_202.wav: Audio too short or silent
❌ Error processing audio38_nonfiller_203.wav: Audio too short or silent
❌ Error processing audio38_nonfiller_216.wav: Audio too short or silent
❌ Error processing audio38_nonfiller_226.wav: Audio too short or silent
❌ Error processing audio38_nonfiller_273.wav: Audio too short or silent
✅ Saved features to filler_features.csv


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# === Load data ===
df = pd.read_csv("filler_features.csv")
X = df.drop("label", axis=1)
y = df["label"]

# === Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# === Train MLP Classifier ===
clf = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
clf.fit(X_train, y_train)

# === Evaluate ===
y_pred = clf.predict(X_test)
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# === Save Model ===
joblib.dump(clf, "filler_detector_model.pkl")
print("\n✅ Model saved as filler_detector_model.pkl")


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       562
           1       0.83      0.85      0.84       143

    accuracy                           0.93       705
   macro avg       0.90      0.90      0.90       705
weighted avg       0.94      0.93      0.94       705


=== Confusion Matrix ===
[[537  25]
 [ 21 122]]

✅ Model saved as filler_detector_model.pkl


In [7]:
import os
import librosa
import numpy as np
import joblib
from pydub import AudioSegment
from sklearn.preprocessing import StandardScaler

model = joblib.load("filler_detector_model.pkl")

# === Settings ===
audio_path = "E:\\FineTuned Filler Model\\DSet\\Raw\\audio4.wav"  
chunk_duration_sec = 1.0  

# === Feature extraction
def extract_features(y, sr):
    if len(y) < int(0.3 * sr) or np.all(y == 0):
        return None

    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        delta = librosa.feature.delta(mfcc)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        rms = librosa.feature.rms(y=y)

        if any(f.shape[1] == 0 for f in [mfcc, delta, chroma]) or spec_centroid.shape[1] == 0:
            return None

        features = np.concatenate([
            np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
            np.mean(delta, axis=1), np.std(delta, axis=1),
            np.mean(chroma, axis=1), np.std(chroma, axis=1),
            [np.mean(spec_centroid), np.std(spec_centroid)],
            [np.mean(zcr), np.std(zcr)],
            [np.mean(rms), np.std(rms)]
        ])
        return features
    except:
        return None

# === Load audio and chunk ===
y, sr = librosa.load(audio_path, sr=16000)
total_duration = librosa.get_duration(y=y, sr=sr)
chunk_size = int(chunk_duration_sec * sr)

filler_count = 0
total_chunks = 0

for i in range(0, len(y), chunk_size):
    chunk = y[i:i+chunk_size]
    if len(chunk) < int(0.3 * sr):
        continue  # too short, skip

    features = extract_features(chunk, sr)
    if features is not None:
        prediction = model.predict([features])[0]
        total_chunks += 1
        if prediction == 1:
            filler_count += 1

print(f"\n✅ Total chunks analyzed: {total_chunks}")
print(f"🗣️ Estimated filler words in clip: {filler_count}")




✅ Total chunks analyzed: 214
🗣️ Estimated filler words in clip: 6




In [2]:
import os
import datetime

timestamp = os.path.getmtime("filler_detector_model.pkl")
print("📂 Model last modified on:", datetime.datetime.fromtimestamp(timestamp))


📂 Model last modified on: 2025-08-04 09:50:29.177888
