<a href="https://colab.research.google.com/github/Tai2970/Accent-Recognition-Project/blob/main/scripts/full_pipeline_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install librosa scikit-learn pandas numpy soundfile
!pip install praat-parselmouth --quiet



In [2]:
from google.colab import files
import os, shutil

# STEP 1: Upload model
uploaded = files.upload()  # Upload final_model.pkl or any renamed version

# STEP 2: Ensure 'models' directory exists
os.makedirs("models", exist_ok=True)

# STEP 3: Rename whatever got uploaded to 'final_model.pkl'
# Exactly one file is expected. Every upload would be moved to the same
# destination, so with several files each one would overwrite the previous
# one, and with none the notebook would only fail later when loading the model.
uploaded_names = list(uploaded)
if len(uploaded_names) != 1:
    raise ValueError(
        f"Expected exactly one model file, but got {len(uploaded_names)}: {uploaded_names}"
    )

new_path = os.path.join("models", "final_model.pkl")
shutil.move(uploaded_names[0], new_path)
print(f"Saved model as: {new_path}")

Saving final_model.pkl to final_model.pkl
Saved model as: models/final_model.pkl


In [3]:
# STEP 1: Delete any existing test_samples folder to avoid duplicates
import os  # imported here as well so this cell does not depend on an earlier cell's import
import shutil
shutil.rmtree("data/test_samples", ignore_errors=True)

# STEP 2: Recreate the folder clean
os.makedirs("data/test_samples", exist_ok=True)

# STEP 3: Upload your new .wav files for prediction
from google.colab import files
uploaded = files.upload()

# STEP 4: Move uploaded .wav files to the 'test_samples' folder
# Files without a .wav extension are left where they are, but they are
# reported so the user knows why no prediction appears for them.
moved, skipped = [], []
for filename in uploaded:
    if filename.endswith(".wav"):
        shutil.move(filename, f"data/test_samples/{filename}")
        moved.append(filename)
    else:
        skipped.append(filename)

print(f"Moved {len(moved)} .wav file(s) to data/test_samples")
if skipped:
    print(f"Skipped non-.wav upload(s): {', '.join(skipped)}")

Saving data_vietnamese_data_vietnamese_arctic_b0510.wav to data_vietnamese_data_vietnamese_arctic_b0510.wav


In [4]:
# Main prediction logic

import librosa
import numpy as np
import joblib
import pandas as pd
import parselmouth
import warnings
warnings.filterwarnings("ignore")

# Load the trained classifier from the uploaded file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code,
# so only load model files from a source you trust.
model = joblib.load("models/final_model.pkl")

# Directory that is scanned for .wav files to classify
test_folder = "data/test_samples/"

# Maps the model's predicted class index to a human-readable accent name
accent_labels = dict(enumerate(("American", "British", "Vietnamese")))

# Helper for the formant statistics below
def _mean_of_positive(values):
    """Return the mean of the strictly positive entries of `values`, or 0.0 if there are none.

    NaN and None entries are dropped as well, because neither satisfies `v > 0`.
    """
    positive = [v for v in values if v is not None and v > 0]
    return float(np.mean(positive)) if positive else 0.0


# Feature extraction: MFCCs, delta MFCCs, spectral contrast, pitch, formants
def extract_features(file_path):
    """Build a 1-D feature vector describing one audio file.

    The vector is the concatenation, in this order, of:
      * the per-coefficient mean of 13 MFCCs,
      * the per-coefficient mean of the MFCC deltas,
      * the per-band mean of the spectral contrast,
      * the mean pitch over voiced frames (0 if there are none),
      * the mean of formants F1, F2 and F3 over frames where each is defined
        (0 for a formant that is never defined).

    Parameters
    ----------
    file_path : str
        Path to the audio file. It is read twice: by librosa (resampled to
        16 kHz) for the spectral features and by parselmouth for pitch and
        formants.

    Returns
    -------
    numpy.ndarray
        The combined 1-D feature vector.
    """
    y, sr = librosa.load(file_path, sr=16000)

    # MFCCs and Delta
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    delta_mfccs = librosa.feature.delta(mfccs)

    # Spectral contrast
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    # Pitch: keep only frames with a positive frequency before averaging
    snd = parselmouth.Sound(file_path)
    pitch = snd.to_pitch()
    pitch_values = pitch.selected_array['frequency']
    pitch_values = pitch_values[pitch_values > 0]
    pitch_mean = np.mean(pitch_values) if len(pitch_values) > 0 else 0

    # Formants (F1–F3), sampled at the centre time of every analysis frame.
    # Praat frame numbers are 1-based, hence the `i + 1`.
    formant = snd.to_formant_burg()
    num_frames = formant.get_number_of_frames()
    frame_times = [formant.get_time_from_frame_number(i + 1) for i in range(num_frames)]

    # Undefined formant values come back as NaN, which is truthy, so an
    # `or 0` fallback does not replace them. The helper filters them out and
    # returns 0.0 when nothing usable remains, instead of letting
    # np.mean([]) put a NaN into the feature vector.
    formant_means = [
        _mean_of_positive(formant.get_value_at_time(number, t) for t in frame_times)
        for number in (1, 2, 3)
    ]

    # Combine features into one vector
    return np.hstack((
        mfccs.mean(axis=1),
        delta_mfccs.mean(axis=1),
        contrast.mean(axis=1),
        [pitch_mean],
        formant_means
    ))

# Predict and display results
print("Accent Predictions:")
print("-" * 40)

# Collect the .wav files up front so an empty or missing folder can be
# reported, instead of a traceback or a bare header with no results.
if os.path.isdir(test_folder):
    wav_files = sorted(f for f in os.listdir(test_folder) if f.endswith(".wav"))
else:
    wav_files = []

if not wav_files:
    print(f"No .wav files found in {test_folder}")

for filename in wav_files:
    file_path = os.path.join(test_folder, filename)
    try:
        # The model expects a 2-D array: one row per sample
        features = extract_features(file_path).reshape(1, -1)
        prediction = model.predict(features)[0]
        # Fall back to the raw model output if it is not a known class index,
        # rather than surfacing a bare KeyError
        label = accent_labels.get(prediction, f"Unknown ({prediction})")
        print(f"{filename} ➜ Predicted Accent: {label}")
    except Exception as e:
        # Best effort: report the failure for this file and continue with the rest
        print(f"{filename} ➜ Error: {e}")


Accent Predictions:
----------------------------------------
data_vietnamese_data_vietnamese_arctic_b0510.wav ➜ Predicted Accent: Vietnamese
