In [13]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder used by the
# following cells is reachable. This only works inside Google Colab.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Switch the working directory to the folder holding this comparison's wav files.
# The table cells further down use bare file names (file_location = ''), so they
# only resolve when this cell has been run first.
%cd /content/drive/MyDrive/cs679_project_compare_40/code/compare_graphs/numeric_compare/wavs/p225x228_001

/content/drive/MyDrive/cs679_project_compare_40/code/compare_graphs/numeric_compare/wavs/p225x228_001


In [15]:
!pip install librosa pesq jiwer



In [16]:
import librosa
import numpy as np

def compute_mcd(file1, file2):
    """Return the mean per-frame Euclidean distance between two files' MFCCs.

    This is the value reported as "MCD" in this notebook. Both files are
    loaded at their native sampling rate; if the rates differ, the second
    signal is resampled to the rate of the first. MFCCs are computed with
    librosa's default settings, and the longer MFCC matrix is truncated to
    the frame count of the shorter one before comparing. No scaling constant
    and no time alignment are applied.

    Args:
        file1: Path of the first audio file (its sampling rate is the one used).
        file2: Path of the second audio file.

    Returns:
        The distance averaged over frames; lower means more similar MFCCs.
    """
    reference, reference_sr = librosa.load(file1, sr=None)
    candidate, candidate_sr = librosa.load(file2, sr=None)

    # Bring the second signal to the first signal's rate so frames line up.
    if candidate_sr != reference_sr:
        candidate = librosa.resample(candidate, orig_sr=candidate_sr, target_sr=reference_sr)

    reference_mfcc = librosa.feature.mfcc(y=reference, sr=reference_sr)
    candidate_mfcc = librosa.feature.mfcc(y=candidate, sr=reference_sr)

    # Compare only the frames both matrices have (axis 1 indexes frames).
    n_frames = min(reference_mfcc.shape[1], candidate_mfcc.shape[1])
    frame_diff = reference_mfcc[:, :n_frames] - candidate_mfcc[:, :n_frames]

    # Euclidean norm over coefficients for each frame, then the mean over frames.
    return np.mean(np.sqrt(np.sum(frame_diff ** 2, axis=0)))


In [17]:
from pesq import pesq

def compute_pesq(reference_file, degraded_file, fs):
    """Score a degraded recording against a reference with PESQ.

    Both files are loaded through librosa at the sampling rate ``fs``. The
    PESQ mode is wideband ('wb') when ``fs`` is 16000 and narrowband ('nb')
    for any other value.

    Args:
        reference_file: Path of the reference recording.
        degraded_file: Path of the recording being evaluated.
        fs: Sampling rate; use 16000 for wideband or 8000 for narrowband.

    Returns:
        The score returned by ``pesq``.
    """
    if fs == 16000:
        mode = 'wb'
    else:
        mode = 'nb'

    reference, _ = librosa.load(reference_file, sr=fs)
    degraded, _ = librosa.load(degraded_file, sr=fs)
    return pesq(fs, reference, degraded, mode)


In [18]:
from jiwer import wer

def compute_wer(transcription_original, transcription_transferred):
    """Return jiwer's word error rate for a pair of transcriptions.

    Args:
        transcription_original: Transcript of the original speech, passed to
            ``wer`` as its first argument.
        transcription_transferred: Transcript of the transferred speech,
            passed to ``wer`` as its second argument.

    Returns:
        The value returned by ``jiwer.wer``.
    """
    return wer(transcription_original, transcription_transferred)


In [19]:
!pip install pystoi



In [20]:
import soundfile as sf
from pystoi.stoi import stoi

def calculate_stoi(original_path, transferred_path):
    """Compute the (non-extended) STOI score between two audio files.

    Both files are read with soundfile. Multi-channel audio is averaged down
    to mono, the transferred signal is resampled to the original's sampling
    rate when the rates differ, and the shorter signal is zero-padded at the
    end so both have the same length before scoring.

    Args:
        original_path: Path of the original (reference) speech file.
        transferred_path: Path of the transferred speech file.

    Returns:
        The score returned by ``pystoi``'s ``stoi`` with ``extended=False``.
    """
    # Load the original and transferred speech files
    original_signal, fs_orig = sf.read(original_path)
    transferred_signal, fs_trans = sf.read(transferred_path)

    # sf.read returns a (frames, channels) array for multi-channel files.
    # Resampling, padding and STOI below all assume one sample per time step,
    # so average the channels into a single mono track first.
    if original_signal.ndim > 1:
        original_signal = original_signal.mean(axis=1)
    if transferred_signal.ndim > 1:
        transferred_signal = transferred_signal.mean(axis=1)

    # Ensure both signals share the original's sampling rate
    if fs_orig != fs_trans:
        transferred_signal = librosa.resample(transferred_signal, orig_sr=fs_trans, target_sr=fs_orig)

    # Zero-pad the shorter signal at the end so the lengths match
    # (a pad width of 0 leaves a signal unchanged).
    max_len = max(len(original_signal), len(transferred_signal))
    original_signal = np.pad(original_signal, (0, max_len - len(original_signal)), 'constant')
    transferred_signal = np.pad(transferred_signal, (0, max_len - len(transferred_signal)), 'constant')

    # Calculate the STOI score
    return stoi(original_signal, transferred_signal, fs_orig, extended=False)

In [21]:
import soundfile as sf
def calculate_mse(original_path, transferred_path):
    """Compute the mean squared error between two audio files' samples.

    Both files are read with soundfile. Multi-channel audio is averaged down
    to mono, the transferred signal is resampled to the original's sampling
    rate when the rates differ, and the shorter signal is zero-padded at the
    end so the two can be compared sample by sample.

    Args:
        original_path: Path of the original (reference) speech file.
        transferred_path: Path of the transferred speech file.

    Returns:
        The mean of the squared sample differences.
    """
    # Load the original and transferred speech files
    original_signal, fs_orig = sf.read(original_path)
    transferred_signal, fs_trans = sf.read(transferred_path)

    # sf.read returns a (frames, channels) array for multi-channel files.
    # Average the channels so the resampling, padding and subtraction below
    # operate on 1-D signals of matching shape.
    if original_signal.ndim > 1:
        original_signal = original_signal.mean(axis=1)
    if transferred_signal.ndim > 1:
        transferred_signal = transferred_signal.mean(axis=1)

    # Ensure both signals share the original's sampling rate
    if fs_orig != fs_trans:
        transferred_signal = librosa.resample(transferred_signal, orig_sr=fs_trans, target_sr=fs_orig)

    # Zero-pad the shorter signal at the end so the lengths match
    # (a pad width of 0 leaves a signal unchanged).
    max_len = max(len(original_signal), len(transferred_signal))
    original_signal = np.pad(original_signal, (0, max_len - len(original_signal)), 'constant')
    transferred_signal = np.pad(transferred_signal, (0, max_len - len(transferred_signal)), 'constant')

    # Calculate the Mean Squared Error
    return np.mean((original_signal - transferred_signal) ** 2)

In [22]:
!pip install pyAudioAnalysis



# Methods

**MCD**: Mel-Cepstral Distortion (MCD)

**PESQ**: Perceptual Evaluation of Speech Quality (PESQ)

**WER**: Word Error Rate (WER)

**STOI**: Short-Time Objective Intelligibility (STOI)

**MSE**: Mean Squared Error (MSE)

## Description

**MCD** is a measure of the spectral difference between two audio signals, typically used to assess the quality of voice conversion systems. Lower values indicate higher similarity between the audio samples.


**PESQ** scores range from -0.5 to 4.5, where higher scores indicate better speech quality. Scores above 3 are generally considered good, indicating high perceptual quality, while scores below 2 suggest poor quality, with noticeable distortions or degradation.


**WER** is a measure of the transcription error rate, calculated as the ratio of the number of word errors (insertions, deletions, substitutions) to the number of words in the reference transcript. A WER of 0 (0%) indicates a perfect transcription, and higher values indicate more errors; because insertions are counted as errors, WER can exceed 1 (100%).


**STOI** measures the intelligibility of speech signals and is particularly useful in evaluating the clarity of speech after style transfer. Here the STOI score is computed between the original and the style-transferred speech. Higher scores indicate better intelligibility.

In practical terms:

- **Above 0.75**: Generally considered good intelligibility. Listeners are likely to understand the speech without significant effort.
- **0.6 to 0.75**: Moderate intelligibility. Listeners can understand the speech, but they might need to pay more attention, and some parts might be harder to comprehend.
- **Below 0.6**: Poor intelligibility. Listeners may struggle to understand the speech, and there could be significant distortion or loss of information.

**MSE** quantifies the average squared sample-by-sample difference between the original and transferred signals. Lower values mean the two waveforms are closer; 0 means they are identical.

# Make a table

In [23]:
import pandas as pd
def main(original_file, transferred_file, pesq_fs=16000):
    """Compute all signal-level metrics for one original/transferred pair.

    Args:
        original_file: Path of the original (reference) audio file.
        transferred_file: Path of the transferred audio file to evaluate.
        pesq_fs: Sampling rate handed to ``compute_pesq``. Defaults to 16000
            (wideband); pass 8000 for narrowband audio.

    Returns:
        A dict with the keys "MCD", "PESQ", "STOI" and "MSE", in that order,
        holding the results of ``compute_mcd``, ``compute_pesq``,
        ``calculate_stoi`` and ``calculate_mse`` respectively.
    """
    return {
        "MCD": compute_mcd(original_file, transferred_file),
        "PESQ": compute_pesq(original_file, transferred_file, pesq_fs),
        "STOI": calculate_stoi(original_file, transferred_file),
        "MSE": calculate_mse(original_file, transferred_file),
    }

def evaluate_files(original_file, transferred_files, row_names):
    """Build a metrics table with one row per transferred file.

    Each transferred file is scored against ``original_file`` with ``main``
    (which must return a dict of metric name -> value), and the rows are
    labelled with the matching entry of ``row_names``.

    Args:
        original_file: Path of the original (reference) audio file.
        transferred_files: Paths of the transferred audio files to evaluate.
        row_names: One label per transferred file, in the same order.

    Returns:
        A DataFrame indexed by 'Speaker Encoder' with one column per metric.
        For empty input, an empty DataFrame with that index name.

    Raises:
        ValueError: If ``transferred_files`` and ``row_names`` differ in length.
    """
    transferred_files = list(transferred_files)
    row_names = list(row_names)

    # A mismatch would either fail midway through or silently drop labels.
    if len(transferred_files) != len(row_names):
        raise ValueError(
            f"transferred_files has {len(transferred_files)} entries but "
            f"row_names has {len(row_names)}; they must match one-to-one."
        )

    # Copy each metrics dict instead of mutating what main() returned; the
    # label is added last so the metric columns keep their original order.
    results = [
        {**main(original_file, transferred_file), 'Speaker Encoder': row_name}
        for transferred_file, row_name in zip(transferred_files, row_names)
    ]

    # With no rows there is no 'Speaker Encoder' column to promote to the index.
    if not results:
        return pd.DataFrame(index=pd.Index([], name='Speaker Encoder'))

    return pd.DataFrame(results).set_index('Speaker Encoder')

In [27]:
# Score every converted utterance against the recording p228_001.wav.
# Paths are relative to the current working directory (file_location is empty).
file_location = ''
original_file = f'{file_location}p228_001.wav'

# One converted file per speaker-encoder variant; row_names labels them in the same order.
transferred_files = [
    f'{file_location}{wav_name}'
    for wav_name in (
        'p225xp228_original.wav',
        'p225xp228_facebook.wav',
        'p225xp228_speechbrain.wav',
        'convgru_p225xp228.wav',
    )
]
row_names = ['Zero-Shot', 'Facebook', 'SpeechBrain', 'ConVGRU']

df = evaluate_files(original_file, transferred_files, row_names)
df

Unnamed: 0_level_0,MCD,PESQ,STOI,MSE
Speaker Encoder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Zero-Shot,176.985123,1.036756,0.444893,0.017349
Facebook,204.817383,1.032902,0.201668,0.011772
SpeechBrain,169.422028,1.151512,0.428287,0.031895
ConVGRU,242.301758,1.040815,-0.112738,0.00927


In [28]:
# Score the same converted utterances, this time against the recording p225_001.wav.
# Paths are relative to the current working directory (file_location is empty).
file_location = ''
original_file = f'{file_location}p225_001.wav'

# One converted file per speaker-encoder variant; row_names labels them in the same order.
transferred_files = [
    f'{file_location}{wav_name}'
    for wav_name in (
        'p225xp228_original.wav',
        'p225xp228_facebook.wav',
        'p225xp228_speechbrain.wav',
        'convgru_p225xp228.wav',
    )
]
row_names = ['Zero-Shot', 'Facebook', 'SpeechBrain', 'ConVGRU']

df = evaluate_files(original_file, transferred_files, row_names)
df

Unnamed: 0_level_0,MCD,PESQ,STOI,MSE
Speaker Encoder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Zero-Shot,203.864807,1.252055,0.068479,0.029182
Facebook,191.636627,1.040063,0.169162,0.021986
SpeechBrain,186.297455,1.0309,0.352681,0.044421
ConVGRU,167.886703,1.076959,0.585474,0.01975


In [None]:
data = {
    "MCD": [190.872009, 188.664474, 249.722000, 169.612961],
    "PESQ": [1.095009, 1.057051, 1.046293, 1.097576],
    "STOI": [0.160352, 0.175409, 0.056638, 0.576549],
    "MSE": [0.031352, 0.024982, 0.018245, 0.020387]
}

# Create the DataFrame
df = pd.DataFrame(data, index=["Zero-Shot", "Facebook", "SpeechBrain", "ConVGRU"])

# Convert to LaTeX
latex_code = df.to_latex()

print(latex_code)

$$
\begin{tabular}{lrrrr}
\toprule
 & MCD & PESQ & STOI & MSE \\
\midrule
Zero-Shot & 190.872009 & 1.095009 & 0.160352 & 0.031352 \\
Facebook & 188.664474 & 1.057051 & 0.175409 & 0.024982 \\
SpeechBrain & 249.722000 & 1.046293 & 0.056638 & 0.018245 \\
ConVGRU & 169.612961 & 1.097576 & 0.576549 & 0.020387 \\
\bottomrule
\end{tabular}
$$

In [None]:
# `break` outside a loop is a SyntaxError, so this cell always fails and stops
# "Run All" at this point.
# NOTE(review): presumably a deliberate guard that keeps the scratch cells below
# from running automatically — confirm.
break

# Below: a combined version of the cells above

In [None]:
def main(original_file, transferred_file):
    """Compute, print and return the four metrics for one file pair.

    Note: running this cell rebinds ``main``. The earlier definition in this
    notebook returns a dict, which ``evaluate_files`` relies on; this version
    returns a list instead.

    Args:
        original_file: Path of the original (reference) audio file.
        transferred_file: Path of the transferred audio file to evaluate.

    Returns:
        ``[mcd, pesq, stoi, mse]`` — the metric values in that order.
    """
    scores = []

    # MCD
    scores.append(compute_mcd(original_file, transferred_file))
    print(f"MCD: {scores[-1]}")

    # PESQ (wideband). Replace '16000' with '8000' if your audio is narrowband.
    scores.append(compute_pesq(original_file, transferred_file, 16000))
    print(f"PESQ: {scores[-1]}")

    # STOI
    scores.append(calculate_stoi(original_file, transferred_file))
    print(f"STOI score: {scores[-1]}")

    # MSE
    scores.append(calculate_mse(original_file, transferred_file))
    print(f"MSE score: {scores[-1]}")

    return scores


In [None]:
# Print and return the metrics for wavs/p256_003/p225xp256.wav scored against
# the original wavs/p256_003/p256_003.wav.
# NOTE(review): this path starts at 'wavs/', while an earlier cell changes the
# working directory into .../wavs/p225x228_001 — confirm the working directory.
main('wavs/p256_003/p256_003.wav', 'wavs/p256_003/p225xp256.wav')

In [None]:
# Print and return the metrics for wavs/p228_003/p225xp228.wav scored against
# the original wavs/p228_003/p228_003.wav.
main('wavs/p228_003/p228_003.wav', 'wavs/p228_003/p225xp228.wav')

In [None]:
# Print and return the metrics for wavs/p225_003/p225xp228.wav scored against
# the original wavs/p225_003/p225_003.wav.
main('wavs/p225_003/p225_003.wav', 'wavs/p225_003/p225xp228.wav')

In [None]:
# Print and return the metrics for wavs/p225_003/p225xp225.wav scored against
# the original wavs/p225_003/p225_003.wav.
main('wavs/p225_003/p225_003.wav', 'wavs/p225_003/p225xp225.wav')

In [None]:
# Print and return the metrics for wavs/p225_001/p225xp225.wav scored against
# the original wavs/p225_001/p225_001.wav.
main('wavs/p225_001/p225_001.wav', 'wavs/p225_001/p225xp225.wav')