In [1]:
import whisper

In [2]:
# Load the Whisper "medium" model once; the evaluation function further down
# reads this notebook-level `model` to transcribe each audio clip.
model = whisper.load_model("medium")

In [5]:
import evaluate

In [6]:
import Levenshtein

def text_similarity_evaluation(labels, preds, threshold=0.8):
    """Score predictions against reference strings by normalised edit similarity.

    Each ``(label, pred)`` pair gets the similarity
    ``1 - Levenshtein.distance(label, pred) / max(len(label), len(pred))``.
    A pair whose similarity is at least ``threshold`` counts as a true
    positive, any other pair as a false positive. False negatives are
    ``len(labels) - tp``, so when ``labels`` and ``preds`` have the same
    length, precision and recall are equal by construction.

    Args:
        labels: Sequence of reference strings.
        preds: Sequence of predicted strings, paired with ``labels`` by
            position. Pairing uses ``zip``, so extra items in the longer
            sequence are not compared.
        threshold: Minimum similarity for a pair to count as a match.

    Returns:
        A ``(precision, recall, f1_score)`` tuple. Any ratio whose
        denominator would be zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    tp, fp = 0, 0

    for label, pred in zip(labels, preds):
        longest = max(len(label), len(pred))
        if longest == 0:
            # Two empty strings are identical; dividing by their length would fail.
            similarity_score = 1.0
        else:
            similarity_score = 1 - Levenshtein.distance(label, pred) / longest

        if similarity_score >= threshold:
            tp += 1
        else:
            fp += 1

    fn = len(labels) - tp

    # Guard every ratio: empty input or zero matches would otherwise divide by zero.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return precision, recall, f1_score

# Load the BLEU metric from the Hugging Face `evaluate` package. The module is
# imported under a private alias because a later cell defines a function that
# is also named `evaluate`; once that cell has run, the bare name no longer
# refers to the module and `evaluate.load` would raise AttributeError on re-run.
import evaluate as _hf_evaluate

bleu_metric = _hf_evaluate.load("evaluate-metric/bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
import json

# Load the evaluation samples; each entry's "text" field is used as the
# reference transcript during scoring. Decode explicitly as UTF-8 rather than
# relying on the platform's default encoding, which differs between systems.
with open("test_samples.json", "r", encoding="utf-8") as f:
    test_samples = json.load(f)

In [22]:
from tqdm.auto import tqdm

def evaluate(data_path, save_path):
    """Transcribe numbered audio clips and score them against the reference texts.

    For every entry of the notebook-level ``test_samples`` list, the file
    ``{data_path}/{k}.wav`` (``k`` is the entry's 1-based position) is
    transcribed with the notebook-level ``model``. The transcripts are compared
    with each entry's ``"text"`` field using ``bleu_metric`` and
    ``text_similarity_evaluation``, and the metrics are written to
    ``save_path`` as JSON.

    NOTE: this function's name rebinds the notebook-level ``evaluate`` name
    that was imported as a module earlier, so ``evaluate.load`` cannot be
    called through that name after this cell has run.

    Args:
        data_path: Directory containing the clips ``1.wav``, ``2.wav``, ...
        save_path: Destination path of the JSON metrics file.

    Returns:
        The metrics dict that was written to ``save_path``, with keys
        ``"bleu"``, ``"precision"``, ``"recall"`` and ``"f1_score"``.
    """
    predictions = []
    references = []
    # Clips are numbered from 1, while samples are stored in list order.
    for clip_number, sample in enumerate(tqdm(test_samples), start=1):
        result = model.transcribe(f"{data_path}/{clip_number}.wav")

        # NOTE(review): transcripts are scored exactly as returned; confirm
        # whether whitespace/case normalisation is wanted before comparison.
        predictions.append(result["text"])
        references.append(sample["text"])

    bleu_score = bleu_metric.compute(predictions=predictions, references=references)
    precision, recall, f1_score = text_similarity_evaluation(references, predictions)

    results = {
        "bleu": bleu_score,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }

    # ensure_ascii=False may emit non-ASCII characters, so fix the file
    # encoding instead of depending on the platform default.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    return results

In [23]:
# Score the clips under hf_tts_inferences/ and write the metrics to
# hf_tts_evaluation.json.
evaluate("hf_tts_inferences", "hf_tts_evaluation.json")

  0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
# Score the clips under pytorch_pretrained_inferences/ and write the metrics to
# pytorch_tts_evaluation.json.
evaluate("pytorch_pretrained_inferences", "pytorch_tts_evaluation.json")

  0%|          | 0/100 [00:00<?, ?it/s]

In [26]:
# Score the clips under custom_TTS/Transformer-TTS/samples/ and write the
# metrics to own_tts_evaluation.json.
evaluate("custom_TTS/Transformer-TTS/samples", "own_tts_evaluation.json")

  0%|          | 0/100 [00:00<?, ?it/s]