In [1]:
import os
import json
import time
import csv
import whisper
import pandas as pd
from tqdm import tqdm

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import jiwer
from nltk.translate.bleu_score import sentence_bleu
import Levenshtein

Load Whisper model

In [4]:
def load_whisper_model(model_size="base"):
    """Load a Whisper model by name.

    Args:
        model_size: Model name forwarded unchanged to ``whisper.load_model``
            (defaults to ``"base"``).

    Returns:
        The object returned by ``whisper.load_model``.
    """
    whisper_model = whisper.load_model(model_size)
    return whisper_model

Get list of files in the directory

In [5]:
def get_files_in_directory(folder_path, file_extension):
    """Return the sorted names in ``folder_path`` that end with ``file_extension``.

    Args:
        folder_path: Directory to list.
        file_extension: Suffix to match, e.g. ``".wav"``. The comparison is
            case-sensitive and is applied to every directory entry.

    Returns:
        A new list of matching entry names (not full paths) in ascending order.
    """
    matching_names = []
    for entry_name in os.listdir(folder_path):
        if entry_name.endswith(file_extension):
            matching_names.append(entry_name)
    matching_names.sort()
    return matching_names

Load transcription data from JSON file

In [6]:
def load_transcription_data(json_file_path):
    """Read a JSON file and return its parsed content.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file.
    """
    # Decode explicitly as UTF-8: without it ``open`` falls back to the
    # platform's default encoding, which can mis-decode or reject non-ASCII
    # characters in the transcriptions on some systems.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

Get matching transcription from JSON

In [7]:
def get_matching_transcription(transcription_data, sample_name):
    """Look up the transcription recorded for ``sample_name``.

    Args:
        transcription_data: Iterable of mappings, each read through its
            ``'sample_name'`` and ``'transcription'`` keys.
        sample_name: Value to compare against each entry's ``'sample_name'``.

    Returns:
        The ``'transcription'`` of the first matching entry, or ``None`` when
        no entry matches.
    """
    for entry in transcription_data:
        if entry['sample_name'] == sample_name:
            return entry['transcription']
    return None

Transcribe audio and measure time

In [8]:
def transcribe_audio_with_time(model, wav_file_path):
    """Transcribe one audio file and report how long the call took.

    Args:
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``'text'`` entry.
        wav_file_path: Path handed to ``model.transcribe``.

    Returns:
        A ``(text, seconds)`` tuple: the ``'text'`` value of the result and
        the elapsed time of the ``transcribe`` call in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() follows the wall clock and can jump (NTP, DST),
    # which would corrupt the recorded transcription time.
    start_time = time.perf_counter()
    result = model.transcribe(wav_file_path)
    transcription_time = time.perf_counter() - start_time
    return result['text'], transcription_time

Calculate Word Error Rate (WER)

In [9]:
def calculate_wer(reference, hypothesis):
    """Compute the word error rate of ``hypothesis`` against ``reference``.

    Both arguments are passed unchanged to ``jiwer.wer``.

    Returns:
        The value returned by ``jiwer.wer``.
    """
    word_error_rate = jiwer.wer(reference, hypothesis)
    return word_error_rate

Calculate Character Error Rate (CER)

In [10]:
def calculate_cer(reference, hypothesis):
    """Compute the character error rate of ``hypothesis`` against ``reference``.

    Both arguments are passed unchanged to ``jiwer.cer``.

    Returns:
        The value returned by ``jiwer.cer``.
    """
    char_error_rate = jiwer.cer(reference, hypothesis)
    return char_error_rate

Calculate Sentence Error Rate (SER)

In [11]:
def calculate_ser(reference_sentences, hypothesis_sentences):
    """Compute the sentence error rate over paired sentences.

    A sentence counts as wrong when the hypothesis is not exactly equal
    (``!=``) to the reference at the same position; no normalisation of case,
    whitespace or punctuation is applied.

    Args:
        reference_sentences: Sized sequence of reference sentences.
        hypothesis_sentences: Iterable of hypothesis sentences, aligned by
            position with ``reference_sentences``.

    Returns:
        Fraction of reference sentences that are wrong, as a float in
        ``[0, 1]``. Returns ``0.0`` when there are no reference sentences.
    """
    total = len(reference_sentences)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    hypotheses = list(hypothesis_sentences)
    incorrect_sentences = sum(
        ref != hyp for ref, hyp in zip(reference_sentences, hypotheses)
    )
    # zip() stops at the shorter input, so a reference with no hypothesis
    # would otherwise be silently treated as correct. Count it as an error.
    incorrect_sentences += max(0, total - len(hypotheses))
    return incorrect_sentences / total

Calculate BLEU Score

In [12]:
def calculate_bleu(reference, hypothesis):
    """Compute a sentence-level BLEU score for one reference/hypothesis pair.

    Both strings are tokenised with ``str.split()`` (whitespace splitting),
    and the reference is wrapped in a one-element list because
    ``sentence_bleu`` takes a list of reference token lists.

    Returns:
        The value returned by ``sentence_bleu`` with its default settings.
    """
    reference_token_lists = [reference.split()]
    hypothesis_tokens = hypothesis.split()
    bleu_score = sentence_bleu(reference_token_lists, hypothesis_tokens)
    return bleu_score

Calculate Levenshtein Distance

In [13]:
def calculate_levenshtein(reference, hypothesis):
    """Compute the Levenshtein edit distance between two strings.

    Both arguments are passed unchanged to ``Levenshtein.distance``.

    Returns:
        The value returned by ``Levenshtein.distance``.
    """
    edit_distance = Levenshtein.distance(reference, hypothesis)
    return edit_distance

Calculate Cosine Similarity

In [14]:
def calculate_cosine_similarity(reference, hypothesis):
    """Compute the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two strings, so the
    vocabulary and IDF weights come from this pair alone.

    Returns:
        The off-diagonal entry ``[0, 1]`` of the pairwise cosine-similarity
        matrix, i.e. the similarity between ``reference`` and ``hypothesis``.
    """
    documents = [reference, hypothesis]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    dense_vectors = tfidf_matrix.toarray()
    similarity_matrix = cosine_similarity(dense_vectors)
    return similarity_matrix[0, 1]

Function to write results to a CSV file using pandas

In [15]:
def write_results_to_csv_pandas(csv_file_path, results):
    """Write evaluation rows to a CSV file with a fixed header.

    Args:
        csv_file_path: Destination path for the CSV file.
        results: Rows of ten values each, in the order of the column names
            listed below.

    The DataFrame index is not written to the file.
    """
    column_names = [
        "File",
        "Whisper Output",
        "Correct Transcription",
        "WER",
        "CER",
        "SER",
        "BLEU",
        "Levenshtein Distance",
        "Cosine Similarity",
        "Transcription Time",
    ]
    results_frame = pd.DataFrame(results, columns=column_names)
    results_frame.to_csv(csv_file_path, index=False)

Process files and save the result to CSV with tqdm progress bar

In [16]:
def process_files_and_save_to_csv_pandas(folder_path, model, csv_file_path, num_samples=5):
    """Transcribe every sample in a folder, score it, and save the results.

    For each ``.json`` file in ``folder_path`` the reference transcriptions
    are loaded, and the audio files ``<base>_1.wav`` .. ``<base>_<num_samples>.wav``
    are looked up, where ``<base>`` is the JSON file name without its
    ``_transcription.json`` suffix. Sample ``i`` is matched to the JSON entry
    whose ``sample_name`` is ``sample_<i>``. Samples with a missing audio file
    or missing/empty reference are reported with ``print`` and skipped.

    Args:
        folder_path: Directory holding the JSON and WAV files.
        model: Model passed to ``transcribe_audio_with_time``.
        csv_file_path: Where the results CSV is written.
        num_samples: Number of numbered samples expected per JSON file
            (defaults to 5).

    Returns:
        None. One row per processed audio file is written to ``csv_file_path``.
    """
    transcription_suffix = "_transcription.json"

    # Only the JSON files drive the loop; WAV names are derived from them.
    json_files = get_files_in_directory(folder_path, '.json')

    results = []

    for json_file in tqdm(json_files, desc="Processing JSON files", unit="file"):
        # Load the reference transcriptions for this group of samples
        json_path = os.path.join(folder_path, json_file)
        transcription_data = load_transcription_data(json_path)

        # Strip the suffix only at the end of the name; str.replace would
        # also remove an occurrence in the middle of a file name.
        if json_file.endswith(transcription_suffix):
            base_name = json_file[:-len(transcription_suffix)]
        else:
            base_name = os.path.splitext(json_file)[0]

        sample_numbers = range(1, num_samples + 1)
        for i in tqdm(sample_numbers, desc=f"Processing {json_file}", leave=False, unit="sample"):
            wav_file = f"{base_name}_{i}.wav"
            wav_path = os.path.join(folder_path, wav_file)

            if not os.path.exists(wav_path):
                print(f"Audio file {wav_file} not found.")
                continue

            sample_name = f"sample_{i}"
            correct_transcription = get_matching_transcription(transcription_data, sample_name)

            # Covers both "no entry" (None) and an empty reference string.
            if not correct_transcription:
                print(f"No matching transcription found for {wav_file} in {json_file}.")
                continue

            # Transcribe the audio and measure time
            whisper_output, transcription_time = transcribe_audio_with_time(model, wav_path)

            # Calculate WER, CER, SER, BLEU, Levenshtein Distance, and Cosine Similarity
            wer = calculate_wer(correct_transcription, whisper_output)
            cer = calculate_cer(correct_transcription, whisper_output)
            ser = calculate_ser([correct_transcription], [whisper_output])
            bleu = calculate_bleu(correct_transcription, whisper_output)
            levenshtein_distance = calculate_levenshtein(correct_transcription, whisper_output)
            cosine_sim = calculate_cosine_similarity(correct_transcription, whisper_output)

            # Row order must match the column order used by write_results_to_csv_pandas
            results.append([wav_file, whisper_output, correct_transcription, wer, cer, ser, bleu, levenshtein_distance, cosine_sim, transcription_time])

    # Write results to CSV using pandas
    write_results_to_csv_pandas(csv_file_path, results)

In [17]:
#folder_path = "Male British (Oliver)"
#csv_file_path = "Oliver_base.csv"

In [18]:
#folder_path = "Female British (Madelyn)"
#csv_file_path = "Madelyn_base.csv"

In [19]:
# Active run configuration: the folder that is scanned for .json transcription
# files and their numbered .wav samples, and the CSV file the scores are written to.
folder_path = "Data/voices/Female American (Nava)"
csv_file_path = "Nava_base.csv"

In [20]:
#folder_path = "Male American (Michael)"
#csv_file_path = "Michael_base.csv"

In [21]:
# Load the "base" Whisper model once; this object is passed to the processing call later in the notebook.
model = load_whisper_model("base")

  checkpoint = torch.load(fp, map_location=device)


In [22]:
import warnings

# Suppress the FP16 warning.
# `message` is treated as a regular expression matched against the start of
# the warning text, so this hides only warnings beginning with this sentence.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

In [23]:
# Run the evaluation for the configured folder: transcribe each sample, score it
# against its reference transcription, and write the rows to csv_file_path.
process_files_and_save_to_csv_pandas(folder_path, model, csv_file_path)

Processing JSON files: 100%|██████████| 5/5 [00:46<00:00,  9.21s/file]
