In [1]:
import torch
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = "cuda" if cuda_available else "cpu"
print(device)

True
cuda


In [2]:
import os
import json
import time
import csv
import whisper
import pandas as pd
from tqdm import tqdm

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
import jiwer
from nltk.translate.bleu_score import sentence_bleu
import Levenshtein

Load Whisper model

In [5]:
def load_whisper_model(model_size, device):
    """Load a Whisper checkpoint via ``whisper.load_model``.

    Args:
        model_size: Checkpoint name, forwarded as the first positional argument.
        device: Target device, forwarded as the second positional argument.

    Returns:
        The object returned by ``whisper.load_model``.
    """
    loaded_model = whisper.load_model(model_size, device)
    return loaded_model

Get list of files in the directory

In [6]:
def get_files_in_directory(folder_path, file_extension, keyword=None):
    """List entries of ``folder_path`` that end with ``file_extension``, sorted.

    Args:
        folder_path: Directory whose entries are listed with ``os.listdir``.
        file_extension: Suffix every returned name must end with.
        keyword: Optional substring the name must also contain. A falsy
            value (``None`` or ``""``) disables this filter.

    Returns:
        A sorted list of matching entry names (names only, not full paths).
    """
    matching_names = []
    for entry_name in os.listdir(folder_path):
        if not entry_name.endswith(file_extension):
            continue
        # Only apply the substring filter when a keyword was actually given.
        if keyword and keyword not in entry_name:
            continue
        matching_names.append(entry_name)
    matching_names.sort()
    return matching_names

Load transcription data from JSON file

In [7]:
def load_transcription_data(json_file_path):
    """Read a JSON file and return its parsed contents.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding, open() uses the locale
    # default, which can mis-decode non-ASCII transcripts on some platforms.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

Get matching transcription from JSON

In [8]:
def get_matching_transcription(transcription_data, sample_name):
    """Return the transcription of the first entry named ``sample_name``.

    Args:
        transcription_data: Iterable of mappings that each provide the keys
            ``'sample_name'`` and ``'transcription'``.
        sample_name: Value to compare against each entry's ``'sample_name'``.

    Returns:
        The ``'transcription'`` value of the first matching entry, or ``None``
        when no entry matches.
    """
    for entry in transcription_data:
        if entry['sample_name'] == sample_name:
            return entry['transcription']
    return None

Transcribe audio and measure time

In [9]:
def transcribe_audio_with_time(model, wav_file_path,device="cuda"):
    """Transcribe one audio file and report how long the call took.

    Args:
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``'text'`` entry.
        wav_file_path: Path of the audio file handed to ``model.transcribe``.
        device: Unused by this function; kept so existing callers that pass
            it keep working. The model runs wherever it was loaded.

    Returns:
        A ``(text, seconds)`` tuple: the ``'text'`` entry of the result and
        the elapsed time of the ``transcribe`` call in seconds.
    """
    # perf_counter() is monotonic and high resolution, so the measured
    # duration cannot be skewed by system clock adjustments as time.time() can.
    start_time = time.perf_counter()
    result = model.transcribe(wav_file_path)
    transcription_time = time.perf_counter() - start_time
    return result['text'], transcription_time

Calculate Word Error Rate (WER)

In [10]:
def calculate_wer(reference, hypothesis):
    """Return the word error rate as computed by ``jiwer.wer``.

    Args:
        reference: Ground-truth text, forwarded as the first argument.
        hypothesis: Recognised text, forwarded as the second argument.

    Returns:
        The value returned by ``jiwer.wer(reference, hypothesis)``.
    """
    word_error_rate = jiwer.wer(reference, hypothesis)
    return word_error_rate

Calculate Character Error Rate (CER)

In [11]:
def calculate_cer(reference, hypothesis):
    """Return the character error rate as computed by ``jiwer.cer``.

    Args:
        reference: Ground-truth text, forwarded as the first argument.
        hypothesis: Recognised text, forwarded as the second argument.

    Returns:
        The value returned by ``jiwer.cer(reference, hypothesis)``.
    """
    character_error_rate = jiwer.cer(reference, hypothesis)
    return character_error_rate

Calculate Sentence Error Rate (SER)

In [12]:
def calculate_ser(reference_sentences, hypothesis_sentences):
    """Compute the sentence error rate between two aligned sentence sequences.

    Sentences are compared pairwise by position. A reference sentence counts
    as an error when its hypothesis differs or when no hypothesis exists for
    it. Hypotheses beyond the last reference are ignored.

    Args:
        reference_sentences: Ground-truth sentences.
        hypothesis_sentences: Recognised sentences, aligned by position.

    Returns:
        Fraction of reference sentences in error, as a float in ``[0, 1]``.
        Returns ``0.0`` when there are no reference sentences.
    """
    references = list(reference_sentences)
    hypotheses = list(hypothesis_sentences)

    # With nothing to compare against there can be no sentence errors; this
    # also avoids dividing by zero.
    if not references:
        return 0.0

    mismatched = sum(ref != hyp for ref, hyp in zip(references, hypotheses))
    # zip() stops at the shorter input, so references without any hypothesis
    # must be counted as errors explicitly.
    missing = max(0, len(references) - len(hypotheses))
    return (mismatched + missing) / len(references)

Calculate BLEU Score

In [13]:
def calculate_bleu(reference, hypothesis):
    """Score ``hypothesis`` against ``reference`` with ``sentence_bleu``.

    Both strings are split on whitespace. The reference tokens are wrapped
    in a one-element list, so the score is computed against a single
    reference.

    Args:
        reference: Ground-truth text.
        hypothesis: Recognised text.

    Returns:
        The value returned by ``sentence_bleu`` for the tokenised inputs.
    """
    reference_token_lists = [reference.split()]
    hypothesis_tokens = hypothesis.split()
    bleu_score = sentence_bleu(reference_token_lists, hypothesis_tokens)
    return bleu_score

Calculate Levenshtein Distance

In [14]:
def calculate_levenshtein(reference, hypothesis):
    """Return the edit distance as computed by ``Levenshtein.distance``.

    Args:
        reference: Ground-truth text, forwarded as the first argument.
        hypothesis: Recognised text, forwarded as the second argument.

    Returns:
        The value returned by ``Levenshtein.distance(reference, hypothesis)``.
    """
    edit_distance = Levenshtein.distance(reference, hypothesis)
    return edit_distance

Calculate Cosine Similarity

In [15]:
def calculate_cosine_similarity(reference, hypothesis):
    """Return the cosine similarity of the two texts' TF-IDF vectors.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    vocabulary and weights come from this pair alone.

    Args:
        reference: Ground-truth text.
        hypothesis: Recognised text.

    Returns:
        The ``[0, 1]`` entry of the pairwise similarity matrix, i.e. the
        similarity between the reference (row 0) and hypothesis (column 1).
    """
    documents = [reference, hypothesis]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    dense_vectors = tfidf_matrix.toarray()
    similarity_matrix = cosine_similarity(dense_vectors)
    return similarity_matrix[0, 1]

Function to write results to a CSV file using pandas

In [16]:
def write_results_to_csv_pandas(csv_file_path, results):
    """Write evaluation rows to a CSV file, creating its folder if needed.

    Args:
        csv_file_path: Destination passed to ``DataFrame.to_csv``. When it is
            a filesystem path, its parent directory is created first.
        results: Sequence of 5-item rows ordered as file name, Whisper
            output, correct transcription, cosine similarity, transcription
            time.

    Returns:
        None. The CSV is written without the DataFrame index.
    """
    column_names = [
        "File",
        "Whisper Output",
        "Correct Transcription",
        "Cosine Similarity",
        "Transcription Time",
    ]

    # to_csv() does not create missing folders, so a fresh checkout without
    # the output directory would otherwise fail after all the work is done.
    if isinstance(csv_file_path, (str, os.PathLike)):
        output_dir = os.path.dirname(csv_file_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(results, columns=column_names)
    df.to_csv(csv_file_path, index=False)

Process files and save the result to CSV with tqdm progress bar

In [17]:
def process_files_and_save_to_csv_pandas(audio_folder_path, transcription_folder_path, model, csv_file_path, samples_per_file=5):
    """Transcribe every expected sample and write the comparison to a CSV.

    For each ``*_transcription*.json`` file in ``transcription_folder_path``,
    the samples ``1..samples_per_file`` are looked up as
    ``<base>_<i>.wav`` (or ``<base>_with_pause_<i>.wav`` when the JSON name
    contains ``_with_pause``) inside ``audio_folder_path``. Samples whose
    audio file or reference transcription is missing are reported with
    ``print`` and skipped.

    Args:
        audio_folder_path: Directory containing the ``.wav`` files.
        transcription_folder_path: Directory containing the reference JSON
            files.
        model: Model object handed to ``transcribe_audio_with_time``.
        csv_file_path: Destination handed to ``write_results_to_csv_pandas``.
        samples_per_file: Number of numbered samples expected per JSON file.
            Defaults to 5, the value that used to be hard-coded.

    Raises:
        FileNotFoundError: If ``audio_folder_path`` is not an existing
            directory.
    """
    # Fail fast on a mistyped audio folder; otherwise every sample would be
    # reported as missing and an empty CSV written.
    if not os.path.isdir(audio_folder_path):
        raise FileNotFoundError(f"Audio folder not found: {audio_folder_path}")

    json_files = get_files_in_directory(transcription_folder_path, '.json', keyword='_transcription')

    results = []

    for json_file in tqdm(json_files, desc="Processing JSON files", unit="file"):
        # Load the reference transcriptions for this recording.
        json_path = os.path.join(transcription_folder_path, json_file)
        transcription_data = load_transcription_data(json_path)

        # The JSON file name decides which audio naming scheme applies.
        is_with_pause = "_with_pause" in json_file
        base_name = json_file.replace("_transcription_with_pause", "").replace("_transcription", "").replace(".json", "")
        wav_prefix = f"{base_name}_with_pause" if is_with_pause else base_name

        for i in tqdm(range(1, samples_per_file + 1), desc=f"Processing {json_file}", leave=False, unit="sample"):
            wav_file = f"{wav_prefix}_{i}.wav"
            wav_path = os.path.join(audio_folder_path, wav_file)

            if not os.path.exists(wav_path):
                print(f"Audio file {wav_file} not found.")
                continue

            sample_name = f"sample_{i}"
            correct_transcription = get_matching_transcription(transcription_data, sample_name)

            # A missing or empty reference cannot be scored.
            if not correct_transcription:
                print(f"No matching transcription found for {wav_file} in {json_file}.")
                continue

            # No device is passed: the model runs wherever it was loaded, so
            # a hard-coded "cuda" here would only mislead on CPU machines.
            whisper_output, transcription_time = transcribe_audio_with_time(model, wav_path)

            cosine_sim = calculate_cosine_similarity(correct_transcription, whisper_output)

            results.append([wav_file, whisper_output, correct_transcription, cosine_sim, transcription_time])

    write_results_to_csv_pandas(csv_file_path, results)


In [18]:
# Load the "small.en" Whisper checkpoint once on the selected device; every
# evaluation run below reuses this model.
WHISPER_MODEL_SIZE = "small.en"
model = load_whisper_model(WHISPER_MODEL_SIZE, device)

100%|███████████████████████████████████████| 461M/461M [02:49<00:00, 2.85MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [19]:
# Run 1: Female American (Nova), no pause.
folder_path1, transcription_folder_path1, csv_file_path1 = (
    "../Voices/Female American (Nova)/no pause",
    "../Voices/Transcription/no pause",
    "processed/Nova_no_pause.csv",
)

In [20]:
# Run 2: Female American (Nova), with pause.
folder_path2, transcription_folder_path2, csv_file_path2 = (
    "../Voices/Female American (Nova)/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Nova_with_pause.csv",
)

In [21]:
# Run 3: Female American (Nova) with Noise, no pause.
folder_path3, transcription_folder_path3, csv_file_path3 = (
    "../Voices/Female American (Nova) with Noise/no pause/",
    "../Voices/Transcription/no pause",
    "processed/Nova_Noise_no_pause.csv",
)

In [22]:
# Run 4: Female American (Nova) with Noise, with pause.
folder_path4, transcription_folder_path4, csv_file_path4 = (
    "../Voices/Female American (Nova) with Noise/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Nova_Noise_with_pause.csv",
)

In [23]:
# Run 5: Female British (Madelyn), no pause.
folder_path5, transcription_folder_path5, csv_file_path5 = (
    "../Voices/Female British (Madelyn)/no pause/",
    "../Voices/Transcription/no pause",
    "processed/Madelyn_no_pause.csv",
)

In [24]:
# Run 6: Female British (Madelyn), with pause.
folder_path6, transcription_folder_path6, csv_file_path6 = (
    "../Voices/Female British (Madelyn)/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Madelyn_with_pause.csv",
)

In [25]:
# Run 7: Female British (Madelyn) with Noise, with pause.
folder_path7, transcription_folder_path7, csv_file_path7 = (
    "../Voices/Female British (Madelyn) with Noise/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Madelyn_Noise_with_pause.csv",
)

In [26]:
# Run 8: Female British (Madelyn) with Noise, no pause.
folder_path8, transcription_folder_path8, csv_file_path8 = (
    "../Voices/Female British (Madelyn) with Noise/no pause/",
    "../Voices/Transcription/no pause",
    "processed/Madelyn_Noise_no_pause.csv",
)

In [27]:
# Run 9: Male American (Michael), no pause.
folder_path9, transcription_folder_path9, csv_file_path9 = (
    "../Voices/Male American (Michael)/no pause/",
    "../Voices/Transcription/no pause",
    "processed/Michael_no_pause.csv",
)

In [28]:
# Run 10: Male American (Michael), with pause.
folder_path10, transcription_folder_path10, csv_file_path10 = (
    "../Voices/Male American (Michael)/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Michael_with_pause.csv",
)

In [29]:
# Run 11: Male American (Michael) with Noise, no pause.
folder_path11, transcription_folder_path11, csv_file_path11 = (
    "../Voices/Male American (Michael) with Noise/no pause/",
    "../Voices/Transcription/no pause",
    "processed/Michael_Noise_no_pause.csv",
)

In [30]:
# Run 12: Male American (Michael) with Noise, with pause.
folder_path12, transcription_folder_path12, csv_file_path12 = (
    "../Voices/Male American (Michael) with Noise/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Michael_Noise_with_pause.csv",
)

In [31]:
# Run 13: Male British (Oliver), no pause.
folder_path13, transcription_folder_path13, csv_file_path13 = (
    "../Voices/Male British (Oliver)/no pause/",
    "../Voices/Transcription/no pause",
    "processed/Oliver_no_pause.csv",
)

In [32]:
# Run 14: Male British (Oliver), with pause.
folder_path14, transcription_folder_path14, csv_file_path14 = (
    "../Voices/Male British (Oliver)/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Oliver_with_pause.csv",
)

In [33]:
# Run 15: Male British (Oliver) with Noise, no pause.
folder_path15, transcription_folder_path15, csv_file_path15 = (
    "../Voices/Male British (Oliver) with Noise/no pause/",
    "../Voices/Transcription/no pause",
    "processed/Oliver_Noise_no_pause.csv",
)

In [34]:
# Run 16: Male British (Oliver) with Noise, with pause.
folder_path16, transcription_folder_path16, csv_file_path16 = (
    "../Voices/Male British (Oliver) with Noise/with pause/",
    "../Voices/Transcription/with pause",
    "processed/Oliver_Noise_with_pause.csv",
)

In [35]:
import warnings

# Silence the "FP16 is not supported on CPU" warning so it does not clutter
# the output of the evaluation runs.
warnings.filterwarnings(
    action="ignore",
    message="FP16 is not supported on CPU; using FP32 instead",
)

In [36]:
# Evaluate every configured (audio folder, transcription folder, output CSV)
# triple in order, reusing the single loaded model.
evaluation_runs = [
    (folder_path1, transcription_folder_path1, csv_file_path1),
    (folder_path2, transcription_folder_path2, csv_file_path2),
    (folder_path3, transcription_folder_path3, csv_file_path3),
    (folder_path4, transcription_folder_path4, csv_file_path4),
    (folder_path5, transcription_folder_path5, csv_file_path5),
    (folder_path6, transcription_folder_path6, csv_file_path6),
    (folder_path7, transcription_folder_path7, csv_file_path7),
    (folder_path8, transcription_folder_path8, csv_file_path8),
    (folder_path9, transcription_folder_path9, csv_file_path9),
    (folder_path10, transcription_folder_path10, csv_file_path10),
    (folder_path11, transcription_folder_path11, csv_file_path11),
    (folder_path12, transcription_folder_path12, csv_file_path12),
    (folder_path13, transcription_folder_path13, csv_file_path13),
    (folder_path14, transcription_folder_path14, csv_file_path14),
    (folder_path15, transcription_folder_path15, csv_file_path15),
    (folder_path16, transcription_folder_path16, csv_file_path16),
]

for audio_dir, transcription_dir, output_csv in evaluation_runs:
    process_files_and_save_to_csv_pandas(audio_dir, transcription_dir, model, output_csv)

Processing JSON files: 100%|██████████| 5/5 [00:27<00:00,  5.51s/file]
Processing JSON files: 100%|██████████| 5/5 [00:27<00:00,  5.44s/file]
Processing JSON files: 100%|██████████| 5/5 [00:24<00:00,  4.82s/file]
Processing JSON files: 100%|██████████| 5/5 [00:26<00:00,  5.21s/file]
Processing JSON files: 100%|██████████| 5/5 [00:24<00:00,  4.83s/file]
Processing JSON files: 100%|██████████| 5/5 [00:26<00:00,  5.36s/file]
Processing JSON files: 100%|██████████| 5/5 [00:25<00:00,  5.19s/file]
Processing JSON files: 100%|██████████| 5/5 [00:23<00:00,  4.74s/file]
Processing JSON files: 100%|██████████| 5/5 [00:24<00:00,  4.87s/file]
Processing JSON files: 100%|██████████| 5/5 [00:27<00:00,  5.55s/file]
Processing JSON files: 100%|██████████| 5/5 [00:23<00:00,  4.72s/file]
Processing JSON files: 100%|██████████| 5/5 [00:25<00:00,  5.05s/file]
Processing JSON files: 100%|██████████| 5/5 [00:23<00:00,  4.67s/file]
Processing JSON files: 100%|██████████| 5/5 [00:26<00:00,  5.22s/file]
Proces