In [16]:
import os
import json
import time
import csv
import whisper
import pandas as pd
from tqdm import tqdm

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
def load_whisper_model(model_size="base"):
    """Load a Whisper model by name.

    Args:
        model_size: Identifier forwarded unchanged to ``whisper.load_model``
            (defaults to ``"base"``).

    Returns:
        The object returned by ``whisper.load_model(model_size)``.
    """
    loaded_model = whisper.load_model(model_size)
    return loaded_model

In [19]:
def get_files_in_directory(folder_path, file_extension):
    """List the entries of a folder whose names end with a given suffix.

    Args:
        folder_path: Directory to list (passed to ``os.listdir``).
        file_extension: Suffix to match with ``str.endswith``; the comparison
            is case-sensitive.

    Returns:
        A sorted list of bare entry names (not full paths) ending with
        ``file_extension``.
    """
    matching_names = []
    for entry_name in os.listdir(folder_path):
        if entry_name.endswith(file_extension):
            matching_names.append(entry_name)
    matching_names.sort()
    return matching_names

In [20]:
def load_transcription_data(json_file_path):
    """Read and parse a JSON file.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document, as returned by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() uses the platform locale
    # codec, which corrupts or rejects non-ASCII transcript text on systems
    # whose default is not UTF-8.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [21]:
def get_matching_transcription(transcription_data, sample_name):
    """Look up the transcription text recorded for a sample name.

    Args:
        transcription_data: Iterable of mappings, each read via the keys
            ``'sample_name'`` and ``'transcription'``.
        sample_name: Value to compare against each entry's ``'sample_name'``.

    Returns:
        The ``'transcription'`` value of the first entry whose
        ``'sample_name'`` equals ``sample_name``, or ``None`` if no entry
        matches.
    """
    for entry in transcription_data:
        if entry['sample_name'] == sample_name:
            # First match wins; later duplicates are ignored.
            return entry['transcription']
    return None

In [22]:
def transcribe_audio_with_time(model, wav_file_path):
    """Transcribe an audio file and measure how long the call took.

    Args:
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``'text'`` entry.
        wav_file_path: Path handed to ``model.transcribe``.

    Returns:
        A ``(text, seconds)`` tuple: the ``'text'`` entry of the transcription
        result and the elapsed time of the ``transcribe`` call in seconds.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # duration cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    result = model.transcribe(wav_file_path)
    transcription_time = time.perf_counter() - start_time
    return result['text'], transcription_time

In [23]:
def calculate_cosine_similarity(reference, hypothesis):
    """Compute the TF-IDF cosine similarity between two texts.

    A ``TfidfVectorizer`` with default settings is fitted on exactly the two
    given texts, and the similarity between the two resulting row vectors is
    returned.

    Args:
        reference: Reference text.
        hypothesis: Text to compare against the reference.

    Returns:
        The cosine similarity of the two TF-IDF vectors, or ``0.0`` when the
        vectorizer cannot build a vocabulary from the inputs (for example when
        both texts are empty).
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([reference, hypothesis])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary ...") when neither
        # text produces a single token. Treat that as "no measurable overlap"
        # rather than aborting a whole batch run.
        return 0.0
    vectors = tfidf_matrix.toarray()
    # The 2x2 similarity matrix is symmetric; [0, 1] is reference vs hypothesis.
    return cosine_similarity(vectors)[0, 1]

In [24]:
def write_results_to_csv_pandas(csv_file_path, results):
    """Write result rows to a CSV file with a fixed five-column header.

    Args:
        csv_file_path: Destination passed to ``DataFrame.to_csv``.
        results: Row data for the DataFrame; each row supplies values for the
            five columns in header order.

    The DataFrame index is not written to the file.
    """
    column_names = [
        "File",
        "Whisper Output",
        "Correct Transcription",
        "Cosine Similarity",
        "Transcription Time",
    ]
    results_frame = pd.DataFrame(data=results, columns=column_names)
    results_frame.to_csv(csv_file_path, index=False)

In [64]:
def _score_audio_file(model, audio_folder_path, wav_file_name, transcription_data, sample_name, json_file):
    """Transcribe one WAV file and score it against its reference text.

    Returns:
        A result row ``[file name, whisper output, reference text, cosine
        similarity, transcription seconds]``, or ``None`` when the WAV file
        does not exist or no non-empty reference text is found for
        ``sample_name``.
    """
    wav_path = os.path.join(audio_folder_path, wav_file_name)
    if not os.path.exists(wav_path):
        # Missing recordings are skipped silently.
        return None

    correct_transcription = get_matching_transcription(transcription_data, sample_name)
    if not correct_transcription:
        print(f"No matching transcription found for {wav_file_name} in {json_file}.")
        return None

    # Transcribe the audio and measure time
    whisper_output, transcription_time = transcribe_audio_with_time(model, wav_path)
    cosine_sim = calculate_cosine_similarity(correct_transcription, whisper_output)
    print(f"Processed {wav_file_name}: Cosine Similarity = {cosine_sim}, Transcription Time = {transcription_time}")
    return [wav_file_name, whisper_output, correct_transcription, cosine_sim, transcription_time]


def process_files_and_save_to_csv_pandas(audio_folder_path, transcription_folder_path, model, csv_file_path, num_samples=5):
    """Transcribe every expected recording, score it, and save the results as CSV.

    For each ``*.json`` file in ``transcription_folder_path`` the part of the
    file name before ``"_transcription"`` is used as a base name. For sample
    numbers ``1..num_samples`` the files ``{base}_{i}.wav`` and
    ``{base}_with pause_{i}.wav`` are looked up in ``audio_folder_path``; each
    one that exists is transcribed and compared with the JSON entry whose
    ``sample_name`` is ``sample_{i}``.

    Args:
        audio_folder_path: Folder containing the WAV recordings.
        transcription_folder_path: Folder containing the reference JSON files.
        model: Transcription model passed to ``transcribe_audio_with_time``.
        csv_file_path: Destination CSV path; its parent directory is created
            if needed.
        num_samples: Number of numbered samples expected per base name
            (defaults to 5, the previously hard-coded value).

    Raises:
        FileNotFoundError: If ``audio_folder_path`` is not an existing directory.
    """
    # Ensure the directory exists for saving CSV. A bare file name has an
    # empty dirname, and os.makedirs("") raises, so only create when present.
    output_dir = os.path.dirname(csv_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Fail fast on a wrong audio folder instead of silently finding no files.
    if not os.path.isdir(audio_folder_path):
        raise FileNotFoundError(f"Audio folder not found: {audio_folder_path}")

    json_files = get_files_in_directory(transcription_folder_path, '.json')

    results = []

    # Process each transcription JSON file
    for json_file in tqdm(json_files, desc="Processing JSON files", unit="file"):
        json_path = os.path.join(transcription_folder_path, json_file)
        transcription_data = load_transcription_data(json_path)

        # Extract base name to match with audio files
        base_name = json_file.split("_transcription")[0]  # e.g., "10_words_sample"

        # NOTE(review): every JSON file that yields the same base_name is scored
        # against BOTH the normal and the "with pause" recordings. If the
        # reference texts differ per variant this produces mismatched rows;
        # confirm this pairing is intended.
        for i in range(1, num_samples + 1):
            sample_name = f"sample_{i}"
            candidate_wavs = (
                f"{base_name}_{i}.wav",  # e.g., "10_words_sample_1.wav"
                f"{base_name}_with pause_{i}.wav",  # e.g., "10_words_sample_with pause_1.wav"
            )
            for wav_file_name in candidate_wavs:
                row = _score_audio_file(
                    model, audio_folder_path, wav_file_name, transcription_data, sample_name, json_file
                )
                if row is not None:
                    results.append(row)

    # Check if results were collected before writing to CSV
    if results:
        write_results_to_csv_pandas(csv_file_path, results)
        print(f"Results written to {csv_file_path}")
    else:
        print("No results to write to CSV.")

In [53]:
# Load the "base" Whisper model once so it can be reused for every recording.
model = load_whisper_model(model_size="base")

  checkpoint = torch.load(fp, map_location=device)


In [65]:
# Input and output locations for this run; relative paths are resolved against
# the current working directory.
audio_folder_path = "../Voices/Male British (Oliver)"  # folder searched for the .wav recordings
transcription_folder_path = "../Voices/Transcription"  # folder searched for the reference .json files
# NOTE(review): "proccessed" looks like a misspelling, but it is the actual
# output directory name; renaming it changes where the CSV is written.
csv_file_path1 = "./proccessed/Oliver_base.csv"  # results CSV for this voice/model combination

In [66]:
# Suppress the repeated "FP16 is not supported on CPU" warning (presumably
# raised on each transcription when running on CPU; confirm) so it does not
# drown out the per-file progress output.
import warnings
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

In [67]:
# Run the evaluation for the configured voice folder and write the results CSV.
process_files_and_save_to_csv_pandas(
    audio_folder_path=audio_folder_path,
    transcription_folder_path=transcription_folder_path,
    model=model,
    csv_file_path=csv_file_path1,
)

Processing JSON files:   0%|          | 0/10 [00:00<?, ?file/s]

Processed 10_words_sample_1.wav: Cosine Similarity = 0.9999999999999999, Transcription Time = 1.4900169372558594
Processed 10_words_sample_with pause_1.wav: Cosine Similarity = 0.2716186451929929, Transcription Time = 1.7792437076568604
Processed 10_words_sample_2.wav: Cosine Similarity = 0.9999999999999997, Transcription Time = 1.511958360671997
Processed 10_words_sample_with pause_2.wav: Cosine Similarity = 0.37491427238345554, Transcription Time = 1.534898281097412
Processed 10_words_sample_3.wav: Cosine Similarity = 0.9999999999999999, Transcription Time = 1.5458686351776123
Processed 10_words_sample_with pause_3.wav: Cosine Similarity = 0.22576484600261607, Transcription Time = 1.7553083896636963
Processed 10_words_sample_4.wav: Cosine Similarity = 0.9999999999999997, Transcription Time = 1.583765983581543
Processed 10_words_sample_with pause_4.wav: Cosine Similarity = 0.11914719328677731, Transcription Time = 1.6186738014221191
Processed 10_words_sample_5.wav: Cosine Similarity =

Processing JSON files:  10%|█         | 1/10 [00:16<02:25, 16.12s/file]

Processed 10_words_sample_with pause_5.wav: Cosine Similarity = 0.1450404822587595, Transcription Time = 1.649301528930664
Processed 10_words_sample_1.wav: Cosine Similarity = 0.26055567105626243, Transcription Time = 1.5129544734954834
Processed 10_words_sample_with pause_1.wav: Cosine Similarity = 0.9361362641191208, Transcription Time = 1.6535861492156982
Processed 10_words_sample_2.wav: Cosine Similarity = 0.3575475209147722, Transcription Time = 1.5319039821624756
Processed 10_words_sample_with pause_2.wav: Cosine Similarity = 0.9361362641191208, Transcription Time = 1.5967309474945068
Processed 10_words_sample_3.wav: Cosine Similarity = 0.21528271546341884, Transcription Time = 1.636624813079834
Processed 10_words_sample_with pause_3.wav: Cosine Similarity = 0.9266368211041915, Transcription Time = 1.7393519878387451
Processed 10_words_sample_4.wav: Cosine Similarity = 0.11304078416613478, Transcription Time = 1.5628225803375244
Processed 10_words_sample_with pause_4.wav: Cosine 

Processing JSON files:  20%|██        | 2/10 [00:32<02:08, 16.11s/file]

Processed 10_words_sample_with pause_5.wav: Cosine Similarity = 0.8586557982260813, Transcription Time = 1.6685404777526855
Processed 20_words_sample_1.wav: Cosine Similarity = 1.0000000000000004, Transcription Time = 1.9248547554016113
Processed 20_words_sample_with pause_1.wav: Cosine Similarity = 0.9101113851286252, Transcription Time = 1.8969297409057617
Processed 20_words_sample_2.wav: Cosine Similarity = 1.0, Transcription Time = 1.8171415328979492
Processed 20_words_sample_with pause_2.wav: Cosine Similarity = 0.9596398661682171, Transcription Time = 1.9388184547424316
Processed 20_words_sample_3.wav: Cosine Similarity = 0.9999999999999997, Transcription Time = 1.8580331802368164
Processed 20_words_sample_with pause_3.wav: Cosine Similarity = 0.9999999999999997, Transcription Time = 1.8969309329986572
Processed 20_words_sample_4.wav: Cosine Similarity = 0.9999999999999997, Transcription Time = 1.8460652828216553
Processed 20_words_sample_with pause_4.wav: Cosine Similarity = 0.9

Processing JSON files:  30%|███       | 3/10 [00:51<02:03, 17.63s/file]

Processed 20_words_sample_with pause_5.wav: Cosine Similarity = 1.0000000000000002, Transcription Time = 2.006636619567871
Processed 20_words_sample_1.wav: Cosine Similarity = 0.8379045376520945, Transcription Time = 1.9936721324920654
Processed 20_words_sample_with pause_1.wav: Cosine Similarity = 0.9174208172605055, Transcription Time = 2.0774478912353516
Processed 20_words_sample_2.wav: Cosine Similarity = 0.9238026005035378, Transcription Time = 1.836092233657837
Processed 20_words_sample_with pause_2.wav: Cosine Similarity = 0.8865178039130657, Transcription Time = 1.858034610748291
Processed 20_words_sample_3.wav: Cosine Similarity = 0.9207401897413288, Transcription Time = 1.9198682308197021
Processed 20_words_sample_with pause_3.wav: Cosine Similarity = 0.9207401897413288, Transcription Time = 1.9258537292480469
Processed 20_words_sample_4.wav: Cosine Similarity = 0.9207401897413279, Transcription Time = 1.8690059185028076
Processed 20_words_sample_with pause_4.wav: Cosine Simi

Processing JSON files:  40%|████      | 4/10 [01:10<01:49, 18.23s/file]

Processed 20_words_sample_with pause_5.wav: Cosine Similarity = 0.9434557355986908, Transcription Time = 1.8779809474945068
Processed 30_words_sample_1.wav: Cosine Similarity = 0.9999999999999997, Transcription Time = 2.1422829627990723
Processed 30_words_sample_with pause_1.wav: Cosine Similarity = 0.9026283183113566, Transcription Time = 2.430504322052002
Processed 30_words_sample_2.wav: Cosine Similarity = 0.9999999999999994, Transcription Time = 2.194136619567871
Processed 30_words_sample_with pause_2.wav: Cosine Similarity = 0.8138522919055411, Transcription Time = 2.2051050662994385
Processed 30_words_sample_3.wav: Cosine Similarity = 1.0000000000000002, Transcription Time = 2.222059488296509
Processed 30_words_sample_with pause_3.wav: Cosine Similarity = 0.9791966537124523, Transcription Time = 2.2689363956451416
Processed 30_words_sample_4.wav: Cosine Similarity = 1.0000000000000002, Transcription Time = 2.2459967136383057
Processed 30_words_sample_with pause_4.wav: Cosine Simi

Processing JSON files:  50%|█████     | 5/10 [01:33<01:38, 19.69s/file]

Processed 30_words_sample_with pause_5.wav: Cosine Similarity = 0.931461358465165, Transcription Time = 2.244001865386963
Processed 30_words_sample_1.wav: Cosine Similarity = 0.8845737538593996, Transcription Time = 2.1951324939727783
Processed 30_words_sample_with pause_1.wav: Cosine Similarity = 0.935751204688267, Transcription Time = 2.4045724868774414
Processed 30_words_sample_2.wav: Cosine Similarity = 0.781535322530404, Transcription Time = 2.1552395820617676
Processed 30_words_sample_with pause_2.wav: Cosine Similarity = 0.9529000740006429, Transcription Time = 2.2709298133850098
Processed 30_words_sample_3.wav: Cosine Similarity = 0.8917020257670603, Transcription Time = 2.2200655937194824
Processed 30_words_sample_with pause_3.wav: Cosine Similarity = 0.9090376755159724, Transcription Time = 2.2420077323913574
Processed 30_words_sample_4.wav: Cosine Similarity = 0.832984165050738, Transcription Time = 2.3099260330200195
Processed 30_words_sample_with pause_4.wav: Cosine Simila

Processing JSON files:  60%|██████    | 6/10 [01:55<01:22, 20.64s/file]

Processed 30_words_sample_with pause_5.wav: Cosine Similarity = 0.9604483248626403, Transcription Time = 2.2589645385742188
Processed 40_words_sample_1.wav: Cosine Similarity = 1.0000000000000002, Transcription Time = 2.4324982166290283
Processed 40_words_sample_with pause_1.wav: Cosine Similarity = 0.9755983610715643, Transcription Time = 2.4055705070495605
Processed 40_words_sample_2.wav: Cosine Similarity = 1.0000000000000007, Transcription Time = 2.326782464981079
Processed 40_words_sample_with pause_2.wav: Cosine Similarity = 0.9390904520133413, Transcription Time = 2.55617094039917
Processed 40_words_sample_3.wav: Cosine Similarity = 0.8971762009003741, Transcription Time = 2.2703371047973633
Processed 40_words_sample_with pause_3.wav: Cosine Similarity = 0.9299285686345931, Transcription Time = 2.3168065547943115
Processed 40_words_sample_4.wav: Cosine Similarity = 0.9999999999999992, Transcription Time = 2.2948665618896484
Processed 40_words_sample_with pause_4.wav: Cosine Simi

Processing JSON files:  70%|███████   | 7/10 [02:19<01:05, 21.73s/file]

Processed 40_words_sample_with pause_5.wav: Cosine Similarity = 0.8975380051197426, Transcription Time = 2.4863531589508057
Processed 40_words_sample_1.wav: Cosine Similarity = 0.875746306137646, Transcription Time = 2.4305057525634766
Processed 40_words_sample_with pause_1.wav: Cosine Similarity = 0.8955324150715732, Transcription Time = 2.5192666053771973
Processed 40_words_sample_2.wav: Cosine Similarity = 0.8874159077294704, Transcription Time = 2.3267805576324463
Processed 40_words_sample_with pause_2.wav: Cosine Similarity = 0.9143057862384296, Transcription Time = 2.564147472381592
Processed 40_words_sample_3.wav: Cosine Similarity = 0.7795474421543919, Transcription Time = 2.2370200157165527
Processed 40_words_sample_with pause_3.wav: Cosine Similarity = 0.9219942757300631, Transcription Time = 2.441474676132202
Processed 40_words_sample_4.wav: Cosine Similarity = 0.8249305762475405, Transcription Time = 2.2739222049713135
Processed 40_words_sample_with pause_4.wav: Cosine Simi

Processing JSON files:  80%|████████  | 8/10 [02:43<00:44, 22.50s/file]

Processed 40_words_sample_with pause_5.wav: Cosine Similarity = 0.9032225618759107, Transcription Time = 2.4494540691375732
Processed 50_words_sample_1.wav: Cosine Similarity = 1.0000000000000002, Transcription Time = 2.559157133102417
Processed 50_words_sample_with pause_1.wav: Cosine Similarity = 0.9800376809663689, Transcription Time = 2.6604340076446533
Processed 50_words_sample_2.wav: Cosine Similarity = 1.0000000000000007, Transcription Time = 2.4424710273742676
Processed 50_words_sample_with pause_2.wav: Cosine Similarity = 0.9540892173387038, Transcription Time = 2.554109811782837
Processed 50_words_sample_3.wav: Cosine Similarity = 1.0000000000000002, Transcription Time = 2.509294033050537
Processed 50_words_sample_with pause_3.wav: Cosine Similarity = 0.9236502692895245, Transcription Time = 2.5920727252960205
Processed 50_words_sample_4.wav: Cosine Similarity = 1.0000000000000002, Transcription Time = 2.394598960876465
Processed 50_words_sample_with pause_4.wav: Cosine Simil

Processing JSON files:  90%|█████████ | 9/10 [03:09<00:23, 23.44s/file]

Processed 50_words_sample_with pause_5.wav: Cosine Similarity = 0.7381335090196928, Transcription Time = 2.7227234840393066
Processed 50_words_sample_1.wav: Cosine Similarity = 0.8542408211833338, Transcription Time = 2.5272490978240967
Processed 50_words_sample_with pause_1.wav: Cosine Similarity = 0.8695641616041424, Transcription Time = 2.5671403408050537
Processed 50_words_sample_2.wav: Cosine Similarity = 0.8408180853367483, Transcription Time = 2.4245195388793945
Processed 50_words_sample_with pause_2.wav: Cosine Similarity = 0.8776782349316483, Transcription Time = 2.5551693439483643
Processed 50_words_sample_3.wav: Cosine Similarity = 0.8619161172033042, Transcription Time = 2.526247978210449
Processed 50_words_sample_with pause_3.wav: Cosine Similarity = 0.8830874083542424, Transcription Time = 2.6758480072021484
Processed 50_words_sample_4.wav: Cosine Similarity = 0.8929821067965046, Transcription Time = 2.436497449874878
Processed 50_words_sample_with pause_4.wav: Cosine Sim

Processing JSON files: 100%|██████████| 10/10 [03:34<00:00, 21.48s/file]

Processed 50_words_sample_with pause_5.wav: Cosine Similarity = 0.7032825996613278, Transcription Time = 2.808494806289673
Results written to ./proccessed/Oliver_base.csv



