In [15]:
import os
import time
import json
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from vosk import Model, KaldiRecognizer
import wave

In [16]:
def load_vosk_model(model_path):
    """Construct and return a Vosk ``Model`` from ``model_path``."""
    vosk_model = Model(model_path)
    return vosk_model

In [17]:
def get_files_in_directory(folder_path, file_extension, keyword=None):
    """List the entries of ``folder_path`` that end with ``file_extension``.

    When ``keyword`` is truthy, only names that also contain it are kept.
    The names (not full paths) are returned in sorted order.
    """
    matches = []
    for entry in os.listdir(folder_path):
        if not entry.endswith(file_extension):
            continue
        # A falsy keyword (None or "") disables the substring filter.
        if keyword and keyword not in entry:
            continue
        matches.append(entry)
    matches.sort()
    return matches

In [18]:
def load_transcription_data(json_file_path):
    """Parse the JSON file at ``json_file_path`` and return the decoded object.

    The file is read as UTF-8 rather than the platform default encoding, so
    transcripts containing non-ASCII characters decode the same way on every
    machine.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


In [19]:
def get_matching_transcription(transcription_data, sample_name):
    """Return the ``'transcription'`` of the first entry named ``sample_name``.

    ``transcription_data`` is iterated in order; each entry is indexed by
    ``'sample_name'`` and, on a match, by ``'transcription'``. ``None`` is
    returned when no entry matches.
    """
    for entry in transcription_data:
        if entry['sample_name'] == sample_name:
            return entry['transcription']
    return None


In [20]:
def transcribe_audio_with_time(vosk_model, wav_file_path):
    """Transcribe a WAV file with Vosk and measure how long recognition takes.

    Args:
        vosk_model: Model object handed to ``KaldiRecognizer``.
        wav_file_path: Path of the WAV file to transcribe.

    Returns:
        ``(text, seconds)`` where ``text`` is the recognised utterances joined
        by single spaces and ``seconds`` is the elapsed recognition time.
        ``(None, 0)`` is returned, after printing a message, when the file is
        not mono, 2-byte-sample, uncompressed PCM.
    """
    frames_per_chunk = 4000

    # The context manager closes the file on every path, including the
    # early return for unsupported formats.
    with wave.open(wav_file_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print(f"Audio file {wav_file_path} must be WAV format mono PCM.")
            return None, 0

        rec = KaldiRecognizer(vosk_model, wf.getframerate())
        segments = []

        # perf_counter is monotonic, so the duration cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        while True:
            data = wf.readframes(frames_per_chunk)
            if len(data) == 0:
                break
            # AcceptWaveform returns True only when an utterance has ended.
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get("text", ""))

        # Flush whatever is still buffered after the last chunk; without this
        # the final utterance never reaches the transcript.
        segments.append(json.loads(rec.FinalResult()).get("text", ""))

        transcription_time = time.perf_counter() - start_time

    # Join with spaces so words from adjacent utterances are not glued together.
    transcription_text = " ".join(segment for segment in segments if segment)
    return transcription_text, transcription_time


In [21]:
def write_results_to_csv_pandas(csv_file_path, results):
    """Write the result rows to ``csv_file_path`` as CSV, without an index column.

    Args:
        csv_file_path: Destination passed to ``DataFrame.to_csv``. When it is
            a filesystem path, missing parent directories are created first.
        results: Rows of five values, in the order File, Vosk Output,
            Correct Transcription, Cosine Similarity, Transcription Time.
    """
    columns = ["File", "Vosk Output", "Correct Transcription", "Cosine Similarity", "Transcription Time"]
    df = pd.DataFrame(results, columns=columns)

    # Create the output folder up front so a long transcription run is not
    # lost to a missing directory. Non-path targets (e.g. buffers) are left
    # for to_csv to handle.
    if isinstance(csv_file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(csv_file_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(csv_file_path, index=False)


In [22]:
def calculate_cosine_similarity(reference, hypothesis):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on exactly the two strings, so the
    vocabulary and weights come from this pair alone.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([reference, hypothesis])
    pairwise = cosine_similarity(tfidf_matrix.toarray())
    # Row 0 is the reference and column 1 the hypothesis.
    return pairwise[0, 1]


In [23]:
def process_files_and_save_to_csv_pandas(audio_folder_path, transcription_folder_path, vosk_model, csv_file_path, samples_per_file=5):
    """Transcribe every sample referenced by the transcription files and save a CSV.

    For each ``*_transcription*.json`` file in ``transcription_folder_path``
    the matching audio files are looked up by name in ``audio_folder_path``:

    * ``<base>_transcription.json``            -> ``<base>_<i>.wav``
    * ``<base>_transcription_with_pause.json`` -> ``<base>_with_pause_<i>.wav``

    where ``i`` runs from 1 to ``samples_per_file`` and the reference text is
    the JSON entry whose ``sample_name`` is ``sample_<i>``. Samples with a
    missing audio file, a missing or empty reference, or an unsupported audio
    format are skipped.

    Args:
        audio_folder_path: Folder containing the WAV files.
        transcription_folder_path: Folder containing the transcription JSON files.
        vosk_model: Model passed through to ``transcribe_audio_with_time``.
        csv_file_path: Destination of the results CSV.
        samples_per_file: Number of numbered samples expected per JSON file
            (default 5).

    Raises:
        FileNotFoundError: If ``audio_folder_path`` is not an existing directory.
    """
    # Fail fast on a bad audio folder instead of reporting every sample as
    # missing. This takes the place of a directory listing whose result was
    # never used.
    if not os.path.isdir(audio_folder_path):
        raise FileNotFoundError(f"Audio folder not found: {audio_folder_path}")

    json_files = get_files_in_directory(transcription_folder_path, '.json', keyword='_transcription')

    results = []

    for json_file in tqdm(json_files, desc="Processing JSON files", unit="file"):
        json_path = os.path.join(transcription_folder_path, json_file)
        transcription_data = load_transcription_data(json_path)

        # The "with pause" variant is identified by the JSON file name.
        is_with_pause = "_with_pause" in json_file

        # Strip the longer suffix first so "_with_pause" is not left behind.
        base_name = json_file.replace("_transcription_with_pause", "").replace("_transcription", "").replace(".json", "")

        sample_numbers = range(1, samples_per_file + 1)
        for i in tqdm(sample_numbers, desc=f"Processing {json_file}", leave=False, unit="sample"):
            if is_with_pause:
                wav_file = f"{base_name}_with_pause_{i}.wav"
            else:
                wav_file = f"{base_name}_{i}.wav"

            wav_path = os.path.join(audio_folder_path, wav_file)

            if not os.path.exists(wav_path):
                print(f"Audio file {wav_file} not found.")
                continue

            sample_name = f"sample_{i}"
            correct_transcription = get_matching_transcription(transcription_data, sample_name)

            # Covers both a missing entry (None) and an empty reference text.
            if not correct_transcription:
                print(f"No matching transcription found for {wav_file} in {json_file}.")
                continue

            vosk_output, transcription_time = transcribe_audio_with_time(vosk_model, wav_path)

            # None means the audio format was rejected by the transcriber.
            if vosk_output is None:
                continue

            cosine_sim = calculate_cosine_similarity(correct_transcription, vosk_output)

            results.append([wav_file, vosk_output, correct_transcription, cosine_sim, transcription_time])

    write_results_to_csv_pandas(csv_file_path, results)

In [24]:
# Load the Vosk model once; this single instance is passed to every processing call below.
vosk_model = load_vosk_model("../../../vosk-model-en-us-0.42-gigaspeech/") 

In [25]:
# Run configuration. Each of the following cells defines one numbered triple:
#   folder_pathN               - folder holding the WAV files for run N
#   transcription_folder_pathN - folder holding the reference-transcription JSON files
#   csv_file_pathN             - CSV file the results of run N are written to
# The triples are consumed, in numeric order, by the
# process_files_and_save_to_csv_pandas calls at the end of the notebook.
# The transcription folder depends only on the pause condition ("no pause" /
# "with pause"), so the same two folders recur across all runs.
# folder_path1 has no trailing slash, unlike the others; the paths are
# combined with os.path.join, so this makes no difference.
folder_path1 = "../Voices/Female American (Nova)/no pause"
transcription_folder_path1 = "../Voices/Transcription/no pause"
csv_file_path1 = "processed/Nova_no_pause.csv"

In [26]:
folder_path2 = "../Voices/Female American (Nova)/with pause/"
transcription_folder_path2 = "../Voices/Transcription/with pause"
csv_file_path2 = "processed/Nova_with_pause.csv"

In [27]:
folder_path3 = "../Voices/Female American (Nova) with Noise/no pause/"
transcription_folder_path3 = "../Voices/Transcription/no pause"
csv_file_path3 = "processed/Nova_Noise_no_pause.csv"

In [28]:
folder_path4 = "../Voices/Female American (Nova) with Noise/with pause/"
transcription_folder_path4 = "../Voices/Transcription/with pause"
csv_file_path4 = "processed/Nova_Noise_with_pause.csv"

In [29]:
folder_path5 = "../Voices/Female British (Madelyn)/no pause/"
transcription_folder_path5 = "../Voices/Transcription/no pause"
csv_file_path5 = "processed/Madelyn_no_pause.csv"

In [30]:
folder_path6 = "../Voices/Female British (Madelyn)/with pause/"
transcription_folder_path6 = "../Voices/Transcription/with pause"
csv_file_path6 = "processed/Madelyn_with_pause.csv"

In [31]:
folder_path7 = "../Voices/Female British (Madelyn) with Noise/with pause/"
transcription_folder_path7 = "../Voices/Transcription/with pause"
csv_file_path7 = "processed/Madelyn_Noise_with_pause.csv"

In [32]:
folder_path8 = "../Voices/Female British (Madelyn) with Noise/no pause/"
transcription_folder_path8 = "../Voices/Transcription/no pause"
csv_file_path8 = "processed/Madelyn_Noise_no_pause.csv"

In [33]:
folder_path9 = "../Voices/Male American (Michael)/no pause/"
transcription_folder_path9 = "../Voices/Transcription/no pause"
csv_file_path9 = "processed/Michael_no_pause.csv"

In [34]:
folder_path10 = "../Voices/Male American (Michael)/with pause/"
transcription_folder_path10 = "../Voices/Transcription/with pause"
csv_file_path10 = "processed/Michael_with_pause.csv"

In [35]:
folder_path11 = "../Voices/Male American (Michael) with Noise/no pause/"
transcription_folder_path11 = "../Voices/Transcription/no pause"
csv_file_path11 = "processed/Michael_Noise_no_pause.csv"

In [36]:
folder_path12 = "../Voices/Male American (Michael) with Noise/with pause/"
transcription_folder_path12 = "../Voices/Transcription/with pause"
csv_file_path12 = "processed/Michael_Noise_with_pause.csv"

In [37]:
folder_path13 = "../Voices/Male British (Oliver)/no pause/"
transcription_folder_path13 = "../Voices/Transcription/no pause"
csv_file_path13 = "processed/Oliver_no_pause.csv"

In [38]:
folder_path14 = "../Voices/Male British (Oliver)/with pause/"
transcription_folder_path14 = "../Voices/Transcription/with pause"
csv_file_path14 = "processed/Oliver_with_pause.csv"

In [39]:
folder_path15 = "../Voices/Male British (Oliver) with Noise/no pause/"
transcription_folder_path15 = "../Voices/Transcription/no pause"
csv_file_path15 = "processed/Oliver_Noise_no_pause.csv"

In [40]:
folder_path16 = "../Voices/Male British (Oliver) with Noise/with pause/"
transcription_folder_path16 = "../Voices/Transcription/with pause"
csv_file_path16 = "processed/Oliver_Noise_with_pause.csv"

In [41]:
import warnings

# Suppress warnings whose message matches the FP16-on-CPU text below.
# NOTE(review): nothing visible in this notebook emits this message; it looks
# like a leftover from a different speech-to-text setup. Confirm whether the
# filter is still needed.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

In [43]:
# One (audio folder, transcription folder, output CSV) triple per run, listed
# in execution order. All runs share the single loaded Vosk model.
run_configs = [
    (folder_path1, transcription_folder_path1, csv_file_path1),
    (folder_path2, transcription_folder_path2, csv_file_path2),
    (folder_path3, transcription_folder_path3, csv_file_path3),
    (folder_path4, transcription_folder_path4, csv_file_path4),
    (folder_path5, transcription_folder_path5, csv_file_path5),
    (folder_path6, transcription_folder_path6, csv_file_path6),
    (folder_path7, transcription_folder_path7, csv_file_path7),
    (folder_path8, transcription_folder_path8, csv_file_path8),
    (folder_path9, transcription_folder_path9, csv_file_path9),
    (folder_path10, transcription_folder_path10, csv_file_path10),
    (folder_path11, transcription_folder_path11, csv_file_path11),
    (folder_path12, transcription_folder_path12, csv_file_path12),
    (folder_path13, transcription_folder_path13, csv_file_path13),
    (folder_path14, transcription_folder_path14, csv_file_path14),
    (folder_path15, transcription_folder_path15, csv_file_path15),
    (folder_path16, transcription_folder_path16, csv_file_path16),
]

for audio_dir, transcription_dir, csv_path in run_configs:
    process_files_and_save_to_csv_pandas(audio_dir, transcription_dir, vosk_model, csv_path)

Processing JSON files: 100%|██████████| 5/5 [01:40<00:00, 20.07s/file]
Processing JSON files: 100%|██████████| 5/5 [02:32<00:00, 30.60s/file]
Processing JSON files: 100%|██████████| 5/5 [01:42<00:00, 20.46s/file]
Processing JSON files: 100%|██████████| 5/5 [02:38<00:00, 31.73s/file]
Processing JSON files: 100%|██████████| 5/5 [01:46<00:00, 21.32s/file]
Processing JSON files: 100%|██████████| 5/5 [02:19<00:00, 27.92s/file]
Processing JSON files: 100%|██████████| 5/5 [02:26<00:00, 29.38s/file]
Processing JSON files: 100%|██████████| 5/5 [01:47<00:00, 21.46s/file]
Processing JSON files: 100%|██████████| 5/5 [01:34<00:00, 18.90s/file]
Processing JSON files: 100%|██████████| 5/5 [02:22<00:00, 28.53s/file]
Processing JSON files: 100%|██████████| 5/5 [01:35<00:00, 19.15s/file]
Processing JSON files: 100%|██████████| 5/5 [02:29<00:00, 29.83s/file]
Processing JSON files: 100%|██████████| 5/5 [01:52<00:00, 22.43s/file]
Processing JSON files: 100%|██████████| 5/5 [02:26<00:00, 29.32s/file]
Proces