In [38]:
import deepspeech
import os
import json
import pandas as pd
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import wave

In [39]:
def load_deepspeech_model(model_path, scorer_path):
    model = deepspeech.Model(model_path)
    model.enableExternalScorer(scorer_path)
    return model

In [40]:
def get_files_in_directory(folder_path, file_extension, keyword=None):
    if keyword:
        return sorted([f for f in os.listdir(folder_path) if f.endswith(file_extension) and keyword in f])
    else:
        return sorted([f for f in os.listdir(folder_path) if f.endswith(file_extension)])


In [41]:
def load_transcription_data(json_file_path):
    with open(json_file_path, 'r') as file:
        return json.load(file)

In [42]:
def get_matching_transcription(transcription_data, sample_name):
    return next((item['transcription'] for item in transcription_data if item['sample_name'] == sample_name), None)


In [88]:
from pydub import AudioSegment
import numpy as np
import time
def transcribe_audio_with_time(model, wav_file_path):
    start_time = time.time()

    # Load the audio file using pydub
    audio = AudioSegment.from_wav(wav_file_path)
    
    # Convert to mono and set the sample rate to 16000 Hz
    audio = audio.set_frame_rate(16000).set_channels(1)

    # Convert the audio to a numpy array with int16 data type
    raw_data = np.array(audio.get_array_of_samples(), dtype=np.int16)
    
    # DeepSpeech expects a bytes object
    buffer = raw_data.tobytes()
    
    # Use DeepSpeech model to perform transcription
    result = model.stt(buffer)
    
    end_time = time.time()
    transcription_time = end_time - start_time

    return result, transcription_time

In [89]:
def write_results_to_csv_pandas(csv_file_path, results):
    df = pd.DataFrame(results, columns=["File", "Whisper Output", "Correct Transcription", "Cosine Similarity", "Transcription Time"])
    df.to_csv(csv_file_path, index=False)


In [90]:
def calculate_cosine_similarity(reference, hypothesis):
    vectorizer = TfidfVectorizer().fit_transform([reference, hypothesis])
    vectors = vectorizer.toarray()
    cos_sim = cosine_similarity(vectors)[0, 1]
    return cos_sim


In [91]:
def process_files_and_save_to_csv_pandas(audio_folder_path, transcription_folder_path, model, csv_file_path):
    wav_files = get_files_in_directory(audio_folder_path, '.wav')
    json_files = get_files_in_directory(transcription_folder_path, '.json', keyword='_transcription')
    
    results = []
    
    for json_file in tqdm(json_files, desc="Processing JSON files", unit="file"):
        json_path = os.path.join(transcription_folder_path, json_file)
        transcription_data = load_transcription_data(json_path)
        is_with_pause = "_with_pause" in json_file
        base_name = json_file.replace("_transcription_with_pause", "").replace("_transcription", "").replace(".json", "")
        
        for i in tqdm(range(1, 6), desc=f"Processing {json_file}", leave=False, unit="sample"):
            if is_with_pause:
                wav_file = f"{base_name}_with_pause_{i}.wav"
            else:
                wav_file = f"{base_name}_{i}.wav"
            
            wav_path = os.path.join(audio_folder_path, wav_file)
            
            if not os.path.exists(wav_path):
                print(f"Audio file {wav_file} not found.")
                continue
            
            sample_name = f"sample_{i}"
            correct_transcription = get_matching_transcription(transcription_data, sample_name)
            
            if not correct_transcription:
                print(f"No matching transcription found for {wav_file} in {json_file}.")
                continue
            
            # Transcribe the audio and measure time using DeepSpeech
            whisper_output, transcription_time = transcribe_audio_with_time(model, wav_path)
            
            # Calculate cosine similarity
            cosine_sim = calculate_cosine_similarity(correct_transcription, whisper_output)
            
            # Store result
            results.append([wav_file, whisper_output, correct_transcription, cosine_sim, transcription_time])
    
    # Write results to CSV
    write_results_to_csv_pandas(csv_file_path, results)


In [92]:
folder_path1 = "../Voices/Female American (Nova)/no pause"
transcription_folder_path1 = "../Voices/Transcription/no pause"
csv_file_path1 = "processed/Nova_no_pause.csv"

In [93]:
folder_path2 = "../Voices/Female American (Nova)/with pause/"
transcription_folder_path2 = "../Voices/Transcription/with pause"
csv_file_path2 = "processed/Nova_with_pause.csv"

In [94]:
folder_path3 = "../Voices/Female American (Nova) with Noise/no pause/"
transcription_folder_path3 = "../Voices/Transcription/no pause"
csv_file_path3 = "processed/Nova_Noise_no_pause.csv"

In [95]:
folder_path4 = "../Voices/Female American (Nova) with Noise/with pause/"
transcription_folder_path4 = "../Voices/Transcription/with pause"
csv_file_path4 = "processed/Nova_Noise_with_pause.csv"

In [96]:
folder_path5 = "../Voices/Female British (Madelyn)/no pause/"
transcription_folder_path5 = "../Voices/Transcription/no pause"
csv_file_path5 = "processed/Madelyn_no_pause.csv"

In [97]:
folder_path6 = "../Voices/Female British (Madelyn)/with pause/"
transcription_folder_path6 = "../Voices/Transcription/with pause"
csv_file_path6 = "processed/Madelyn_with_pause.csv"

In [98]:
folder_path7 = "../Voices/Female British (Madelyn) with Noise/with pause/"
transcription_folder_path7 = "../Voices/Transcription/with pause"
csv_file_path7 = "processed/Madelyn_Noise_with_pause.csv"

In [99]:
folder_path8 = "../Voices/Female British (Madelyn) with Noise/no pause/"
transcription_folder_path8 = "../Voices/Transcription/no pause"
csv_file_path8 = "processed/Madelyn_Noise_no_pause.csv"

In [100]:
folder_path9 = "../Voices/Male American (Michael)/no pause/"
transcription_folder_path9 = "../Voices/Transcription/no pause"
csv_file_path9 = "processed/Michael_no_pause.csv"

In [101]:
folder_path10 = "../Voices/Male American (Michael)/with pause/"
transcription_folder_path10 = "../Voices/Transcription/with pause"
csv_file_path10 = "processed/Michael_with_pause.csv"

In [102]:
folder_path11 = "../Voices/Male American (Michael) with Noise/no pause/"
transcription_folder_path11 = "../Voices/Transcription/no pause"
csv_file_path11 = "processed/Michael_Noise_no_pause.csv"

In [103]:
folder_path12 = "../Voices/Male American (Michael) with Noise/with pause/"
transcription_folder_path12 = "../Voices/Transcription/with pause"
csv_file_path12 = "processed/Michael_Noise_with_pause.csv"

In [104]:
folder_path13 = "../Voices/Male British (Oliver)/no pause/"
transcription_folder_path13 = "../Voices/Transcription/no pause"
csv_file_path13 = "processed/Oliver_no_pause.csv"

In [105]:
folder_path14 = "../Voices/Male British (Oliver)/with pause/"
transcription_folder_path14 = "../Voices/Transcription/with pause"
csv_file_path14 = "processed/Oliver_with_pause.csv"

In [106]:
folder_path15 = "../Voices/Male British (Oliver) with Noise/no pause/"
transcription_folder_path15 = "../Voices/Transcription/no pause"
csv_file_path15 = "processed/Oliver_Noise_no_pause.csv"

In [107]:
folder_path16 = "../Voices/Male British (Oliver) with Noise/with pause/"
transcription_folder_path16 = "../Voices/Transcription/with pause"
csv_file_path16 = "processed/Oliver_Noise_with_pause.csv"

In [108]:
model_path = "../../../deepspeech-0.9.3-models.pbmm"
scorer_path = "../../../deepspeech-0.9.3-models.scorer"
model = load_deepspeech_model(model_path, scorer_path)

In [109]:
process_files_and_save_to_csv_pandas(folder_path1,transcription_folder_path1, model, csv_file_path1)
process_files_and_save_to_csv_pandas(folder_path2,transcription_folder_path2, model, csv_file_path2)
process_files_and_save_to_csv_pandas(folder_path3,transcription_folder_path3, model, csv_file_path3)
process_files_and_save_to_csv_pandas(folder_path4,transcription_folder_path4, model, csv_file_path4)
process_files_and_save_to_csv_pandas(folder_path5,transcription_folder_path5, model, csv_file_path5)
process_files_and_save_to_csv_pandas(folder_path6,transcription_folder_path6, model, csv_file_path6)
process_files_and_save_to_csv_pandas(folder_path7,transcription_folder_path7, model, csv_file_path7)
process_files_and_save_to_csv_pandas(folder_path8,transcription_folder_path8, model, csv_file_path8)
process_files_and_save_to_csv_pandas(folder_path9,transcription_folder_path9, model, csv_file_path9)
process_files_and_save_to_csv_pandas(folder_path10,transcription_folder_path10, model, csv_file_path10)
process_files_and_save_to_csv_pandas(folder_path11,transcription_folder_path11, model, csv_file_path11)
process_files_and_save_to_csv_pandas(folder_path12,transcription_folder_path12, model, csv_file_path12)
process_files_and_save_to_csv_pandas(folder_path13,transcription_folder_path13, model, csv_file_path13)
process_files_and_save_to_csv_pandas(folder_path14,transcription_folder_path14, model, csv_file_path14)
process_files_and_save_to_csv_pandas(folder_path15,transcription_folder_path15, model, csv_file_path15)
process_files_and_save_to_csv_pandas(folder_path16,transcription_folder_path16, model, csv_file_path16)

Processing JSON files:   0%|          | 0/5 [00:00<?, ?file/s]


ValueError: invalid literal for int() with base 10: b'\x0c\x00\x0b\x00\x0c\x00\n\x00\x0b\x00\x08\x00\x06\x00\x08\x00\x08\x00\x07\x00\x06\x00\x08\x00\t\x00\n\x00\n\x00\n\x00\x08\x00\x08\x00\x08\x00\n\x00\x0c\x00\r\x00\x0b\x00\n\x00\x0b\x00\n\x00\x0b\x00