In [1]:
import os
import json
import time
import csv
import whisper
import pandas as pd
from tqdm import tqdm

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import jiwer
from nltk.translate.bleu_score import sentence_bleu
import Levenshtein

Load Whisper model

In [4]:
def load_whisper_model(model_size="base"):
    """Load a Whisper model by size name.

    Args:
        model_size: Name handed unchanged to ``whisper.load_model``
            (defaults to ``"base"``).

    Returns:
        Whatever ``whisper.load_model`` returns for that name.
    """
    loaded_model = whisper.load_model(model_size)
    return loaded_model

Get list of files in the directory

In [5]:
def get_files_in_directory(folder_path, file_extension):
    """List the entries of a folder whose names end with a given suffix.

    Args:
        folder_path: Directory to list with ``os.listdir``.
        file_extension: Suffix (for example ``".wav"``) a name must end with;
            the comparison is case-sensitive.

    Returns:
        The matching entry names (not full paths), sorted ascending.
    """
    matching_names = []
    for entry_name in os.listdir(folder_path):
        if entry_name.endswith(file_extension):
            matching_names.append(entry_name)
    matching_names.sort()
    return matching_names

Load transcription data from JSON file

In [6]:
def load_transcription_data(json_file_path):
    """Read a JSON file and return its parsed content.

    The file is opened explicitly as UTF-8 so that non-ASCII characters in the
    transcriptions are decoded the same way on every platform instead of
    depending on the locale's default encoding.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file content.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

Get matching transcription from JSON

In [7]:
def get_matching_transcription(transcription_data, sample_name):
    """Look up the transcription recorded for one sample.

    Args:
        transcription_data: Iterable of mappings, each read through its
            ``'sample_name'`` and ``'transcription'`` keys.
        sample_name: Value to match against each entry's ``'sample_name'``.

    Returns:
        The ``'transcription'`` of the first matching entry, or ``None`` when
        no entry matches.
    """
    for entry in transcription_data:
        if entry['sample_name'] == sample_name:
            return entry['transcription']
    return None

Transcribe audio and measure time

In [8]:
def transcribe_audio_with_time(model, wav_file_path):
    """Transcribe one audio file and report how long the call took.

    Args:
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``'text'`` entry.
        wav_file_path: Path of the audio file passed to ``model.transcribe``.

    Returns:
        A ``(text, seconds)`` tuple: the ``'text'`` entry of the result and the
        elapsed time of the ``transcribe`` call in seconds.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    result = model.transcribe(wav_file_path)
    transcription_time = time.perf_counter() - start_time
    return result['text'], transcription_time

Calculate Word Error Rate (WER)

In [9]:
def calculate_wer(reference, hypothesis):
    """Return the word error rate of ``hypothesis`` against ``reference``.

    Both arguments are forwarded, in that order, to ``jiwer.wer``.
    """
    word_error_rate = jiwer.wer(reference, hypothesis)
    return word_error_rate

Calculate Character Error Rate (CER)

In [10]:
def calculate_cer(reference, hypothesis):
    """Return the character error rate of ``hypothesis`` against ``reference``.

    Both arguments are forwarded, in that order, to ``jiwer.cer``.
    """
    character_error_rate = jiwer.cer(reference, hypothesis)
    return character_error_rate

Calculate Sentence Error Rate (SER)

In [11]:
def calculate_ser(reference_sentences, hypothesis_sentences):
    """Compute the sentence error rate between two aligned sentence lists.

    A sentence counts as an error when the hypothesis at the same position is
    not exactly equal to the reference. References that have no hypothesis at
    all (hypothesis list shorter than the reference list) are counted as
    errors too, instead of being silently treated as correct.

    Args:
        reference_sentences: Iterable of reference sentences.
        hypothesis_sentences: Iterable of hypothesis sentences, aligned by
            position with the references. Extra hypotheses are ignored.

    Returns:
        Number of erroneous sentences divided by the number of references,
        or ``0.0`` when there are no references (previously this case raised
        ``ZeroDivisionError``).
    """
    references = list(reference_sentences)
    hypotheses = list(hypothesis_sentences)
    if not references:
        return 0.0

    mismatched = sum(ref != hyp for ref, hyp in zip(references, hypotheses))
    # zip() stops at the shorter list; every reference left without a
    # hypothesis is an error as well.
    missing = max(len(references) - len(hypotheses), 0)
    return (mismatched + missing) / len(references)

Calculate BLEU Score

In [12]:
def calculate_bleu(reference, hypothesis):
    """Score ``hypothesis`` against a single ``reference`` with ``sentence_bleu``.

    Both strings are tokenised with ``str.split()``; the reference tokens are
    wrapped in a one-element list because ``sentence_bleu`` is called with a
    list of references.
    """
    hypothesis_tokens = hypothesis.split()
    reference_token_lists = [reference.split()]
    return sentence_bleu(reference_token_lists, hypothesis_tokens)

Calculate Levenshtein Distance

In [13]:
def calculate_levenshtein(reference, hypothesis):
    """Return the edit distance between ``reference`` and ``hypothesis``.

    Both arguments are forwarded, in that order, to ``Levenshtein.distance``.
    """
    edit_distance = Levenshtein.distance(reference, hypothesis)
    return edit_distance

Calculate Cosine Similarity

In [14]:
def calculate_cosine_similarity(reference, hypothesis):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    A ``TfidfVectorizer`` is fitted on just the two strings, the resulting
    matrix is converted to a dense array, and the off-diagonal entry
    ``[0, 1]`` of the pairwise ``cosine_similarity`` matrix is returned.
    """
    documents = [reference, hypothesis]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    dense_vectors = tfidf_matrix.toarray()
    similarity_matrix = cosine_similarity(dense_vectors)
    # Row 0 is the reference, column 1 the hypothesis.
    return similarity_matrix[0, 1]

Function to write results to a CSV file using pandas

In [15]:
def write_results_to_csv_pandas(csv_file_path, results):
    """Write the evaluation rows to a CSV file with a fixed header.

    Args:
        csv_file_path: Destination of the CSV. When it is a filesystem path
            its parent directory is created if it does not exist yet, so a
            finished run is not lost to a missing output folder.
        results: Sequence of rows, each holding ten values in the column
            order listed below.

    Returns:
        None. The file is written without the DataFrame index.
    """
    column_names = [
        "File",
        "Whisper Output",
        "Correct Transcription",
        "WER",
        "CER",
        "SER",
        "BLEU",
        "Levenshtein Distance",
        "Cosine Similarity",
        "Transcription Time",
    ]

    # Only paths have a parent directory; file-like targets are passed through.
    if isinstance(csv_file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(csv_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(results, columns=column_names)
    df.to_csv(csv_file_path, index=False)

Process files and save the result to CSV with tqdm progress bar

In [16]:
def process_files_and_save_to_csv_pandas(folder_path, model, csv_file_path, samples_per_file=5):
    """Transcribe the samples referenced by a folder's JSON files and save the metrics.

    For every ``.json`` file in ``folder_path`` the reference transcriptions
    are loaded. Then, for sample numbers ``1..samples_per_file``, the audio
    file ``<base>_<i>.wav`` (``<base>`` is the JSON file name with the text
    ``_transcription.json`` removed) is transcribed with ``model`` and compared
    with the reference stored under ``sample_<i>``. Samples whose audio file
    or reference transcription is missing are reported with ``print`` and
    skipped.

    Args:
        folder_path: Directory holding the JSON and WAV files.
        model: Transcription model passed to ``transcribe_audio_with_time``.
        csv_file_path: Destination passed to ``write_results_to_csv_pandas``.
        samples_per_file: How many numbered samples to look for per JSON file
            (defaults to 5, the previously hard-coded count).

    Returns:
        None. One row per processed sample is written to ``csv_file_path``.
    """
    # Only the JSON files drive the run; WAV names are derived from them below.
    json_files = get_files_in_directory(folder_path, '.json')

    results = []

    for json_file in tqdm(json_files, desc="Processing JSON files", unit="file"):  # Outer progress bar
        # Load the transcription data from JSON
        json_path = os.path.join(folder_path, json_file)
        transcription_data = load_transcription_data(json_path)

        base_name = json_file.replace("_transcription.json", "")

        for i in tqdm(range(1, samples_per_file + 1), desc=f"Processing {json_file}", leave=False, unit="sample"):  # Inner loop progress bar
            wav_file = f"{base_name}_{i}.wav"
            wav_path = os.path.join(folder_path, wav_file)

            if not os.path.exists(wav_path):
                print(f"Audio file {wav_file} not found.")
                continue

            sample_name = f"sample_{i}"
            correct_transcription = get_matching_transcription(transcription_data, sample_name)

            # A missing entry and an empty reference are both skipped.
            if not correct_transcription:
                print(f"No matching transcription found for {wav_file} in {json_file}.")
                continue

            # Transcribe the audio and measure time
            whisper_output, transcription_time = transcribe_audio_with_time(model, wav_path)

            # Calculate WER, CER, SER, BLEU, Levenshtein Distance, and Cosine Similarity
            wer = calculate_wer(correct_transcription, whisper_output)
            cer = calculate_cer(correct_transcription, whisper_output)
            # SER is computed over a single sentence pair, so it is either 0 or 1.
            ser = calculate_ser([correct_transcription], [whisper_output])
            bleu = calculate_bleu(correct_transcription, whisper_output)
            levenshtein_distance = calculate_levenshtein(correct_transcription, whisper_output)
            cosine_sim = calculate_cosine_similarity(correct_transcription, whisper_output)

            # Store result in the list (same order as the CSV columns)
            results.append([wav_file, whisper_output, correct_transcription, wer, cer, ser, bleu, levenshtein_distance, cosine_sim, transcription_time])

    # Write results to CSV using pandas
    write_results_to_csv_pandas(csv_file_path, results)

In [17]:
# Load the "base" Whisper model once; the same object is reused for every voice folder below.
model = load_whisper_model(model_size="base")

  checkpoint = torch.load(fp, map_location=device)


In [24]:
# "Male British (Oliver)" voice: folder that is scanned for JSON/WAV inputs, and the
# CSV file that receives its results when the processing function is called later.
folder_path1 = "../Voices/Male British (Oliver)"
csv_file_path1 = "data/Oliver_base.csv"

In [None]:
# NOTE(review): this cell has no execution count and repeats the two assignments of the
# preceding cell verbatim; it is redundant and can be deleted.
folder_path1 = "../Voices/Male British (Oliver)"
csv_file_path1 = "data/Oliver_base.csv"

In [25]:
# "Female British (Madelyn)" voice: folder that is scanned for JSON/WAV inputs, and the
# CSV file that receives its results when the processing function is called later.
folder_path2 = "../Voices/Female British (Madelyn)"
csv_file_path2 = "data/Madelyn_base.csv"

In [26]:
# "Female American (Nava)" voice: folder that is scanned for JSON/WAV inputs, and the
# CSV file that receives its results when the processing function is called later.
folder_path3 = "../Voices/Female American (Nava)"
csv_file_path3 = "data/Nava_base.csv"

In [27]:
# "Male American (Michael)" voice: folder that is scanned for JSON/WAV inputs, and the
# CSV file that receives its results when the processing function is called later.
folder_path4 = "../Voices/Male American (Michael)"
csv_file_path4 = "data/Michael_base.csv"

In [28]:
import warnings

# Silence only the "FP16 is not supported on CPU" warning; every other warning stays visible.
# NOTE(review): presumably raised during transcription on CPU-only machines - confirm.
warnings.filterwarnings(
    action="ignore",
    message="FP16 is not supported on CPU; using FP32 instead",
)

In [29]:
# Evaluate each voice folder with the shared model, in the same order as before,
# writing one CSV per voice.
voice_runs = [
    (folder_path1, csv_file_path1),
    (folder_path2, csv_file_path2),
    (folder_path3, csv_file_path3),
    (folder_path4, csv_file_path4),
]
for voice_folder, voice_csv in voice_runs:
    process_files_and_save_to_csv_pandas(voice_folder, model, voice_csv)

Processing JSON files: 100%|██████████| 5/5 [00:44<00:00,  8.89s/file]
Processing JSON files: 100%|██████████| 5/5 [00:44<00:00,  8.91s/file]
Processing JSON files: 100%|██████████| 5/5 [00:44<00:00,  8.99s/file]
Processing JSON files: 100%|██████████| 5/5 [00:44<00:00,  8.95s/file]


In [9]:
from pydub import AudioSegment
import numpy as np

# Load one clean sample recording as the starting point for the simulation.
audio = AudioSegment.from_wav("../Voices/Female American (Nava)/10_words_sample_1.wav")

# Simulate a phone-call channel: a low-pass at 3400 Hz followed by a high-pass at
# 300 Hz, which together act as a 300 Hz - 3400 Hz band-pass.
phone_call_audio = audio.low_pass_filter(3400).high_pass_filter(300)

# Generate white noise
def generate_white_noise(duration_ms, sample_rate, volume_db=-50, seed=None):
    """Create a mono, 16-bit white-noise ``AudioSegment``.

    Gaussian samples are scaled so that their standard deviation sits
    ``volume_db`` decibels relative to the 16-bit full-scale value, then
    converted to 16-bit integers. The conversion matters: the segment is
    declared with ``sample_width=2``, so handing over the raw bytes of a
    float64 array (8 bytes per sample) would be misread as PCM data and yield
    a clip four times too long.

    Args:
        duration_ms: Length of the noise in milliseconds.
        sample_rate: Frames per second of the generated segment.
        volume_db: Noise level in dB relative to 16-bit full scale
            (negative values are quieter).
        seed: Optional seed for a dedicated random generator, making the
            noise reproducible. With ``None`` the global ``np.random`` state
            is used, as before.

    Returns:
        An ``AudioSegment`` with ``sample_width=2`` and ``channels=1`` holding
        ``int(duration_ms / 1000 * sample_rate)`` samples.
    """
    num_samples = int((duration_ms / 1000.0) * sample_rate)
    rng = np.random if seed is None else np.random.default_rng(seed)

    int16_range = np.iinfo(np.int16)
    amplitude = int16_range.max * (10 ** (volume_db / 20))  # Set volume
    samples = rng.normal(0, 1, num_samples) * amplitude

    # Clip before casting so loud settings saturate instead of wrapping around.
    pcm_samples = np.clip(np.round(samples), int16_range.min, int16_range.max).astype(np.int16)

    return AudioSegment(
        pcm_samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=2,  # 16-bit audio
        channels=1       # Mono
    )

# Read the frame rate and length of the filtered clip so the noise can match them
# (len() of the segment is used here as its duration in milliseconds).
sample_rate = phone_call_audio.frame_rate
duration_ms = len(phone_call_audio)

# Generate white noise for the same duration and frame rate as the audio
noise = generate_white_noise(duration_ms, sample_rate)

# Soften the noise with a 3000 Hz low-pass filter
noise = noise.low_pass_filter(3000)  # Remove harsh higher frequencies

# Attenuate the noise, then overlay it on the phone-call audio from position 0.
# With mix_ratio = 0.05 the amount subtracted from the noise is 30 * (1 - 0.05) = 28.5.
# NOTE(review): presumably dB of gain reduction via the segment's `-` operator - confirm.
mix_ratio = 0.05  # Smaller ratio -> more attenuation of the noise
noisy_audio = phone_call_audio.overlay(noise - 30 * (1 - mix_ratio), position=0)  # Noise mixed in at reduced volume

# Apply a second 300 Hz high-pass filter, this time to the mixed signal
noisy_audio = noisy_audio.high_pass_filter(300)

# Export the noisy phone call audio.
# export() returns the output file object (it was echoed as this cell's result);
# close it so the handle is not left open and the cell produces no stray output.
noisy_audio.export("noisy_phone_call6.wav", format="wav").close()


<_io.BufferedRandom name='noisy_phone_call6.wav'>