In [2]:
import whisper
import json
import moviepy.editor as mp
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the ground-truth annotations. The encoding is pinned to UTF-8 so the
# file is decoded the same way on every platform instead of relying on the
# locale default that open() falls back to.
with open("test_dataset.json", encoding="utf-8") as file:
    ground_truth = json.load(file)

# The top-level keys are used as video paths by the detection loop
# (each key is handed to VideoFileClip); the values are per-video dicts.
video_paths = list(ground_truth)

In [None]:
# Load the Whisper model once; it is reused for every video in the loop.
model = whisper.load_model('medium')

for video in video_paths:
    # Ground-truth entry for this video; new result fields are written
    # straight into this dict, so `ground_truth` is updated in place.
    info = ground_truth[video]
    truth = info.get('language')
    print(truth)

    # Extract the audio track to a temporary wav file in the working directory.
    audio_file = f'{os.path.basename(video)}.wav'
    clip = mp.VideoFileClip(video)
    try:
        if clip.audio is None:
            # A video without an audio track would otherwise fail with an
            # opaque AttributeError on `None.write_audiofile`.
            raise ValueError(f'{video} has no audio track to analyse')
        clip.audio.write_audiofile(audio_file)

        # whisper.load_audio decodes the whole file into memory, so the wav
        # is no longer needed once this call returns.
        audio = whisper.load_audio(audio_file)
    finally:
        # Always release the clip's reader resources and the temporary wav,
        # even when extraction or loading fails part-way through.
        clip.close()
        if os.path.exists(audio_file):
            os.remove(audio_file)

    # Language detection only looks at a single fixed-length window, so the
    # audio is padded/trimmed before the spectrogram is computed.
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)

    # Pick the language with the highest probability and keep that
    # probability as the confidence score.
    language = max(probs, key=probs.get)
    confidence = probs[language]
    print(language)
    print(confidence)

    # Record the detection result alongside the ground truth.
    info['detected_language'] = language
    info['confidence'] = confidence
    info['correct'] = truth == language

In [13]:
# Collect the confidence of every failed detection and compute their average.
confidence_scores = [
    data['confidence']
    for data in ground_truth.values()
    if not data['correct']
]

# Average confidence over the failed detections; None when nothing failed,
# so an all-correct run does not raise ZeroDivisionError.
average_confidence = (
    sum(confidence_scores) / len(confidence_scores)
    if confidence_scores
    else None
)

# Display the individual scores as the cell output.
confidence_scores


[0.13250073790550232,
 0.5220446586608887,
 0.5541367530822754,
 0.5270262956619263,
 0.21682201325893402]

In [6]:
# Export the annotated ground truth (labels plus detection results) to JSON.
# The file is only written, never read back, so plain "w" is sufficient;
# the encoding is pinned so the result does not depend on the locale.
with open("results.json", "w", encoding="utf-8") as file:
    json.dump(ground_truth, file)