In [2]:
import jsonlines
import torchaudio
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, GenerationConfig
from pathlib import Path
import torch
import librosa
import IPython.display as ipd

In [3]:
def get_suppressed_tokens(tokenizer):
    # if keep_numbers_string:
    #     number_tokens = [i for i in range(whisper_processor.tokenizer.vocab_size) if all(c in "0123456789$£&\"\'" for c in whisper_processor.tokenizer.decode([i]).removeprefix(" "))]
    tokens_file_path = Path("./cache_tokens")
    bad_tokens_str = [r"0",
                      r"1",
                      r"2",
                      r"3",
                      r"4",
                      r"5",
                      r"6",
                      r"7",
                      r"8",
                      r"9",
                      "$",
                      "£", ]
                      # "&",
                      # "\"",
                      # r"...",
                      # "..",
                      # "#",
                      # "№",
                      # "@",
                      # "*",
                      # "--"]
    # if tokens_file_path.exists():
    #     pass
    #     #tokens_dict = deserialize_data(tokens_file_path)
    # else:
    tokens_dict = dict()

    all_tokens_cached = True

    for token_num in range(tokenizer.vocab_size):
        token_str = tokenizer.decode([token_num]).removeprefix(" ")
        for bad_token in bad_tokens_str:
            if bad_token not in tokens_dict:
                tokens_dict[bad_token] = [False, [], []]
                all_tokens_cached = False

            if not tokens_dict[bad_token][0]:
                if bad_token in token_str:
                    tokens_dict[bad_token][1].append(token_num)
                    tokens_dict[bad_token][2].append(token_str)
        if all_tokens_cached:
            break

    result = []
    for k, v in tokens_dict.items():
        tokens_dict[k][0] = True
        if k in bad_tokens_str:
            result += v[1]
    #serialize_data(tokens_dict, tokens_file_path)
    return list(set(result))


In [4]:
model_id = "openai/whisper-medium.en"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)

processor = AutoProcessor.from_pretrained(model_id)
 
    
generation_config = GenerationConfig.from_pretrained(model_id)
generation_config.suppress_tokens += get_suppressed_tokens(processor.tokenizer)
generation_config.suppress_tokens = list(set(generation_config.suppress_tokens))
generation_config.suppress_tokens.sort()

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

audio_file = '/home/jupyter/advanced/audio/audio_1670.wav'
audio_input, sample_rate = librosa.load(audio_file, sr=16000)

transcription = pipe(audio_file, generate_kwargs={"generation_config": generation_config})

print("Transcription:", transcription['text'])

Transcription:  Control here, surface to air missile heading zero six zero, engage the silver and orange camouflage helicopter. That's zero six zero, over.


In [5]:
audio_data, sampling_rate = librosa.load(audio_file, sr=None)
waveform, sample_rate = torchaudio.load(audio_file)
ipd.Audio(waveform, rate=sampling_rate)

In [25]:
# Define the path to the directory
data_dir = Path("/home/jupyter/advanced")

# Read data from a jsonl file and reformat it
data = {'key': [], 'audio': [], 'transcript': []}
with jsonlines.open(data_dir / "asr.jsonl") as reader:
    for obj in reader:
        for key, value in obj.items():
            if key == 'audio':
                data[key].append("/home/jupyter/advanced/audio/" + value)
            else:
                data[key].append(value)

# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.05 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))

In [31]:
#val_dataset['audio']

# Model Evaluation

In [4]:
import jiwer

# Function to calculate WER
def calculate_wer(references, hypotheses):
    wer_scores = []
    for ref, hyp in zip(references, hypotheses):
        wer_score = jiwer.wer(ref, hyp)
        wer_scores.append(wer_score)
    return wer_scores

# Calculate WER for each pair


In [26]:
references = val_dataset['transcript']
hypothesis = []

for out in pipe(KeyDataset(val_dataset, 'audio'), generate_kwargs={"generation_config": generation_config}):
    hypothesis.append(out['text'])

print(len(references))
print(len(hypothesis))

wer_scores = calculate_wer(references, hypothesis)

175
175


In [27]:
wer_scores = calculate_wer(references, hypothesis)
data = pd.DataFrame(dict(hypothesis=hypothesis, reference=references))
data


Unnamed: 0,hypothesis,reference
0,Torret Romeo heading two six five engage brown camouflage fighter plane with anti-air artillery target confirmed strike with precision stand by for impact,"Turret Romeo, heading two six five, engage brown camouflage fighter plane with anti-air artillery. Target confirmed. Strike with precision. Stand by for impact."
1,Control tower to air defense turret. Target is a white drone heading one eight zero. Deploy machine gun.,"Control tower to air defense turrets, target is a white drone heading one eight zero, deploy machine gun."
2,"Turret Bravo, deploy EMP on the Silver, Blue and Purple commercial aircraft heading two-niner-zero. Target locked, awaiting confirmation.","Turret Bravo, deploy EMP on the silver, blue, and purple commercial aircraft heading two niner zero. Target locked, awaiting confirmation."
3,"Control to Air Defence Turrets, deploy EMP towards the green, white and silver missile at heading zero six five. Engage and neutralise the target immediately. Over.","Control to air defense turrets, deploy EMP towards the green, white, and silver missile at heading zero six five. Engage and neutralize the target immediately. Over."
4,"Turret Alpha, deploy surface-to-air missiles at heading one five five. Target the orange, brown and black fighter plane. Defensive measures initiated. Standby for impact.","Turret Alpha, deploy surface-to-air missiles at heading one five five. Target the orange, brown, and black fighter plane. Defensive measures initiated. Standby for impact."
...,...,...
170,Control tower to Taurus. Prepare to deploy electromagnetic pulse. Target is a red commercial aircraft on heading one three five. Take action immediately.,"Control tower to turrets, prepare to deploy electromagnetic pulse. Target is a red commercial aircraft on heading one three five. Take action immediately."
171,"Engage target, white and orange helicopter heading one-niner-five with machine gun.","Engage target, white and orange helicopter, heading one niner five, with machine gun."
172,Control to air defense turrets. Focus your attention to heading two two five. Prepare to deploy EMP tool against the brown and white commercial aircraft. Engage and neutralize the target swiftly. Over.,"Control to air defense turrets, focus your attention to heading two two five. Prepare to deploy EMP tool against the brown and white commercial aircraft. Engage and neutralize the target swiftly. Over."
173,Control here. Prepare to engage Brown Fighter Jet at heading two-four-five using machine gun. Fire at will. Over.,Control here. Prepare to engage brown fighter jet at heading two four five using machine gun. Fire at will. Over.


In [30]:


wer = jiwer.wer(list(data["hypothesis"]), list(data["reference"]))

print(f"WER: {wer * 100:.2f} %")

# Display the results
for i, score in enumerate(wer_scores):
    print(f"Reference {i+1}: {references[i]}")
    print(f"Hypothesis {i+1}: {hypothesis[i]}")
    print(f"WER: {score:.2%}\n")

WER: 21.08 %
Reference 1: Turret Romeo, heading two six five, engage brown camouflage fighter plane with anti-air artillery. Target confirmed. Strike with precision. Stand by for impact.
Hypothesis 1:  Torret Romeo heading two six five engage brown camouflage fighter plane with anti-air artillery target confirmed strike with precision stand by for impact
WER: 43.48%

Reference 2: Control tower to air defense turrets, target is a white drone heading one eight zero, deploy machine gun.
Hypothesis 2:  Control tower to air defense turret. Target is a white drone heading one eight zero. Deploy machine gun.
WER: 22.22%

Reference 3: Turret Bravo, deploy EMP on the silver, blue, and purple commercial aircraft heading two niner zero. Target locked, awaiting confirmation.
Hypothesis 3:  Turret Bravo, deploy EMP on the Silver, Blue and Purple commercial aircraft heading two-niner-zero. Target locked, awaiting confirmation.
WER: 30.00%

Reference 4: Control to air defense turrets, deploy EMP towa