In [1]:
!pip install -q --upgrade torch torchvision torchaudio
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q accelerate optimum bitsandbytes
!pip install -q ipython-autotime

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
faster-whisper 0.7.1 requires tokenizers==0.13.*, but you have tokenizers 0.14.1 which is incompatible.

In [1]:
%load_ext autotime

time: 286 µs (started: 2023-10-13 13:44:34 +00:00)


In [5]:
!wget https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/4469669.mp3

--2023-10-13 13:06:41--  https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/4469669.mp3
Resolving huggingface.co (huggingface.co)... 18.172.134.24, 18.172.134.4, 18.172.134.88, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.24|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/96/e4/96e4f69cd112b019dd764318570e47e5fe96de53d8c32a99d745e72d9086e355/09251982f6a864867d829525e510cdb53b421291d6629485d1f8ddc23e347512?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%274469669.mp3%3B+filename%3D%224469669.mp3%22%3B&response-content-type=audio%2Fmpeg&Expires=1697461601&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzQ2MTYwMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy85Ni9lNC85NmU0ZjY5Y2QxMTJiMDE5ZGQ3NjQzMTg1NzBlNDdlNWZlOTZkZTUzZDhjMzJhOTlkNzQ1ZTcyZDkwODZlMzU1LzA5MjUxOTgyZjZhODY0ODY3ZDgyOTUyNWU1MTBjZGI1M2I0MjEyOTFkNjYy

## BetterTransformer

In [3]:
import torch
from transformers import pipeline

# Speech-recognition pipeline around Whisper large-v2, loaded in half precision
# on the first CUDA device.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device="cuda:0",
    torch_dtype=torch.float16,
)

# Replace the pipeline's model with its BetterTransformer conversion; every later
# call to `pipe` runs through the converted model.
pipe.model = pipe.model.to_bettertransformer()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [6]:
# Transcribe the downloaded file with the BetterTransformer pipeline.
# The audio is processed in 30-second chunks, 16 chunks per batch, and
# timestamps are requested so the result carries a "chunks" list that the
# export cell reads.
torch_out = pipe(
    "4469669.mp3",
    return_timestamps=True,
    chunk_length_s=30,
    batch_size=16,
)

time: 2min 37s (started: 2023-10-13 13:21:05 +00:00)


In [25]:
# Dump the pipeline result to disk, one "[start - end] text" line per chunk.
with open("torch_output.txt", "w") as transcript_file:
    for chunk in torch_out["chunks"]:
        # Each chunk's "timestamp" is unpacked as a (start, end) pair.
        chunk_start, chunk_end = chunk["timestamp"]
        transcript_file.write(f"[{chunk_start} - {chunk_end}] {chunk['text']}\n")

time: 6.92 ms (started: 2023-10-13 13:42:35 +00:00)


In [7]:
# NOTE: this run fails with a CUDA out-of-memory (OOM) error, so it is left commented out.

# torch_out_with_config = pipe(
#     "4469669.mp3",
#     chunk_length_s=30,
#     batch_size=16,
#     return_timestamps=True,
#     generate_kwargs=dict(
#         temperature=0.0,
#         num_beams=3,
#         repetition_penalty=1.0,
#         do_sample=False,
#     ),
# )

## Faster-Whisper

In [17]:
from faster_whisper import WhisperModel

# faster-whisper model: large-v2 weights on CUDA device 0, float16 compute.
model = WhisperModel(
    "large-v2",
    device="cuda",
    device_index=0,
    compute_type="float16",
)

time: 13.4 s (started: 2023-10-13 13:26:48 +00:00)


In [19]:
# Transcribe the same audio file with faster-whisper.
# `transcribe` returns a (segments, info) pair; the info part is not used here.
segments, _ = model.transcribe(
    "4469669.mp3",
    language="en",
    # decoding
    beam_size=1,
    repetition_penalty=1.0,
    condition_on_previous_text=True,
    suppress_blank=False,
    # quality / silence thresholds
    compression_ratio_threshold=2.4,
    log_prob_threshold=-1.0,
    no_speech_threshold=0.6,
    # output granularity
    word_timestamps=False,
)
# `segments` is consumed into a list here so the later cells can iterate the
# results as often as needed (faster-whisper yields segments lazily, so this is
# where the decoding time is spent).
faster_whisper_out = list(segments)

Estimating duration from bitrate, this may be inaccurate


time: 5min 5s (started: 2023-10-13 13:28:01 +00:00)


In [23]:
# Dump the faster-whisper segments to disk, one "[start, end] text" line per
# segment with the times rounded to two decimals.
with open("faster_whisper_output.txt", "w") as transcript_file:
    transcript_file.writelines(
        f"[{segment.start:.2f}, {segment.end:.2f}] {segment.text}\n"
        for segment in faster_whisper_out
    )
    

time: 5.15 ms (started: 2023-10-13 13:39:53 +00:00)


## Comparison using NLTK

In [10]:
import re
import string

import nltk
nltk.download("punkt")

from nltk.tokenize import word_tokenize


def remove_brackets(text):
    """Drop one leading ``[...]`` group, and the whitespace after it, from ``text``.

    Only a group that opens at the very first character is removed; brackets
    further into the string, or an opening bracket that is never closed, are
    left alone. Text without such a prefix is returned unchanged.
    """
    leading_group = re.match(r'^\[[^\]]*\]\s*', text)
    if leading_group is None:
        return text

    # Keep everything that follows the matched prefix.
    return text[leading_group.end():]


def preprocess_and_tokenize(text):
    """Normalise ``text`` and return its distinct alphanumeric tokens as a set.

    Steps: lowercase and trim the text, delete every character listed in
    ``string.punctuation``, split it with ``word_tokenize``, then keep only the
    tokens for which ``str.isalnum()`` is true. A set is returned because the
    caller compares vocabularies (Jaccard similarity), so duplicates and word
    order are irrelevant.
    """
    normalized = text.lower().strip()

    # Mapping that deletes all ASCII punctuation characters.
    punctuation_free = normalized.translate(
        str.maketrans('', '', string.punctuation)
    )

    # The isalnum() filter discards any token that still contains a
    # non-alphanumeric character after the punctuation pass.
    return {
        token
        for token in word_tokenize(punctuation_free)
        if token.isalnum()
    }

# Transcriptions
def _load_transcript(path):
    """Read a transcript file and return its lines joined by spaces, with the
    leading ``[...]`` timestamp group stripped from each line."""
    with open(path, "r") as transcript_file:
        return " ".join(remove_brackets(line) for line in transcript_file)


# Transcriptions, read back from the files written by the export cells.
# They get their own names on purpose: reusing `torch_out` /
# `faster_whisper_out` would replace the model results with plain strings and
# break the export cells if they are re-run after this one.
torch_text = _load_transcript("torch_output.txt")
faster_whisper_text = _load_transcript("faster_whisper_output.txt")

# Preprocess and tokenize
tokens1 = preprocess_and_tokenize(torch_text)
tokens2 = preprocess_and_tokenize(faster_whisper_text)

# Calculate Jaccard similarity: |A ∩ B| / |A ∪ B| over the two vocabularies.
# NOTE: raises ZeroDivisionError if both transcripts yield no tokens at all.
intersection = len(tokens1 & tokens2)
union = len(tokens1 | tokens2)
jaccard_similarity = intersection / union

print(f"Jaccard Similarity: {jaccard_similarity}")


Jaccard Similarity: 0.9084995663486557


[nltk_data] Downloading package punkt to /home/chainyo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
