# Install the required dependencies for the model

In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install jiwer
!pip install datasets==1.18.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-_8dwxmm0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-_8dwxmm0
  Resolved https://github.com/openai/whisper.git to commit 5c1a8c10e762bf9c29fcf6b3e40f17bc8ab09864
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.19.0
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install pytube

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytube
  Downloading pytube-12.1.2-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-12.1.2


### Import the dependencies

In [None]:
import whisper
from pytube import YouTube
from glob import glob
import os
import pandas as pd
from tqdm.notebook import tqdm

### Load the timit dataset

In [None]:
from datasets import load_dataset, load_metric

# Fetch the TIMIT ASR dataset; later cells index the result by split name
# ('train' / 'test') and read its 'file' and 'text' columns.
timit = load_dataset("timit_asr")

Downloading:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading and preparing dataset timit_asr/clean (download: 828.75 MiB, generated: 7.90 MiB, post-processed: Unknown size, total: 836.65 MiB) to /root/.cache/huggingface/datasets/timit_asr/clean/2.0.1/b11b576ddcccbcefa7c9f0c4e6c2a43756f3033adffe0fb686aa61043d0450ad...


Downloading:   0%|          | 0.00/869M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset timit_asr downloaded and prepared to /root/.cache/huggingface/datasets/timit_asr/clean/2.0.1/b11b576ddcccbcefa7c9f0c4e6c2a43756f3033adffe0fb686aa61043d0450ad. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

### Word error rate on timit dataset using different sizes of Whisper Model

* Note: because the requirement is to separate English audio from non-English audio, we use the multilingual Whisper models rather than the English-only ones. TIMIT is a purely English dataset, but keeping this
requirement in mind we still follow the same procedure, i.e. language detection first and recognition afterwards. (In the cells below, the detection step is only enabled for the large model's train-split run; it is commented out elsewhere.)

* Also, I have considered the train split in timit dataset to check word error rate on a larger collection of audio signals.

### Tiny model (~1 GB VRAM, 39 million parameters, ~32x relative speed)

```
# This is formatted as code
```



In [None]:
# Load the 'tiny' checkpoint into the global `model`; lan_detector and
# speech2text both read this global, so it decides which model they run.
model = whisper.load_model('tiny')

100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 120MiB/s]


In [None]:
timit['train']['text'][0]

'Would such an act of refusal be useful?'

In [None]:
def lan_detector(audio_file):
  """Return True when Whisper's most probable language for the clip is English.

  Uses the notebook-level ``model``. The audio is padded or trimmed to
  Whisper's fixed input length before detection, so for long recordings only
  the beginning is inspected.

  Args:
    audio_file: path to an audio file readable by ``whisper.load_audio``.

  Returns:
    bool: True if 'en' has the highest detected-language probability.
  """
  print('reading the audio file')
  audio = whisper.load_audio(audio_file)
  audio = whisper.pad_or_trim(audio)
  # Take the mel-bin count from the loaded checkpoint instead of the library
  # default: not every checkpoint uses the default, and a mismatch makes
  # detect_language fail on the input shape.
  mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
  _, probs = model.detect_language(mel)
  return max(probs, key=probs.get) == 'en'

In [None]:
def speech2text(audio_file):
  """Transcribe ``audio_file`` with the notebook-level ``model``.

  Returns only the 'text' entry of the transcription result.
  """
  return model.transcribe(audio_file)["text"]

In [None]:
speech2text(timit['train']['file'][0])

' with such an active refusal be useful.'

In [None]:
# Transcribe every clip of the train split with the current model.
# The language-detection gate is not applied here.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['train']['file'])]

  0%|          | 0/4620 [00:00<?, ?it/s]

In [None]:
len(extracted_text)

4620

In [None]:
len(timit['train']['text'])

4620

In [None]:
# Pair each Whisper transcript with its reference sentence (train split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['train']['text'],
})

In [None]:
data

Unnamed: 0,hypothesis,reference
0,with such an active refusal be useful.,Would such an act of refusal be useful?
1,Don't ask me to carry an early rag like that.,Don't ask me to carry an oily rag like that.
2,Better Scotch Fudge goes well with vanilla ic...,Butterscotch fudge goes well with vanilla ice ...
3,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...
4,I honor my mom.,I honor my mom.
...,...,...
4615,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...
4616,The water contained too much chlorine and stu...,The water contained too much chlorine and stun...
4617,Movies never have enough villains.,Movies never have enough villains.
4618,Does Hindu ideology honor cows?,Does Hindu ideology honor cows?


In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,with such an active refusal be useful.,Would such an act of refusal be useful?,with such an active refusal be useful,would such an act of refusal be useful
1,Don't ask me to carry an early rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an early rag like that,do not ask me to carry an oily rag like that
2,Better Scotch Fudge goes well with vanilla ic...,Butterscotch fudge goes well with vanilla ice ...,better scotch fudge goes well with vanilla ice...,butterscotch fudge goes well with vanilla ice ...
3,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy washwater al...,she had your dark suit in greasy wash water al...
4,I honor my mom.,I honor my mom.,i honor my mom,i honor my mom
...,...,...,...,...
4615,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy washwater al...,she had your dark suit in greasy wash water al...
4616,The water contained too much chlorine and stu...,The water contained too much chlorine and stun...,the water contained too much chlorine and stun...,the water contained too much chlorine and stun...
4617,Movies never have enough villains.,Movies never have enough villains.,movies never have enough villains,movies never have enough villains
4618,Does Hindu ideology honor cows?,Does Hindu ideology honor cows?,does hindu ideology honor cows,does hindu ideology honor cows


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 12.42 %


In [None]:
# Transcribe every clip of the test split with the current model.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['test']['file'])]

  0%|          | 0/1680 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (test split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['test']['text'],
})

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,The bungalow was pleasantly situated near the...,The bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...
1,Don't ask me to carry in a really rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry in a really rag like that,do not ask me to carry an oily rag like that
2,Are you looking for employment?,Are you looking for employment?,are you looking for employment,are you looking for employment
3,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy washwater al...,she had your dark suit in greasy wash water al...
4,At Twilight on the 12th day we'll have Shibley.,At twilight on the twelfth day we'll have Chab...,at twilight on the 12th day we will have shibley,at twilight on the 12th day we will have chablis
...,...,...,...,...
1675,Pam gives driving lessons on Thursdays.,Pam gives driving lessons on Thursdays.,pam gives driving lessons on thursdays,pam gives driving lessons on thursdays
1676,He rubbed his eyes sleepily with one huge paw.,He rubbed his eyes sleepily with one huge paw.,he rubbed his eyes sleepily with one huge paw,he rubbed his eyes sleepily with one huge paw
1677,Eight field guns were captured in position.,Eight field guns were captured in position.,8 field guns were captured in position,8 field guns were captured in position
1678,"A low in them, silverware can often be flimsy.",Aluminum silverware can often be flimsy.,a low in them silverware can often be flimsy,aluminum silverware can often be flimsy


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 10.85 %


### Base model

Load the model

In [None]:
# Rebind the global `model` to the 'base' checkpoint; lan_detector and
# speech2text read this global, so the cells below now evaluate 'base'.
model = whisper.load_model('base')

100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 51.9MiB/s]


In [None]:
# Transcribe every clip of the train split with the current model.
# The language-detection gate is not applied here.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['train']['file'])]

  0%|          | 0/4620 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (train split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['train']['text'],
})

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,with such an active refusal be useful.,Would such an act of refusal be useful?,with such an active refusal be useful,would such an act of refusal be useful
1,Don't ask me to carry an Oli rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an oli rag like that,do not ask me to carry an oily rag like that
2,better scotch fudge goes well with vanilla ic...,Butterscotch fudge goes well with vanilla ice ...,better scotch fudge goes well with vanilla ice...,butterscotch fudge goes well with vanilla ice ...
3,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy washwater al...,she had your dark suit in greasy wash water al...
4,I honor my mom.,I honor my mom.,i honor my mom,i honor my mom
...,...,...,...,...
4615,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy washwater al...,she had your dark suit in greasy wash water al...
4616,The water contained too much chlorine and stu...,The water contained too much chlorine and stun...,the water contained too much chlorine and stun...,the water contained too much chlorine and stun...
4617,movies never have enough villains.,Movies never have enough villains.,movies never have enough villains,movies never have enough villains
4618,Does Hindu ideology honor cows?,Does Hindu ideology honor cows?,does hindu ideology honor cows,does hindu ideology honor cows


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 8.67 %


In [None]:
# Transcribe every clip of the test split with the current model.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['test']['file'])]

  0%|          | 0/1680 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (test split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['test']['text'],
})

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,The bungalow was pleasantly situated near the...,The bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...
1,Don't ask me to carry an oily rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an oily rag like that,do not ask me to carry an oily rag like that
2,Are you looking for employment?,Are you looking for employment?,are you looking for employment,are you looking for employment
3,She had your dark suit and greasy washwater a...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy washwater al...,she had your dark suit in greasy wash water al...
4,At twilight on the twelfth day we'll have Shi...,At twilight on the twelfth day we'll have Chab...,at twilight on the 12th day we will have shibli,at twilight on the 12th day we will have chablis
...,...,...,...,...
1675,Pam gives driving lessons on Thursdays.,Pam gives driving lessons on Thursdays.,pam gives driving lessons on thursdays,pam gives driving lessons on thursdays
1676,He rubbed his eye sleepily with one huge paw.,He rubbed his eyes sleepily with one huge paw.,he rubbed his eye sleepily with one huge paw,he rubbed his eyes sleepily with one huge paw
1677,Eight fuel guns were captured in position.,Eight field guns were captured in position.,8 fuel guns were captured in position,8 field guns were captured in position
1678,"Alone in him, Silverware can often be flimsy.",Aluminum silverware can often be flimsy.,alone in him silverware can often be flimsy,aluminum silverware can often be flimsy


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 7.59 %


### Small model

In [None]:
# Rebind the global `model` to the 'small' checkpoint; lan_detector and
# speech2text read this global, so the cells below now evaluate 'small'.
model = whisper.load_model('small')

100%|███████████████████████████████████████| 461M/461M [00:26<00:00, 18.6MiB/s]


In [None]:
# Transcribe every clip of the train split with the current model.
# The language-detection gate is not applied here.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['train']['file'])]

  0%|          | 0/4620 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (train split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['train']['text'],
})

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,Would such an act of refusal be useful?,Would such an act of refusal be useful?,would such an act of refusal be useful,would such an act of refusal be useful
1,Don't ask me to carry an Olly rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an olly rag like that,do not ask me to carry an oily rag like that
2,Butterscotch fudge goes well with vanilla ice...,Butterscotch fudge goes well with vanilla ice ...,butterscotch fudge goes well with vanilla ice ...,butterscotch fudge goes well with vanilla ice ...
3,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4,I honor my mom.,I honor my mom.,i honor my mom,i honor my mom
...,...,...,...,...
4615,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4616,The water contained too much chlorine and stu...,The water contained too much chlorine and stun...,the water contained too much chlorine and stun...,the water contained too much chlorine and stun...
4617,Movies never have enough villains.,Movies never have enough villains.,movies never have enough villains,movies never have enough villains
4618,Does Hindu ideology honor cows?,Does Hindu ideology honor cows?,does hindu ideology honor cows,does hindu ideology honor cows


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 4.67 %


In [None]:
# Transcribe every clip of the test split with the current model.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['test']['file'])]

  0%|          | 0/1680 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (test split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['test']['text'],
})

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,The bungalow was pleasantly situated near the...,The bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...
1,Don't ask me to carry an oily rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an oily rag like that,do not ask me to carry an oily rag like that
2,Are you looking for employment?,Are you looking for employment?,are you looking for employment,are you looking for employment
3,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4,"At twilight on the twelfth day, we'll have Sh...",At twilight on the twelfth day we'll have Chab...,at twilight on the 12th day we will have shiblee,at twilight on the 12th day we will have chablis
...,...,...,...,...
1675,Pam gives driving lessons on Thursdays.,Pam gives driving lessons on Thursdays.,pam gives driving lessons on thursdays,pam gives driving lessons on thursdays
1676,He rubbed his eyes sleepily with one huge paw.,He rubbed his eyes sleepily with one huge paw.,he rubbed his eyes sleepily with one huge paw,he rubbed his eyes sleepily with one huge paw
1677,Eight fuel guns were captured in position.,Eight field guns were captured in position.,8 fuel guns were captured in position,8 field guns were captured in position
1678,Aluminum silverware can often be flimsy.,Aluminum silverware can often be flimsy.,aluminum silverware can often be flimsy,aluminum silverware can often be flimsy


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 3.74 %


### Medium model

In [None]:
# Rebind the global `model` to the 'medium' checkpoint; lan_detector and
# speech2text read this global, so the cells below now evaluate 'medium'.
model = whisper.load_model('medium')

100%|██████████████████████████████████████| 1.42G/1.42G [00:05<00:00, 263MiB/s]


In [None]:
# Transcribe every clip of the train split with the current model.
# The language-detection gate is not applied here.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['train']['file'])]

  0%|          | 0/4620 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (train split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['train']['text'],
})

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,Would such an act of refusal be useful?,Would such an act of refusal be useful?,would such an act of refusal be useful,would such an act of refusal be useful
1,Don't ask me to carry an Olli rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an olli rag like that,do not ask me to carry an oily rag like that
2,Butterscotch fudge goes well with vanilla ice...,Butterscotch fudge goes well with vanilla ice ...,butterscotch fudge goes well with vanilla ice ...,butterscotch fudge goes well with vanilla ice ...
3,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4,I honor my mom.,I honor my mom.,i honor my mom,i honor my mom
...,...,...,...,...
4615,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4616,The water contained too much chlorine and stu...,The water contained too much chlorine and stun...,the water contained too much chlorine and stun...,the water contained too much chlorine and stun...
4617,Movies never have enough villains.,Movies never have enough villains.,movies never have enough villains,movies never have enough villains
4618,Does Hindu ideology honor cows?,Does Hindu ideology honor cows?,does hindu ideology honor cows,does hindu ideology honor cows


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 3.74 %


In [None]:
# Transcribe every clip of the test split with the current model.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['test']['file'])]

  0%|          | 0/1680 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (test split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['test']['text'],
})

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,the bungalow was pleasantly situated near the...,The bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...
1,Don't ask me to carry an oily rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an oily rag like that,do not ask me to carry an oily rag like that
2,Are you looking for employment?,Are you looking for employment?,are you looking for employment,are you looking for employment
3,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4,At twilight on the twelfth day we'll have Shi...,At twilight on the twelfth day we'll have Chab...,at twilight on the 12th day we will have shibli,at twilight on the 12th day we will have chablis
...,...,...,...,...
1675,Pam gives driving lessons on Thursdays.,Pam gives driving lessons on Thursdays.,pam gives driving lessons on thursdays,pam gives driving lessons on thursdays
1676,He rubbed his eyes sleepily with one huge paw.,He rubbed his eyes sleepily with one huge paw.,he rubbed his eyes sleepily with one huge paw,he rubbed his eyes sleepily with one huge paw
1677,Eight fuel guns were captured in position.,Eight field guns were captured in position.,8 fuel guns were captured in position,8 field guns were captured in position
1678,Aluminum silverware can often be flimsy.,Aluminum silverware can often be flimsy.,aluminum silverware can often be flimsy,aluminum silverware can often be flimsy


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 3.06 %


### Large model

In [None]:
# Rebind the global `model` to the 'large' checkpoint; lan_detector and
# speech2text read this global, so the cells below now evaluate 'large'.
model = whisper.load_model('large')

100%|██████████████████████████████████████| 2.87G/2.87G [00:29<00:00, 106MiB/s]


In [None]:
# Transcribe the train split behind the language-detection gate.
# Exactly one entry is produced per clip so the list stays aligned with
# timit['train']['text']: a clip rejected by the gate gets an empty transcript
# (scored as all-deletions) instead of being dropped. Dropping it would leave
# the hypothesis and reference columns with different lengths.
extracted_text = [
  speech2text(audio_path) if lan_detector(audio_path) else ""
  for audio_path in tqdm(timit['train']['file'])
]

  0%|          | 0/4620 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (train split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['train']['text'],
})

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,Would such an act of refusal be useful?,Would such an act of refusal be useful?,would such an act of refusal be useful,would such an act of refusal be useful
1,Don't ask me to carry an Oli rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an oli rag like that,do not ask me to carry an oily rag like that
2,Butterscotch fudge goes well with vanilla ice...,Butterscotch fudge goes well with vanilla ice ...,butterscotch fudge goes well with vanilla ice ...,butterscotch fudge goes well with vanilla ice ...
3,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4,I honor my mom.,I honor my mom.,i honor my mom,i honor my mom
...,...,...,...,...
4615,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4616,The water contained too much chlorine and stu...,The water contained too much chlorine and stun...,the water contained too much chlorine and stun...,the water contained too much chlorine and stun...
4617,Movies never have enough villains.,Movies never have enough villains.,movies never have enough villains,movies never have enough villains
4618,Does Hindu ideology honor cows?,Does Hindu ideology honor cows?,does hindu ideology honor cows,does hindu ideology honor cows


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 3.06 %


In [None]:
# Transcribe every clip of the test split with the current model.
extracted_text = [speech2text(audio_path) for audio_path in tqdm(timit['test']['file'])]

  0%|          | 0/1680 [00:00<?, ?it/s]

In [None]:
# Pair each Whisper transcript with its reference sentence (test split).
data = pd.DataFrame({
  "hypothesis": extracted_text,
  "reference": timit['test']['text'],
})

In [None]:
# Normalise both columns the same way so that casing, punctuation and number
# spelling are not counted as word errors.
for column in ("hypothesis", "reference"):
  data[f"{column}_clean"] = [normalizer(sentence) for sentence in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,The bungalow was pleasantly situated near the...,The bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...,the bungalow was pleasantly situated near the ...
1,Don't ask me to carry an oily rag like that.,Don't ask me to carry an oily rag like that.,do not ask me to carry an oily rag like that,do not ask me to carry an oily rag like that
2,Are you looking for employment?,Are you looking for employment?,are you looking for employment,are you looking for employment
3,She had your dark suit and greasy wash water ...,She had your dark suit in greasy wash water al...,she had your dark suit and greasy wash water a...,she had your dark suit in greasy wash water al...
4,"At twilight on the 12th day, we'll have shibley.",At twilight on the twelfth day we'll have Chab...,at twilight on the 12th day we will have shibley,at twilight on the 12th day we will have chablis
...,...,...,...,...
1675,Pam gives driving lessons on Thursdays.,Pam gives driving lessons on Thursdays.,pam gives driving lessons on thursdays,pam gives driving lessons on thursdays
1676,He rubbed his eyes sleepily with one huge paw.,He rubbed his eyes sleepily with one huge paw.,he rubbed his eyes sleepily with one huge paw,he rubbed his eyes sleepily with one huge paw
1677,Eight field guns were captured in position.,Eight field guns were captured in position.,8 field guns were captured in position,8 field guns were captured in position
1678,Aluminum silverware can often be flimsy.,Aluminum silverware can often be flimsy.,aluminum silverware can often be flimsy,aluminum silverware can often be flimsy


In [None]:
# Word error rate over the whole frame: references first, hypotheses second.
references = data["reference_clean"].tolist()
hypotheses = data["hypothesis_clean"].tolist()
wer = jiwer.wer(references, hypotheses)

print(f"WER: {wer * 100:.2f} %")

WER: 2.50 %


### Transcribing youtube link using Whisper Model

### Step 1: Extract audio from the YouTube link

In [None]:
def youtube_audio(link):
    """Download the first audio-only stream of a YouTube video.

    The file is saved in the current working directory as ``<video_id>.wav``.
    The stream is not transcoded: it keeps whatever container YouTube serves,
    only the file name ends in ``.wav``.

    Args:
        link: URL of the YouTube video.

    Returns:
        tuple: ``(new_filename, video_id)``; ``new_filename`` is relative to
        the current working directory.

    Raises:
        IndexError: if the video exposes no audio-only stream (the same
            exception type that indexing the empty stream list would raise).
    """
    youtube_1 = YouTube(link)
    # Let pytube extract the id. Splitting the URL on '=' returns the wrong
    # token for links such as '...watch?v=<id>&t=10s' or 'https://youtu.be/<id>'.
    video_id = youtube_1.video_id

    audio_stream = youtube_1.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise IndexError(f"no audio-only stream available for {link}")

    new_filename = video_id + ".wav"
    print(new_filename)
    # Save directly under the target name. Passing the id positionally would
    # be taken as an output directory and leave an empty folder behind after
    # the rename.
    audio_stream.download(filename=new_filename)
    print(video_id)
    return new_filename, video_id

In [None]:
youtube_audio('https://www.youtube.com/watch?v=PZ7lDrwYdZc')

PZ7lDrwYdZc.wav
PZ7lDrwYdZc


('PZ7lDrwYdZc.wav', 'PZ7lDrwYdZc')

In [None]:
# youtube_audio saves into the current working directory, so resolve the file
# against it instead of hard-coding Colab's /content folder.
path = os.path.abspath("PZ7lDrwYdZc.wav")

In [None]:
# Transcribe the downloaded clip only if Whisper detects it as English.
# The same `path` is used for detection and transcription, so the two steps
# cannot drift onto different files.
extracted_text = []
if lan_detector(path):
    extracted_text.append(speech2text(path))

reading the audio file


In [None]:
extracted_text

[" Do you ever feel like you're just floating through life, but not actually getting closer to the person that you want to be? It usually happens around New Years. You imagine all the bad habits you're going to break free from, and all the good habits you will begin. This time it will be different you say to yourself. This time I am going to do the things that I say I will, only to end up back where you began shortly after, and no closer to what you had envisaged. So the question is, how do you break free from bad habits and make the habits you desire easier and automatic? Atomic Habits by James Clear answers all these questions. We're going to be doing a fast paced, detailed summary of the book, and dive deep into topics like habit loops, dopamine spikes, priming your environment, plus heaps more. And make sure you stick around until the end of the video where I go through step by step how I'm personally using this book to improve my own habits. I hope this summary inspires you to go 