In [62]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [63]:
from pytube import YouTube
import os,shutil
import re

import audioread
from IPython.display import Audio
import librosa
from pydub import AudioSegment, silence

import torch
from torchmetrics import WordErrorRate
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, Wav2Vec2Processor, HubertForCTC


In [64]:
## Load pre-trained Hubert model (used Connectionist Temporal Classification/CTC loss ) from HuggingFace
## This particular model works for English, however threre are models that support other languages too

model_name = "facebook/hubert-large-ls960-ft"
stt_tokenizer = Wav2Vec2Processor.from_pretrained(model_name)
stt_model = HubertForCTC.from_pretrained(model_name)

In [65]:
def download_audio(url=None):
    if url!=None:
      yt=YouTube(url)
      print(yt.title)
      stream=list(yt.streams.filter(only_audio=True, file_extension='mp4'))
      stream[0].download() # stream has all .mp4 audios
    else:
      print("Invalid url,can't download")

In [66]:
#url="https://www.youtube.com/watch?v=MihlCysVWNs"
url="https://www.youtube.com/watch?v=YVQzFCPkgt4&list=PLreVlKwe2Z0QIdDwvVoa_3QSMifIF1w1A&index=7"
download_audio(url)

Auto_TS : Automatically build multiple Time Series models using a Single Line of Code


In [67]:
def convert_to_wav(input_filename):
  ext=input_filename[-3:]
  output_filename=filename[:-3]+"wav"
  if ext=="mp3":
    sound = AudioSegment.from_mp3(input_filename)
  else:
    sound = AudioSegment.from_file(input_filename,format=ext)
  sound = sound.set_frame_rate(16000)
  sound.export(output_filename,format="wav")
  os.remove(input_filename)

In [68]:
## we need audio in wav format + sample rate 16K Hz
#filename="/content/Wake up to Reality - Madara Uchihas words.mp4"
filename="/content/Auto_TS  Automatically build multiple Time Series models using a Single Line of Code.mp4"
convert_to_wav(filename)

In [69]:
## create temporary directory to store 
tmp_dir="/content/audio_chunks"

shutil.rmtree(f"{tmp_dir}/",ignore_errors=True)
os.makedirs(tmp_dir)


In [70]:
def silence_detection(audio):
  dBs=audio.dBFS # get decibels 
  silence_list=silence.detect_silence(audio,min_silence_len=750,silence_thresh=dBs-14)
  return silence_list

In [71]:
filename=filename[:-3]+"wav"
audio = AudioSegment.from_file(filename)
silence_list = silence_detection(audio)
silence_list

[[4477, 6429],
 [8832, 10119],
 [15280, 16134],
 [22575, 23726],
 [29065, 30415],
 [37676, 39420],
 [48085, 49405],
 [54522, 56033],
 [70572, 71397],
 [73301, 76878],
 [83846, 86474],
 [94592, 98209],
 [101881, 106344],
 [110106, 111386],
 [114649, 117478],
 [122364, 128205],
 [132320, 135841],
 [139834, 140687],
 [145659, 149611],
 [154680, 158090],
 [162004, 166624],
 [174826, 177092],
 [182097, 185632],
 [188601, 189433],
 [190714, 191983],
 [197525, 200601],
 [210515, 214901],
 [220083, 222466],
 [224895, 225751],
 [228410, 229470],
 [231457, 236276],
 [238352, 240368],
 [245487, 246586],
 [251237, 253501],
 [257823, 259899],
 [264357, 272989],
 [275352, 280145],
 [284344, 285130],
 [290440, 294604],
 [305327, 311288],
 [317976, 319614],
 [326010, 327290],
 [328655, 330109],
 [342293, 344706],
 [346967, 347764],
 [351473, 354214],
 [359179, 364497],
 [377940, 379242],
 [381036, 384828],
 [390119, 391770],
 [399648, 402143],
 [404214, 405536],
 [409482, 412105],
 [414041, 417882],
 

In [72]:
# test = audio[54775:55690]
# path = "/content/test_3.wav"
# test.export(path) #Exports to a mp3 file in the current path.
    
# Audio(path, autoplay=False)

In [73]:
## while breaking into chunks we need to take care of following points



def create_chunk(audio,silence_list,threshold=14,max_interval=20000):

    audio_length = int(audio.duration_seconds)*1000 ## we need value in (ms) not (s)
    non_silent_chunk=[]
    if len(silence_list)>0:
        ## for 1st chunk 
        if silence_list[0][0]!=0:
          nss=0 # non-silence chunk start
          nse=silence_list[0][0] # non-silence chunk end
          non_silent_chunk.append([nss,nse])
        for idx in range(1,len(silence_list)):
          nss=silence_list[idx-1][1]  # end of previous silence-chunk
          nse=silence_list[idx][0]  # start of current silence-chunk
          non_silent_chunk.append([nss,nse])

        # after last silence chunk 
        if silence_list[-1][1]!=audio_length:
          nss=silence_list[-1][1]
          nse=audio_length
          non_silent_chunk.append([nss,nse])
    else:
      non_silent_chunk.append([0,audio_length])

    print("non_silent_chunk : ",non_silent_chunk)
    new_non_silent_chunk = [] # we break larger non-silence chunk to smaller sub-chunks

    for idx in range(len(non_silent_chunk)):
      start=non_silent_chunk[idx][0]
      end=non_silent_chunk[idx][1]
      interval = end-start
      if interval>max_interval:
        s=start
        while interval>max_interval:
          e=s+max_interval+threshold
          interval=interval-max_interval
          new_non_silent_chunk.append([s,e])
          s=e-threshold
          start=s 
        if interval<=max_interval:
          end=start+interval
          new_non_silent_chunk.append([start,end])
      else:
        new_non_silent_chunk.append([start,end])

    return new_non_silent_chunk




In [74]:
# test = audio[75690:95704]
# path = "/content/test_4.wav"
# test.export(path) #Exports to a mp3 file in the current path.
# Audio(path, autoplay=False)

In [75]:
import torch

In [76]:
def transcribe_audio(path,stt_model,stt_tokenizer,audio,start,end,overlap=15):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        with torch.no_grad():
            new_audio = audio[start:end] 
            new_audio.export(path)  
            print(path)
            input_audio,sr=librosa.load(path,sr=16000)
            input_values = stt_tokenizer(input_audio,return_tensors="pt").to(device).input_values
            logits = stt_model.to(device)(input_values).logits
            prediction = torch.argmax(logits, dim=-1)
            transcription = stt_tokenizer.batch_decode(prediction)[0].lower()
            transcription_start=transcription[:overlap]
            transcription_end=transcription[-overlap:]
            return transcription,transcription_start,transcription_end
    except audioread.NoBackendError:
        print("start value of chunk > end value of chunk")
        exit()


In [77]:

new_non_silent_chunk = create_chunk(audio,silence_list,threshold=14,max_interval=20000)
print("new_non_silent_chunk",new_non_silent_chunk)

non_silent_chunk :  [[0, 4477], [6429, 8832], [10119, 15280], [16134, 22575], [23726, 29065], [30415, 37676], [39420, 48085], [49405, 54522], [56033, 70572], [71397, 73301], [76878, 83846], [86474, 94592], [98209, 101881], [106344, 110106], [111386, 114649], [117478, 122364], [128205, 132320], [135841, 139834], [140687, 145659], [149611, 154680], [158090, 162004], [166624, 174826], [177092, 182097], [185632, 188601], [189433, 190714], [191983, 197525], [200601, 210515], [214901, 220083], [222466, 224895], [225751, 228410], [229470, 231457], [236276, 238352], [240368, 245487], [246586, 251237], [253501, 257823], [259899, 264357], [272989, 275352], [280145, 284344], [285130, 290440], [294604, 305327], [311288, 317976], [319614, 326010], [327290, 328655], [330109, 342293], [344706, 346967], [347764, 351473], [354214, 359179], [364497, 377940], [379242, 381036], [384828, 390119], [391770, 399648], [402143, 404214], [405536, 409482], [412105, 414041], [417882, 419399], [423122, 427960], [42

In [78]:
overlap=20
overlapping_transcription=[]
transcription= ""
root_path="/content/audio_chunks"
for idx in range(len(new_non_silent_chunk)):
    start=new_non_silent_chunk[idx][0]
    end=new_non_silent_chunk[idx][1]
    path=f"{root_path}/chunk_{idx}.wav"
    orginal_trans,trans_start,trans_end=transcribe_audio(path,stt_model,stt_tokenizer,audio,start,end,overlap)
    transcription+=orginal_trans+" "
    overlapping_transcription.append([trans_start,trans_end])

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_0.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_1.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_2.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_3.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_4.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_5.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_6.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_7.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_8.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_9.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_10.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_11.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_12.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_13.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_14.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_15.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_16.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_17.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_18.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_19.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_20.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_21.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_22.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_23.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_24.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_25.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_26.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_27.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_28.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_29.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_30.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_31.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_32.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_33.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_34.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_35.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_36.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_37.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_38.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_39.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_40.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_41.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_42.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_43.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_44.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_45.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_46.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_47.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_48.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_49.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_50.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_51.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_52.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_53.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_54.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_55.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_56.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_57.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_58.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_59.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_60.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_61.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_62.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_63.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_64.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_65.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_66.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_67.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_68.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_69.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_70.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_71.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_72.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_73.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_74.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_75.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_76.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_77.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_78.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_79.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_80.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_81.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_82.wav


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


/content/audio_chunks/chunk_83.wav


In [79]:
transcription



In [80]:
overlapping_transcription

[['ete e e   e  e  an a', ' e   e  e  an an nod'],
 ['every one i hope all', 'pe all of your doing'],
 ['ve you ever felt the', 'ies forecasting mode'],
 ['using arima esarima ', ' a single lane of co'],
 ['el guess what i disc', 'as otto under scor p'],
 ['otto underscore t s ', 'lp of single line of'],
 ['fore i show you the ', ' this amazing libray'],
 ['aladd the link to th', 'eel free to check it'],
 ['without wasting any ', 'se and machine learn'],
 ["that let's git start", ' git start the reedi'],
 ['let us kixstart the ', 'ai pandas and matlot'],
 ['ihave two files cran', 'so let me run the se'],
 ['eme now go forward a', 'f the training ratas'],
 ['as you can clearly s', 'o columns dik and se'],
 ['me now go forward an', 'imensions of my data'],
 ['as yiu can clearly s', 'ree rows and two col'],
 ['me also show you the', 'frinlet me run the s'],
 ['as heu can clearly s', ' have any null value'],
 ['and my first column ', ' float sixty four co'],
 ['me now go forward an', 't so

In [81]:
model_name="flexudy/t5-small-wav2vec2-grammar-fixer"
t5_tokenizer=T5Tokenizer.from_pretrained(model_name)
t5_model=T5ForConditionalGeneration.from_pretrained(model_name)


In [82]:
def add_punctuation(t5_model,t5_tokenizer,transcription):
    input_text="fix:{"+transcription+"}</s>"
    input_ids=t5_tokenizer.encode(input_text,return_tensors="pt",max_length=10000,truncation=True,add_special_tokens=True)
    outputs=t5_model.generate(input_ids=input_ids,max_length=256,num_beams=4,repetition_penalty=1.0,
                              length_penalty=1.0,early_stopping=True)
    transcription=t5_tokenizer.decode(outputs[0],skip_special_tokens=True,clean_up_tokenization_spaces=True)
    return transcription

In [83]:
def split_text(transcription,max_size):
    cut2=max_size
    split_text_list=[]
    nearest_idx=0
    length=len(transcription)
    
    if cut2==length:   #  add complete text
        split_text_list.append(transcription)
    else:
        while cut2<=length:
            cut1=nearest_idx
            cut2=nearest_idx+max_size
            # split by period(.)
            dots_idxs=[idx for idx,char in enumerate(transcription[cut1:cut2]) if char == "."]
            if len(dots_idxs):
                nearest_idx=max(dots_idxs)+1+cut1
            else:     # split by space('\b') , same as above
                spaces_idxs=[idx for idx,char in enumerate(transcription[cut1:cut2]) if char == " "]
                if len(spaces_idxs):
                    nearest_idx=max(spaces_idxs)+1+cut1
                else:
                    nearest_idx=cut2+cut1
            split_text_list.append(transcription[cut1:nearest_idx])

    return split_text_list


In [84]:
# !pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git

In [85]:

# from gramformer import Gramformer
# !python3 -m spacy download en
# import spacy.cli 

# spacy.cli.download("en_core_web_md")
# def set_seed(seed):
#   torch.manual_seed(seed)
#   if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(seed)

# set_seed(1212)

In [86]:
tmp_transcription=transcription
split_text_list=split_text(tmp_transcription+" ",512)
punctuated_text=""
#gf = Gramformer(models = 1, use_gpu=True) # 1=corrector, 2=detector
for split_text in split_text_list:
    tmp_text=add_punctuation(t5_model,t5_tokenizer,split_text)
    #corrected_sentence = gf.correct(tmp_text,max_candidates=1)
    #punctuated_text+=str(corrected_sentence)
    punctuated_text+=tmp_text



In [87]:
punctuated_text



In [89]:
# !pip install micropython-pkg_resources

In [90]:
# import pkg_resources
# from symspellpy.symspellpy import SymSpell, Verbosity

# # build symspell tree 
# sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# #loading dictionary/lexicon
# dictionary_path = pkg_resources.resource_filename(
#     "symspellpy", "frequency_dictionary_en_82_765.txt")
# # term_index is the column of the term and count_index is the
# # column of the term frequency
# sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# # lookup suggestions for single-word input strings
# input_term = "memebers"  # misspelling of "members"
# # max edit distance per lookup
# print("Misspelled Word - ",input_term)
# # (max_edit_distance_lookup <= max_dictionary_edit_distance)
# suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST,
#                                max_edit_distance=2)
# # display suggestion term, term frequency, and edit distance
# print("Best Candidate - ")
# for suggestion in suggestions:
#     print(suggestion)