In [None]:
!pip install deepspeech

!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/audio-0.9.3.tar.gz
!tar xvf audio-0.9.3.tar.gz
!ls -l ./audio/

In [None]:
%%writefile setup.sh

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
!sh setup.sh

In [None]:
! pip install youtube_dl
! pip install pydub
! pip install transformers
! pip install sentencepiece
! pip install punctuator
! pip install SpeechRecognition

! apt-get update && apt-get install -y libsndfile1 ffmpeg
! pip install Cython
! pip install nemo_toolkit[all]
! pip install pynini

In [None]:
import librosa
import soundfile
from __future__ import unicode_literals
import youtube_dl
from pydub import AudioSegment
from deepspeech import Model
from scipy.io.wavfile import read as wav_read
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
import torch
import speech_recognition as sr
import os
import contextlib
import wave
from nemo.collections.nlp.models import PunctuationCapitalizationModel

In [9]:
def make_transcript(audio_file):
    model_file_path = "/content/deepspeech-0.9.3-models.pbmm"
    lm_file_path = "/content/deepspeech-0.9.3-models.scorer"
    beam_width = 100
    lm_alpha = 0.93
    lm_beta = 1.18

    model = Model(model_file_path)
    model.enableExternalScorer(lm_file_path)

    model.setScorerAlphaBeta(lm_alpha, lm_beta)
    model.setBeamWidth(beam_width)

    rate, buffer= wav_read(audio_file)
    return model.stt(buffer)

In [12]:
def get_tube(url):

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        video_title = info_dict.get('id', None)

    path = f'{video_title}.mp3'

    ydl_opts.update({'outtmpl':path})

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
        
    return path

In [13]:
def get_transcript(url):
    path =  f"/content/{get_tube(url)}"
    shortcut = path[:-4]
    path_wav = f"{shortcut}.wav"

    sound = AudioSegment.from_file(path)
    sound.export(path_wav, format="wav")

    os.remove(path) 

    audio, sr = librosa.load(path_wav, sr=16000)
    soundfile.write(path_wav, data = audio, samplerate = sr)
    
    transcript = make_transcript(path_wav)

    os.remove(path_wav)

    return transcript

In [20]:
def transcript_summarizer(url):
    transcript = get_transcript(url)

    PunctuationCapitalizationModel.list_available_models()
    model = PunctuationCapitalizationModel.from_pretrained("punctuation_en_bert")
    transcript = model.add_punctuation_capitalization([transcript])

    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    device = torch.device('cpu')


    t5_prepared_Text = "summarize: "+transcript[0]
    tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)
    summary_ids = model.generate(tokenized_text,
                                      num_beams=4,
                                      no_repeat_ngram_size=2,
                                      min_length=30,
                                      max_length=100,
                                      early_stopping=True)

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return print("\n\nSummarized text: \n",output)

In [21]:
import time

start = time.time()
transcript_summarizer("https://www.youtube.com/watch?v=A8SAZ8lI4m4")
end = time.time()
print(end - start)

[youtube] A8SAZ8lI4m4: Downloading webpage
[youtube] A8SAZ8lI4m4: Downloading webpage
[download] Destination: A8SAZ8lI4m4.mp3
[download] 100% of 4.00MiB in 01:14
[ffmpeg] Post-process file A8SAZ8lI4m4.mp3 exists, skipping
[NeMo I 2021-11-21 23:23:36 cloud:56] Found existing object /root/.cache/torch/NeMo/NeMo_1.5.0/punctuation_en_bert/93b0369b5e0d147f61895feffcbcfb88/punctuation_en_bert.nemo.
[NeMo I 2021-11-21 23:23:36 cloud:62] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.5.0/punctuation_en_bert/93b0369b5e0d147f61895feffcbcfb88/punctuation_en_bert.nemo
[NeMo I 2021-11-21 23:23:36 common:728] Instantiating model from pre-trained checkpoint
[NeMo I 2021-11-21 23:23:41 tokenizer_utils:123] Getting HuggingFace AutoTokenizer with pretrained_model_name: bert-base-uncased, vocab_file: /tmp/tmpoyrzocxq/tokenizer.vocab_file, special_tokens_dict: {}, and use_fast: False


Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
[NeMo W 2021-11-21 23:23:45 modelPT:131] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    text_file: text_train.txt
    labels_file: labels_train.txt
    shuffle: true
    num_samples: -1
    batch_size: 64
    
[NeMo W 2021-11-21 23:23:45 modelPT:138] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    ds_item: null
    text_file: text_dev.txt
    labels_file: labels_dev.txt
    shuffle: false
    num_samples: -1
    batch_size: 64
    
[NeMo W 2021-11-21 23:23:45 modelPT:144] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a vali

[NeMo I 2021-11-21 23:23:52 save_restore_connector:149] Model PunctuationCapitalizationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.5.0/punctuation_en_bert/93b0369b5e0d147f61895feffcbcfb88/punctuation_en_bert.nemo.
[NeMo I 2021-11-21 23:23:52 punctuation_capitalization_model:577] Using batch size 1 for inference
[NeMo I 2021-11-21 23:23:52 punctuation_capitalization_dataset:543] Max length: 64
[NeMo I 2021-11-21 23:23:52 data_preprocessing:358] Some stats of the lengths of the sequences:
[NeMo I 2021-11-21 23:23:52 data_preprocessing:364] Min: 901 |                  Max: 901 |                  Mean: 901.0 |                  Median: 901.0
[NeMo I 2021-11-21 23:23:52 data_preprocessing:366] 75 percentile: 901.00
[NeMo I 2021-11-21 23:23:52 data_preprocessing:367] 99 percentile: 901.00


100%|██████████| 106/106 [00:02<00:00, 39.15batch/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (1150 > 512). Running this sequence through the model will result in indexing errors




Summarized text: 
 aaron ramsey: Until the splitting of five, the war caused this dissolution. he says as of closing years of the Napoleonic wars, Norway was in the union, with no way Formosanta, but it was disbanded by the swedish king in 1814.
353.48488545417786
