In [None]:
!pip install deepspeech

!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/audio-0.9.3.tar.gz
!tar xvf audio-0.9.3.tar.gz
!ls -l ./audio/

In [None]:
! pip install youtube_dl
! pip install pydub
! pip install transformers
! pip install sentencepiece
! pip install punctuator
! pip install SpeechRecognition

! apt-get update && apt-get install -y libsndfile1 ffmpeg
! pip install Cython
! pip install bert-extractive-summarizer
! pip install nemo_toolkit[all]
! pip install pynini
! pip install rake_nltk
! pip install nltk

In [None]:
import librosa
import soundfile
from __future__ import unicode_literals
import youtube_dl
from pydub import AudioSegment
from deepspeech import Model
from scipy.io.wavfile import read as wav_read
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
import torch
import speech_recognition as sr
import os
import contextlib
import wave
from nemo.collections.nlp.models import PunctuationCapitalizationModel
import torchvision
from summarizer import Summarizer
import nltk
from rake_nltk import Rake

In [None]:
# Fetch the NLTK resources used later in the notebook.
# NOTE(review): presumably these back rake_nltk's Rake() in keywords()
# (stop-word list and sentence tokenizer) — confirm against rake_nltk's docs.
nltk.download('stopwords')
nltk.download('punkt')

In [3]:
def make_transcript(
    audio_file,
    model_file_path="/content/deepspeech-0.9.3-models.pbmm",
    lm_file_path="/content/deepspeech-0.9.3-models.scorer",
    beam_width=100,
    lm_alpha=0.93,
    lm_beta=1.18,
):
    """Transcribe a WAV file with DeepSpeech and return the recognised text.

    Args:
        audio_file: Path to (or file object of) a WAV file readable by
            ``scipy.io.wavfile.read``.
        model_file_path: Path to the DeepSpeech acoustic model (``.pbmm``).
        lm_file_path: Path to the external scorer (``.scorer``).
        beam_width: Beam width passed to ``Model.setBeamWidth``.
        lm_alpha: Scorer alpha passed to ``Model.setScorerAlphaBeta``.
        lm_beta: Scorer beta passed to ``Model.setScorerAlphaBeta``.

    Returns:
        The transcript string produced by ``Model.stt``.

    The defaults reproduce the values that used to be hard-coded, so existing
    ``make_transcript(path)`` calls behave as before.
    """
    import warnings

    model = Model(model_file_path)
    model.enableExternalScorer(lm_file_path)
    model.setScorerAlphaBeta(lm_alpha, lm_beta)
    model.setBeamWidth(beam_width)

    rate, buffer = wav_read(audio_file)

    # A sample-rate mismatch does not raise inside stt(); it just yields a poor
    # transcript, so surface it explicitly instead of ignoring `rate`.
    expected_rate = model.sampleRate()
    if rate != expected_rate:
        warnings.warn(
            f"{audio_file} is sampled at {rate} Hz but the model expects "
            f"{expected_rate} Hz; the transcript is likely to be inaccurate."
        )

    # NOTE(review): stt() is assumed to need 16-bit mono samples — confirm callers
    # always provide that (get_transcript rewrites the WAV before calling this).
    return model.stt(buffer)

In [4]:
def capital_letters(text):
    """Capitalize the first character of ``text`` and every character that
    directly follows a ``". "`` (period + space) sequence.

    Args:
        text: The string to process. Empty or ``None`` input yields ``""``.

    Returns:
        A new string with sentence-initial characters capitalized; all other
        characters are copied unchanged.

    Only ``". "`` is treated as a sentence boundary; ``"? "`` and ``"! "`` are
    deliberately left alone to keep the original contract.
    """
    if not text:
        return ""

    pieces = []
    for index, char in enumerate(text):
        # Look back only at real preceding characters: requiring index >= 2
        # prevents negative indices from wrapping round to the end of the string.
        starts_sentence = index == 0 or (
            index >= 2 and text[index - 2:index] == ". "
        )
        pieces.append(char.capitalize() if starts_sentence else char)

    # Join once instead of growing a string character by character.
    return "".join(pieces)

In [5]:
def set_punctuation(transcript):
    """Restore punctuation and capitalization in a raw transcript.

    Args:
        transcript: Unpunctuated text, e.g. speech-to-text output.

    Returns:
        The transcript as rewritten by NeMo's pretrained ``punctuation_en_bert``
        model. Empty or whitespace-only input is returned unchanged without
        loading the model.
    """
    # Nothing to punctuate: skip the model load entirely.
    if not transcript or not transcript.strip():
        return transcript

    # The former list_available_models() call was dropped: its return value was
    # discarded, so it contributed nothing to the result.
    model = PunctuationCapitalizationModel.from_pretrained("punctuation_en_bert")

    # The model takes a list of queries; this function passes one and unwraps
    # the single result.
    return model.add_punctuation_capitalization([transcript])[0]

In [6]:
def get_tube(url):
    """Download the audio track of a video as MP3 and return basic metadata.

    Args:
        url: URL of the video, handed to youtube_dl.

    Returns:
        dict with keys:
            "path": file name the audio is written to (``<video id>.mp3``,
                relative to the current working directory),
            "title": video title as reported by youtube_dl (may be ``None``),
            "duration": length formatted as ``"M:SS"``, or ``None`` when
                youtube_dl reports no duration.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }

    # First pass: metadata only, so the output file name can be derived from
    # the video id before anything is downloaded.
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)

    video_id = info_dict.get('id', None)
    video_title = info_dict.get('title', None)
    video_duration = info_dict.get('duration', None)

    if video_duration is None:
        # No duration in the metadata: report "unknown" instead of failing on
        # arithmetic with None.
        duration = None
    else:
        # divmod plus :02d replaces the manual "<10 -> add a leading zero" branch
        # and avoids shadowing the builtin min().
        minutes, seconds = divmod(int(video_duration), 60)
        duration = f"{minutes}:{seconds:02d}"

    video_info = {
        "path": f'{video_id}.mp3',
        "title": video_title,
        "duration": duration,
    }

    # Second pass: the actual download, written to the derived file name.
    ydl_opts.update({'outtmpl': video_info["path"]})
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    return video_info

In [7]:
def get_transcript(url):
    """Download a video's audio, transcribe it and attach the transcript.

    Args:
        url: URL of the video to process.

    Returns:
        The metadata dict from ``get_tube`` with an added ``"transcript"`` key
        holding the punctuated transcript.

    Side effects:
        Leaves ``<video id>.mp3`` and ``<video id>.wav`` in the current working
        directory; they are not removed afterwards.
    """
    video_info = get_tube(url)

    # get_tube() writes to a path relative to the working directory, so resolve
    # it from there rather than assuming the notebook runs in /content.
    path_audio = os.path.abspath(video_info["path"])
    path_wav = os.path.splitext(path_audio)[0] + ".wav"

    sound = AudioSegment.from_file(path_audio)
    # export() hands back the open output file; close it so the handle is not
    # leaked and the data is flushed before the file is read again below.
    sound.export(path_wav, format="wav").close()

    # Reload at 16 kHz and overwrite the WAV in place, so make_transcript()
    # reads the resampled audio.
    audio, sample_rate = librosa.load(path_wav, sr=16000)
    soundfile.write(path_wav, data=audio, samplerate=sample_rate)

    transcript = make_transcript(path_wav)
    transcript = set_punctuation(transcript)
    video_info["transcript"] = transcript

    return video_info

In [9]:
def keywords(transcript, top_n=5):
    """Extract the highest-ranked key phrases from a transcript with RAKE.

    Args:
        transcript: Text to analyse.
        top_n: Maximum number of phrases to return. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list of at most ``top_n`` phrases, in the order returned by
        ``Rake.get_ranked_phrases()``.
    """
    rake = Rake()
    rake.extract_keywords_from_text(transcript)
    return rake.get_ranked_phrases()[:top_n]

In [10]:
def abstract_summary(transcript):
    """Generate an abstractive summary of a transcript with ``t5-base``.

    Args:
        transcript: Text to summarise.

    Returns:
        The generated summary with sentence-initial letters capitalized via
        ``capital_letters``; an empty string if the model decodes to nothing.
    """
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    device = torch.device('cpu')
    # Keep the model and the inputs on the same device.
    model = model.to(device)

    # T5 is a text-to-text model that selects its task from a textual prefix;
    # without "summarize: " it is not actually asked to summarise.
    # Inputs are truncated to 512 tokens, the length commonly used with T5
    # checkpoints, which also bounds memory use on long transcripts.
    tokenized_text = tokenizer.encode(
        "summarize: " + transcript,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)

    summary_ids = model.generate(
        tokenized_text,
        num_beams=4,
        no_repeat_ngram_size=2,
        min_length=50,
        max_length=125,
        early_stopping=True,
    )

    abs_output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Skip post-processing for an empty result so an empty string never reaches
    # capital_letters().
    if not abs_output:
        return abs_output
    return capital_letters(abs_output)

In [14]:
def extractive_summary(transcript):
    """Build an extractive summary of a transcript with ``Summarizer``.

    Args:
        transcript: Text to summarise.

    Returns:
        The selected text with sentence-initial letters capitalized via
        ``capital_letters``; an empty string when nothing is selected.
    """
    model = Summarizer()
    # NOTE(review): min_length=50 is presumably the minimum sentence length (in
    # characters) a sentence needs to be considered — confirm in the
    # bert-extractive-summarizer docs.
    result = model(transcript, min_length=50)
    ext_output = "".join(result)

    # The summariser can select nothing (e.g. very short input); return the
    # empty string as-is, since capital_letters() indexes text[0].
    if not ext_output:
        return ext_output
    return capital_letters(ext_output)

In [15]:
def assembling_url(url):
    """Run the full pipeline for one video: download, transcribe, summarise.

    Args:
        url: URL of the video to process.

    Returns:
        A tuple ``(video_info, abs_output, ext_output)`` where
            video_info: dict from ``get_transcript`` ("path", "title",
                "duration", "transcript") plus a "keywords" entry,
            abs_output: abstractive summary from ``abstract_summary``,
            ext_output: extractive summary from ``extractive_summary``.
    """
    video_info = get_transcript(url)
    transcript = video_info["transcript"]

    # The keywords used to be computed and then dropped; store them on the
    # metadata dict so callers can read them without changing the return shape.
    video_info["keywords"] = keywords(transcript)

    abs_output = abstract_summary(transcript)
    ext_output = extractive_summary(transcript)

    return video_info, abs_output, ext_output

In [None]:
# NOTE(review): this cell only prints a fixed marker string; it looks like a
# leftover debug/scratch cell and is a candidate for deletion.
print("HAHAHA")

In [16]:
import time

# Time one end-to-end run of the pipeline on a sample video.
# perf_counter() is monotonic, so the measurement cannot be skewed by
# system clock adjustments during the run.
start = time.perf_counter()
# Keep the results so they can be inspected in later cells instead of being
# thrown away after a multi-minute run.
video_info, abs_output, ext_output = assembling_url("https://www.youtube.com/watch?v=FhwNbG9HLhU")
end = time.perf_counter()

length = int(end - start)
# Named minutes/seconds rather than min/sec: assigning to `min` at notebook
# level would shadow the builtin min() for every later cell in this kernel.
minutes, seconds = divmod(length, 60)
duration = f"{minutes}:{seconds:02d}"

print(duration)

[youtube] FhwNbG9HLhU: Downloading webpage
[youtube] FhwNbG9HLhU: Downloading webpage
[download] FhwNbG9HLhU.mp3 has already been downloaded
[download] 100% of 2.09MiB
[ffmpeg] Correcting container in "FhwNbG9HLhU.mp3"
[ffmpeg] Post-process file FhwNbG9HLhU.mp3 exists, skipping
[NeMo I 2021-11-30 12:12:29 cloud:56] Found existing object /root/.cache/torch/NeMo/NeMo_1.5.0/punctuation_en_bert/93b0369b5e0d147f61895feffcbcfb88/punctuation_en_bert.nemo.
[NeMo I 2021-11-30 12:12:29 cloud:62] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.5.0/punctuation_en_bert/93b0369b5e0d147f61895feffcbcfb88/punctuation_en_bert.nemo
[NeMo I 2021-11-30 12:12:29 common:728] Instantiating model from pre-trained checkpoint
[NeMo I 2021-11-30 12:12:34 tokenizer_utils:123] Getting HuggingFace AutoTokenizer with pretrained_model_name: bert-base-uncased, vocab_file: /tmp/tmp02m7xbl5/tokenizer.vocab_file, special_tokens_dict: {}, and use_fast: False


Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
[NeMo W 2021-11-30 12:12:35 modelPT:131] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    text_file: text_train.txt
    labels_file: labels_train.txt
    shuffle: true
    num_samples: -1
    batch_size: 64
    
[NeMo W 2021-11-30 12:12:35 modelPT:138] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    ds_item: null
    text_file: text_dev.txt
    labels_file: labels_dev.txt
    shuffle: false
    num_samples: -1
    batch_size: 64
    
[NeMo W 2021-11-30 12:12:35 modelPT:144] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a vali

[NeMo I 2021-11-30 12:12:40 save_restore_connector:149] Model PunctuationCapitalizationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.5.0/punctuation_en_bert/93b0369b5e0d147f61895feffcbcfb88/punctuation_en_bert.nemo.
[NeMo I 2021-11-30 12:12:40 punctuation_capitalization_model:577] Using batch size 1 for inference
[NeMo I 2021-11-30 12:12:40 punctuation_capitalization_dataset:543] Max length: 64
[NeMo I 2021-11-30 12:12:40 data_preprocessing:358] Some stats of the lengths of the sequences:
[NeMo I 2021-11-30 12:12:40 data_preprocessing:364] Min: 282 |                  Max: 282 |                  Mean: 282.0 |                  Median: 282.0
[NeMo I 2021-11-30 12:12:40 data_preprocessing:366] 75 percentile: 282.00
[NeMo I 2021-11-30 12:12:40 data_preprocessing:367] 99 percentile: 282.00


100%|██████████| 29/29 [00:09<00:00,  3.20batch/s]
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Title: Two men die as Storm Arwen brings 98mph gusts to UK - BBC News
Length: 2:16

Keywords: ['sureties filmed treacherous conditions near edinbro', 'one hundred thousand homes without power', 'harbourage seventy four thousand customers', 'twenty louis got stackin snow', 'storm cost damage across scotland']


2:56
