In [12]:
!apt-get install -y ffmpeg
!pip install git+https://github.com/openai/whisper.git
!pip install transformers
!pip install rouge_score
!pip install scipy

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-mg45ypdq
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-mg45ypdq
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [13]:
import os
import re
import zipfile
import nltk
import torch
import pandas as pd
import whisper
import matplotlib.pyplot as plt
import scipy.io
from sklearn.model_selection import KFold
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from rouge_score import rouge_scorer

In [14]:
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
dataset_path = '/content/ydata-tvsum50-video.zip'
extract_dir = '/content/dataset/video'


In [16]:
if not os.path.exists(extract_dir):
    print("Extracting dataset")
    with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

Extracting dataset


In [17]:
!pip install bert-extractive-summarizer

Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl.metadata (15 kB)
Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Installing collected packages: bert-extractive-summarizer
Successfully installed bert-extractive-summarizer-0.10.1


In [18]:
# Import the bert summarizer
from summarizer import Summarizer
asr_model = whisper.load_model("base") # load whisper model that helps us in converting audio to transcript
bert_summarizer = Summarizer()
# we use rogue scorer to see the quality that we were able to generate in the summary
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 218MiB/s]


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
video_files = []
# to see if we got all the 50 video files
for root, dirs, files in os.walk(extract_dir):
    for fname in files:
        if fname.lower().endswith(('.mp4')):
            video_files.append(os.path.join(root, fname))
video_files.sort()
number_of_files = len(video_files)
print(f"We found {number_of_files} videos.")

We found 50 videos.


In [20]:
# Load the Whisper ASR model
model = whisper.load_model("base")
print("Whisper loaded")

# Loop through all the videos
transcripts = []
for idx, video_path in enumerate(video_files):
    print(f"Processing video {idx + 1})")
    # We transcribe all videos using whisper model that we loaded
    result = model.transcribe(video_path)
    text = result["text"]

    def clean_text(text):
        text = text.lower() #convert text to lower
        text = re.sub(r'\d', '#', text) #in place of digits have #
        text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text) # removing new lines
        text = re.sub(r'\r?\n', ' ', text) #remove all spaces
        text = re.sub(r' +', ' ', text)
        text = text.strip() # gets rid of spaces
        return text
    # We clean the text and save it
    cleaned_text = clean_text(text)
    transcripts.append(cleaned_text)


Whisper loaded
Processing video 1)
Processing video 2)
Processing video 3)
Processing video 4)
Processing video 5)
Processing video 6)
Processing video 7)
Processing video 8)
Processing video 9)
Processing video 10)
Processing video 11)
Processing video 12)
Processing video 13)
Processing video 14)
Processing video 15)
Processing video 16)
Processing video 17)
Processing video 18)
Processing video 19)
Processing video 20)
Processing video 21)
Processing video 22)
Processing video 23)
Processing video 24)
Processing video 25)
Processing video 26)
Processing video 27)
Processing video 28)
Processing video 29)
Processing video 30)
Processing video 31)
Processing video 32)
Processing video 33)
Processing video 34)
Processing video 35)
Processing video 36)
Processing video 37)
Processing video 38)
Processing video 39)
Processing video 40)
Processing video 41)
Processing video 42)
Processing video 43)
Processing video 44)
Processing video 45)
Processing video 46)
Processing video 47)
Process

In [21]:
# initlize the model to summarize
bert_model = Summarizer()
bert_summaries = [] # Store all the bert summaries
# We then generate a summary with 0.1 ratio and max length of 100
for text in transcripts:
    summary = bert_model(text, ratio = 0.1, min_length=10, max_length=100)
    bert_summaries.append(summary)

In [22]:
print("The first BERT summary")
print(bert_summaries[0])

The first BERT summary
hey mephy, do you know what this is? what's that good to you? i don't believe it. hey, you eat that dog food. just to let you guys know that they're watching this, our cat is really picky. it was fun for us, please.


In [23]:
from transformers import pipeline
# Initilize the BART summarizer
bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
# To store all summaries
bart_summaries = []
index = 0
# loop through to generate a summary
for text in transcripts:
    print(f"Summary generating for video {index + 1}")
    index += 1
    try:
        if len(text.split()) < 30:
            summary = text.strip()
        else:
            summary = bart_summarizer(text, min_length=10, max_length=100, do_sample=False)[0]['summary_text']
    except:
        print("Error generating summary")

    bart_summaries.append(summary)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


Summary generating for video 1
Summary generating for video 2
Summary generating for video 3
Summary generating for video 4
Summary generating for video 5
Summary generating for video 6
Summary generating for video 7
Summary generating for video 8


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error generating summary
Summary generating for video 9
Error generating summary
Summary generating for video 10
Error generating summary
Summary generating for video 11
Error generating summary
Summary generating for video 12
Error generating summary
Summary generating for video 13
Error generating summary
Summary generating for video 14
Error generating summary
Summary generating for video 15
Error generating summary
Summary generating for video 16
Error generating summary
Summary generating for video 17
Error generating summary
Summary generating for video 18
Error generating summary
Summary generating for video 19
Error generating summary
Summary generating for video 20
Error generating summary
Summary generating for video 21
Error generating summary
Summary generating for video 22
Error generating summary
Summary generating for video 23
Error generating summary
Summary generating for video 24
Error generating summary
Summary generating for video 25
Error generating summary
Summary

In [29]:
import numpy as np
from sklearn.model_selection import KFold
from rouge_score import rouge_scorer

In [25]:
# We initialize the rogue scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
bert_scores = []
bart_scores = []
transcript_size = len(transcripts)
for i in range(transcript_size):
  text = transcripts[i]
  bert_summary = bert_summaries[i]
  bart_summary = bart_summaries[i]
  bert_score = scorer.score(text, bert_summary)
  bart_score = scorer.score(text, bart_summary)

  bert_scores.append([bert_score['rouge1'].fmeasure, bert_score['rouge2'].fmeasure, bert_score['rougeL'].fmeasure])
  bart_scores.append([bart_score['rouge1'].fmeasure, bart_score['rouge2'].fmeasure, bart_score['rougeL'].fmeasure])

bert_scores = np.array(bert_scores)
bart_scores = np.array(bart_scores)

bert_averagescore = np.mean(bert_scores, axis=0)
bart_averagescore = np.mean(bart_scores, axis=0)

print("BERT score on average for 50 videos:")
print("ROUGE-1:", bert_averagescore[0])
print("ROUGE-2:", bert_averagescore[1])
print("ROUGE-L:", bert_averagescore[2])

print("\nBART score on average for 50 videos:")
print("ROUGE-1:", bart_averagescore[0])
print("ROUGE-2:", bart_averagescore[1])
print("ROUGE-L:", bart_averagescore[2])



BERT score on average for 50 videos:
ROUGE-1: 0.14296266763410828
ROUGE-2: 0.13100714254792048
ROUGE-L: 0.14296266763410828

BART score on average for 50 videos:
ROUGE-1: 0.12677738744363448
ROUGE-2: 0.05461116831687006
ROUGE-L: 0.11347578141387346


In [30]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# we split the scores we got into 10 different 10 splits to see in different parts of our dataset how well the models did
fold_results = []
for fold, (_, test_index) in enumerate(kf.split(video_files), 1):

    fold_bert_avg = bert_scores[test_index].mean(axis=0)
    fold_bart_avg = bart_scores[test_index].mean(axis=0)
    fold_results.append((fold, fold_bert_avg, fold_bart_avg))
    print(f"Fold {fold}:")
    print("BERT Score:")
    print("ROUGE-1:", fold_bert_avg[0])
    print("ROUGE-2:", fold_bert_avg[1])
    print("ROUGE-L:", fold_bert_avg[2])
    print("BART Score:")
    print("ROUGE-1:", fold_bart_avg[0])
    print("ROUGE-2:", fold_bart_avg[1])
    print("ROUGE-L:", fold_bart_avg[2])


Fold 1:
BERT Score:
ROUGE-1: 0.1584089830674459
ROUGE-2: 0.14525539122071615
ROUGE-L: 0.1584089830674459
BART Score:
ROUGE-1: 0.03842921928700709
ROUGE-2: 0.007244754441861133
ROUGE-L: 0.025978484127468327
Fold 2:
BERT Score:
ROUGE-1: 0.22128985221606867
ROUGE-2: 0.2076619806375946
ROUGE-L: 0.22128985221606867
BART Score:
ROUGE-1: 0.2404788645685144
ROUGE-2: 0.005698177791201047
ROUGE-L: 0.22539912313541483
Fold 3:
BERT Score:
ROUGE-1: 0.1399557399899888
ROUGE-2: 0.126094051524974
ROUGE-L: 0.1399557399899888
BART Score:
ROUGE-1: 0.11823130000761355
ROUGE-2: 0.08322716390512516
ROUGE-L: 0.10526228851679313
Fold 4:
BERT Score:
ROUGE-1: 0.13751385443051362
ROUGE-2: 0.12882248373669605
ROUGE-L: 0.13751385443051362
BART Score:
ROUGE-1: 0.24757368045627234
ROUGE-2: 0.038143961780929866
ROUGE-L: 0.24568540350898677
Fold 5:
BERT Score:
ROUGE-1: 0.12180328275176114
ROUGE-2: 0.10881742466255322
ROUGE-L: 0.12180328275176114
BART Score:
ROUGE-1: 0.05468630786939889
ROUGE-2: 0.005861578422429974
RO