In [2]:
from pytube import YouTube
import moviepy.editor as mp
# import whisper
from youtube_transcript_api import YouTubeTranscriptApi
from difflib import SequenceMatcher
import re
import jiwer
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [3]:
# URL of the YouTube video to run the transcript comparison on

video_url = 'https://www.youtube.com/watch?v=NiKtZgImdlY'

In [4]:
def download_youtube_video(url, output_path='video.mp4'):
    """Download a YouTube video as an MP4 file.

    Parameters:
    url (str): URL of the YouTube video.
    output_path (str): File name handed to pytube's `download(filename=...)`.

    Returns:
    str: The `output_path` argument, unchanged.

    Raises:
    ValueError: If the video offers no progressive MP4 stream.
    """
    yt = YouTube(url)
    # Progressive streams carry audio and video in one file. Without this filter
    # the first MP4 stream can be a video-only adaptive stream, which leaves the
    # audio-extraction step with nothing to read.
    stream = yt.streams.filter(progressive=True, file_extension='mp4').first()
    if stream is None:
        raise ValueError(f"No progressive MP4 stream available for {url}")
    stream.download(filename=output_path)
    return output_path
def extract_audio_from_video(video_path, audio_path='audio.mp3'):
    """Write the audio track of a video file to a separate audio file.

    Parameters:
    video_path (str): Path of the video file to read.
    audio_path (str): Path the audio is written to.

    Returns:
    str: The `audio_path` argument, unchanged.

    Raises:
    ValueError: If the video has no audio track.
    """
    # The context manager closes the clip (and its underlying reader) even when
    # writing the audio fails.
    with mp.VideoFileClip(video_path) as video:
        if video.audio is None:
            raise ValueError(f"{video_path} has no audio track to extract")
        video.audio.write_audiofile(audio_path)
    return audio_path
# Download the video, then extract its audio track
video_path = download_youtube_video(video_url)
audio_path = extract_audio_from_video(video_path)


MoviePy - Writing audio in audio.mp3


                                                                      

MoviePy - Done.


In [5]:
def clean_transcript(transcript):
    """Normalize a transcript to plain, space-separated ASCII words.

    Steps, in order:
    1. Drop annotations enclosed in [], (), {} or <> (for example "[Music]").
    2. Delete every character that is not an ASCII letter, digit or whitespace.
    3. Collapse whitespace runs to a single space and trim both ends.

    Parameters:
    transcript (str): Raw transcript text.

    Returns:
    str: The cleaned transcript.
    """
    cleaned = transcript
    # Each annotation is replaced by a space rather than an empty string so the
    # words on either side are not fused ("hello[Music]world" -> "hello world").
    # DOTALL lets an annotation that wraps across a line break be removed too.
    for pattern in (r'\[.*?\]', r'\(.*?\)', r'\{.*?\}', r'<.*?>'):
        cleaned = re.sub(pattern, ' ', cleaned, flags=re.DOTALL)
    # Punctuation is deleted outright, so contractions lose their apostrophe
    # ("I've" -> "Ive") and non-ASCII letters are dropped as well.
    cleaned = re.sub(r'[^A-Za-z0-9\s]', '', cleaned)
    # Collapse whitespace runs, then trim leading/trailing spaces.
    return re.sub(r'\s+', ' ', cleaned).strip()


In [6]:
import os
import time
import warnings
import json
# from faster_whisper import WhisperModel
# import whisper
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [7]:
def transcription(file, cache_dir=None):
    """Transcribe an audio file with the Hugging Face Whisper large-v3 pipeline.

    The model is loaded on every call, on the first CUDA device in float16 when
    CUDA is available and on the CPU in float32 otherwise.

    Parameters:
    file: Audio input handed straight to the ASR pipeline (the notebook passes
        a file path).
    cache_dir (str, optional): Directory for downloaded model files. None
        keeps the library's default cache location.

    Returns:
    The pipeline output; later cells read its 'text' entry.
    """
    import importlib.util

    # NOTE: both settings below affect the whole kernel session, not just this call.
    warnings.filterwarnings("ignore")
    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32

    model_id = "openai/whisper-large-v3"

    load_kwargs = {
        "torch_dtype": torch_dtype,
        "low_cpu_mem_usage": True,
        "use_safetensors": True,
        "cache_dir": cache_dir,
    }
    # FlashAttention-2 is only requested when it can actually run: it needs a
    # CUDA device and the `flash_attn` package. Otherwise the library's default
    # attention implementation is used, which keeps the CPU fallback working.
    if use_cuda and importlib.util.find_spec("flash_attn") is not None:
        load_kwargs["attn_implementation"] = "flash_attention_2"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **load_kwargs)
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)

    # The model is passed as an already-built object, so attention settings are
    # configured at load time above rather than through pipeline `model_kwargs`.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        generate_kwargs={"language": "english"},
        device=device,
    )

    return pipe(file)

In [23]:
# Run the Hugging Face Whisper pipeline on the extracted audio; the value is
# whatever `transcription` returns from the pipeline call.
whisper_transcript = transcription(audio_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Keep only the transcript text from the pipeline result. The guard makes this
# cell safe to re-run: once `whisper_transcript` is already a plain string,
# indexing it with 'text' would raise TypeError.
if not isinstance(whisper_transcript, str):
    whisper_transcript = whisper_transcript['text']

In [15]:
# Normalize the Whisper text with the same helper that is applied to the YouTube captions.
whisper_transcript = clean_transcript(whisper_transcript)

In [17]:
# `whisper_transcription` is the name the metric cells below read.
whisper_transcription = whisper_transcript

In [6]:
def transcribe_audio(audio_path, model_name="large-v3"):
    """Transcribe an audio file with the `whisper` package and return the text.

    Parameters:
    audio_path (str): Path of the audio file passed to `model.transcribe`.
    model_name (str): Name handed to `whisper.load_model`; defaults to "large-v3".

    Returns:
    The value stored under the 'text' key of the transcription result.
    """
    # Imported here because the notebook's top-level `import whisper` lines are
    # commented out; without this import the call below fails with NameError.
    import whisper

    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path)
    return result['text']
# Transcribe the audio with `transcribe_audio` and normalize the result.
# NOTE(review): this reassigns `whisper_transcription`, replacing the value an
# earlier cell set from the Hugging Face pipeline transcript — confirm which
# backend is meant to feed the metrics.
whisper_transcription = clean_transcript(transcribe_audio(audio_path))
print(whisper_transcription)


100%|█████████████████████████████████████| 2.88G/2.88G [02:18<00:00, 22.3MiB/s]


KeyboardInterrupt: 

In [9]:
def get_youtube_captions(id):
    """Fetch a video's captions and return them as one string with a word count.

    Parameters:
    id: Video id passed to `YouTubeTranscriptApi.get_transcript`.

    Returns:
    tuple: (script, word_count). `script` is the text of every caption segment
    except those equal to '[Music]', each followed by a single space;
    `word_count` is the number of whitespace-separated tokens in `script`.
    """
    segments = YouTubeTranscriptApi.get_transcript(id)
    # Pure '[Music]' segments are skipped; every kept segment gets a trailing space.
    script = "".join(
        segment["text"] + " "
        for segment in segments
        if segment["text"] != '[Music]'
    )
    return script, len(script.split())
def extract_video_id(url):
    """Extract the video id from a YouTube URL.

    Recognized forms:
    - youtube.com/watch?...v=<id>  (`v` may be anywhere in the query string)
    - youtu.be/<id>
    - youtube.com/embed/<id>, /shorts/<id>, /live/<id> (also youtube-nocookie.com)

    Query parameters after the id and any `#fragment` are not part of the result.

    Parameters:
    url (str): The URL to inspect.

    Returns:
    str or None: The video id, or None when no known URL form matches.
    """
    patterns = (
        # Lazy optional prefix: try `v` as the first parameter, and only then
        # skip over earlier `key=value&` pairs to find it.
        r'youtube\.com/watch\?(?:[^#\s]*?&)??v=([^&#\s]+)',
        r'youtu\.be/([^?&#/\s]+)',
        r'youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/([^?&#/\s]+)',
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
# Get the YouTube captions for the same video
id = extract_video_id(video_url)
if id is None:
    # Fail here with a clear message instead of passing None to the captions API.
    raise ValueError(f"Could not extract a video id from {video_url!r}")
transcript, no_of_words = get_youtube_captions(id)
youtube_captions = clean_transcript(transcript)
print(youtube_captions)

Dr Martin Luther King Jr in a 1968 speech where he reflects upon the Civil Rights Movement states In the end we will remember not the words of our enemies but the silence of our friends As a teacher Ive internalized this message Every day all around us we see the consequences of silence manifest themselves in the form of discrimination violence genocide and war In the classroom I challenge my students to explore the silences in their own lives through poetry We work together to fill those spaces to recognize them to name them to understand that they dont have to be sources of shame In an effort to create a culture within my classroom where students feel safe sharing the intimacies of their own silences I have four core principles posted on the board that sits in the front of my class which every student signs at the beginning of the year read critically write consciously speak clearly tell your truth And I find myself thinking a lot about that last point tell your truth And I realized 

In [18]:
# Score the Whisper output (hypothesis) against the YouTube captions (reference).
hypothesis = whisper_transcription
reference = youtube_captions

# One normalization chain, applied to both texts before they are compared.
transforms = jiwer.Compose([
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemoveEmptyStrings(),
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.RemovePunctuation(),
    jiwer.ReduceToListOfListOfWords(),
])

wer = jiwer.wer(
    reference,
    hypothesis,
    truth_transform=transforms,
    hypothesis_transform=transforms,
)
print("Word Error Rate (WER) :", wer)

Word Error Rate (WER) : 0.06666666666666667


In [19]:
def print_accuracy_from_wer(wer):
    """Print the accuracy implied by a word error rate, as a percentage.

    Accuracy is taken as 1 - WER and printed with two decimal places.

    Parameters:
    wer (float): Word error rate.
    """
    percent_correct = (1 - wer) * 100
    print(f"Accuracy: {percent_correct:.2f}%")
print_accuracy_from_wer(wer)
# Accuracy here is derived from the word error rate (1 - WER)

Accuracy: 93.33%


In [20]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    The TF-IDF vectorizer is fitted on just these two texts.

    Parameters:
    text1 (str): First text.
    text2 (str): Second text.

    Returns:
    The single similarity value for the pair.
    """
    # Row 0 holds text1's vector and row 1 holds text2's.
    doc_vectors = TfidfVectorizer().fit_transform([text1, text2])
    first_doc, second_doc = doc_vectors[0:1], doc_vectors[1:2]
    # cosine_similarity returns a 1x1 matrix for a single pair; unwrap it.
    return cosine_similarity(first_doc, second_doc)[0][0]

# TF-IDF cosine similarity between the Whisper transcript and the YouTube captions
similarity = calculate_cosine_similarity(whisper_transcription, youtube_captions)

print(f"Cosine Similarity: {similarity:.2f}")


Cosine Similarity: 0.99


In [21]:
def calculate_bleu(reference, candidate):
    """
    Score a candidate text against a single reference text with BLEU.

    Both texts are tokenized by splitting on whitespace.

    Parameters:
    reference (str): The reference text.
    candidate (str): The candidate text.

    Returns:
    float: The BLEU score.
    """
    # sentence_bleu takes a list of reference token lists; only one is supplied.
    references = [reference.split()]
    hypothesis = candidate.split()

    # method1 smoothing handles cases where there are no matching n-grams.
    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )

In [22]:
# The YouTube captions serve as the reference and the Whisper transcript as the candidate.
bleu_score = calculate_bleu(youtube_captions, whisper_transcription)
print(f"BLEU score: {bleu_score:.4f}")

BLEU score: 0.8973
