Step 1 -> Install the dependencies from requirements.txt, or run the cell below to install all of them.

In [None]:
!pip install transformers git+https://github.com/openai/whisper.git langdetect moviepy rouge-score pytube

Step 2 -> Run the cell below.

It takes audio or video in any language as input, produces a transcript, translates the transcript to English, and generates a summary of the audio/video.

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from langdetect import detect, LangDetectException
from moviepy.editor import VideoFileClip
from rouge_score import rouge_scorer
import time
from pytube import YouTube

# Run on the GPU in half precision when CUDA is available; otherwise CPU/float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Loading Models
whisper_model_id = "openai/whisper-medium"  # Whisper model
# NOTE: the "rag_" names are historical; this is a plain FLAN-T5 seq2seq model
# used through prompts, with no retrieval step.
rag_model_name = "google/flan-t5-base"

whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
).to(device)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

rag_model = AutoModelForSeq2SeqLM.from_pretrained(rag_model_name).to(device)
rag_tokenizer = AutoTokenizer.from_pretrained(rag_model_name, add_special_tokens=True)

# Pipelines
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    max_new_tokens=128,
    # Whisper consumes 30-second windows. Longer chunks are cut down to 30 s by
    # the feature extractor, silently dropping audio, so chunk at window size.
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

# Prompt-driven text2text pipeline used for both translation and summarization.
rag_pipeline = pipeline(
    "text2text-generation",
    model=rag_model,
    tokenizer=rag_tokenizer,
    device=device,
)

# Rouge for Summarization Evaluation
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def chunk_text(text, max_length=500):
    """Split *text* into whitespace-delimited chunks of at most *max_length* characters.

    Words are never split: a single word longer than ``max_length`` becomes a
    chunk of its own (and is therefore the only case in which a chunk may
    exceed ``max_length``). Runs of whitespace are collapsed to single spaces.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings; empty for empty/whitespace-only text.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # len(" ".join(current_chunk)), tracked incrementally

    for word in text.split():
        # One extra character for the joining space once the chunk has words.
        projected = current_length + len(word) + (1 if current_chunk else 0)
        if projected <= max_length:
            current_chunk.append(word)
            current_length = projected
            continue

        # Flush only when something is buffered, so an oversized first word
        # does not produce an empty chunk.
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        current_chunk = [word]
        current_length = len(word)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def translate_chunks(chunks):
    """Translate every chunk to English and return the results as one string.

    Each chunk is sent to ``rag_pipeline`` with a translation prompt (beam
    search, ``num_beams=5``, ``max_length=300``). The generated texts are
    joined with single spaces and surrounding whitespace is stripped.

    Args:
        chunks: Iterable of text chunks to translate.

    Returns:
        The space-joined translations; an empty string when there are no chunks.
    """
    pieces = [
        rag_pipeline(
            f"Translate the following text to English: {piece}",
            max_length=300,
            num_beams=5,
        )[0]['generated_text']
        for piece in chunks
    ]
    return " ".join(pieces).strip()

def process_media_file(media_file):
    """Transcribe a media file, translate the transcript to English and summarize it.

    Video inputs (``.mp4``, ``.avi``, ``.mov``) first have their audio track
    written to ``extracted_audio.wav``; any other path is handed to the
    Whisper pipeline unchanged. Timings and ROUGE scores are printed.

    Args:
        media_file: Path to a local audio or video file.

    Returns:
        A dict with keys ``original_text``, ``translation`` and ``summary``,
        or ``None`` when any step fails (the error is printed).
    """
    try:
        # Extract audio from video if needed
        if media_file.lower().endswith(('.mp4', '.avi', '.mov')):
            audio_file = "extracted_audio.wav"
            video_clip = VideoFileClip(media_file)
            try:
                if video_clip.audio is None:
                    raise ValueError(f"No audio track found in {media_file}")
                video_clip.audio.write_audiofile(audio_file)
            finally:
                # Always release the reader resources held by the clip.
                video_clip.close()
        else:
            audio_file = media_file

        # Speech Recognition
        start_time = time.time()
        result = whisper_pipe(audio_file)
        end_time = time.time()
        print(f"Speech Recognition Time: {end_time - start_time:.2f} seconds")
        text = result["text"]
        print(f"Transcribed Text: {text}")

        if not text.strip():
            # Nothing was recognized; prompting the model with empty text would
            # only yield invented output.
            print("No speech was transcribed; skipping translation and summarization.")
            return {
                "original_text": text,
                "translation": "",
                "summary": "",
            }

        # Language Detection with Error Handling
        try:
            lang = detect(text)
            print(f"Detected Language: {lang}")
        except LangDetectException:
            lang = "unknown"
            print("Language could not be detected.")

        # Translation using RAG with chunking
        if lang == "en":
            # Already English: re-translating can only reword or shorten it.
            translation_text = text.strip()
        else:
            chunks = chunk_text(text, max_length=200)
            translation_text = translate_chunks(chunks)

        # Summarization using RAG
        # NOTE(review): the whole translation goes into one prompt, so long
        # transcripts can exceed the tokenizer's 512-token limit.
        start_time = time.time()
        summary = rag_pipeline(f"Give the precise Summary of: {translation_text}", max_length=300, num_beams=5)
        end_time = time.time()
        print(f"Summarization Time: {end_time - start_time:.2f} seconds")
        summary_text = summary[0]['generated_text']

        # Evaluating Summarization
        # RougeScorer.score takes (target, prediction): the translated text is
        # the reference and the summary is the candidate being scored.
        scores = rouge.score(translation_text, summary_text)
        print("Summarization Evaluation:")
        for key, value in scores.items():
            print(f"{key}: {value.fmeasure:.2f}")

        return {
            "original_text": text,
            "translation": translation_text,
            "summary": summary_text,
        }

    except Exception as e:
        # Top-level boundary for the interactive loop: report and signal failure.
        print(f"Error processing file: {e}")
        return None

def process_youtube_link(youtube_link):
    """Download a YouTube video, extract its audio and run the media pipeline on it.

    The highest-resolution progressive MP4 stream is saved as
    ``downloaded_video.mp4``; its audio track is written to
    ``extracted_audio.wav`` and passed to ``process_media_file``.

    Args:
        youtube_link: URL of the YouTube video.

    Returns:
        Whatever ``process_media_file`` returns for the extracted audio, or
        ``None`` when downloading or extraction fails (the error is printed).
    """
    try:
        # Download the video
        yt = YouTube(youtube_link)
        stream = (
            yt.streams.filter(progressive=True, file_extension='mp4')
            .order_by('resolution')
            .desc()
            .first()
        )
        if stream is None:
            # first() yields None when no stream matches the filter.
            raise ValueError("No progressive MP4 stream is available for this video")
        stream.download(filename='downloaded_video.mp4')

        # Extract audio from the downloaded video
        audio_file = "extracted_audio.wav"
        video_clip = VideoFileClip('downloaded_video.mp4')
        try:
            if video_clip.audio is None:
                raise ValueError("Downloaded video has no audio track")
            video_clip.audio.write_audiofile(audio_file)
        finally:
            # Always release the reader resources held by the clip.
            video_clip.close()

        return process_media_file(audio_file)

    except Exception as e:
        # Top-level boundary for the interactive loop: report and signal failure.
        print(f"Error processing YouTube link: {e}")
        return None

def is_youtube_link(media_input):
    """Return True when *media_input* is an http(s) URL on a YouTube host.

    Accepts ``youtube.com``, any of its subdomains (``www.``, ``m.``,
    ``music.`` ...) and ``youtu.be`` short links. Surrounding whitespace is
    ignored. Look-alike hosts such as ``www.youtube.com.example.org`` are
    rejected.
    """
    from urllib.parse import urlparse  # local import keeps the cell self-contained

    try:
        parsed = urlparse(media_input.strip())
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): treat as a file path.
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    host = (parsed.hostname or "").lower()
    return host in ("youtu.be", "youtube.com") or host.endswith(".youtube.com")


# Interactive loop: process one YouTube link or local file per iteration until
# the user answers anything other than "y".
if __name__ == "__main__":
    while True:
        media_input = input("Enter YouTube link or local file path: ").strip()
        if not media_input:
            print("No input provided.")
            results = None
        elif is_youtube_link(media_input):
            results = process_youtube_link(media_input)
        else:
            results = process_media_file(media_input)

        if results is not None:
            print("----------------------------------")
            print("Results:")
            print("----------------------------------")
            print(f"Translation: {results['translation']}")
            print("----------------------------------")
            print(f"Summary: {results['summary']}")

        continue_processing = input("Process another file? (y/n): ")
        if continue_processing.strip().lower() != 'y':
            break



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Enter YouTube link or local file path:  /kaggle/input/french-audio/french.wav
Speech Recognition Time: 1.94 seconds

Transcribed Text:  Parcourant des routes de campagne bueuse et s'adressant jour après jour à des auditoires humides dans des salles de classe pleine de courant d'air pendant 15 jours, il devra se présenter dans un lieu de culte le dimanche matin et pourra venir chez nous immédiatement après.

Detected Language: fr

Summarization Time: 1.31 seconds

Summarization Evaluation:

rouge1: 0.81

rouge2: 0.80

rougeL: 0.81

----------------------------------

Results:

----------------------------------

Translation: Located on a blustery field and scurrying day after day to humid auditoriums in class rooms full of air for 15 days, he must be present in a place. de culte le dimanche matin et peut venir chez nous immédiatement après.

----------------------------------

Summary: Located on a blustery field and scurrying day after day to humid auditoriums in class rooms full of ai

                                                                      

MoviePy - Done.

Speech Recognition Time: 5.21 seconds

Transcribed Text:  El pájaro y la ballena Una historia original de TheFableCottage.com Una vez, hubo un pájaro que amaba a una ballena y una ballena que amaba a un pájaro. Al pájaro le encantaba la hermosa sonrisa de la ballena. Le encantaba cómo nadaba elegantemente por el agua. El pájaro y la ballena se encontraron en la bahía. Hablaron y hablaron. Hablaron de la luna, de las olas y de los barcos en el océano. El pájaro contó chistes que hicieron reír a la ballena. La ballena cantó hermosas canciones que hicieron solo porque un pájaro y una ballena se enamoran. El verano se transformó en otoño y el otoño se transformó en invierno. El océano se volvió frío y todas las ballenas partieron hacia aguas más cálidas. Ven conmigo donde hay aguas cálidas, dijo la ballena. pero primero enséñame a ser una ballena. Así dijo la ballena. Sígueme. Y se sumergió en el agua. De acuerdo dijo el pájaro. Mejor ven conmigo, vivo arriba en los encant

Token indices sequence length is longer than the specified maximum sequence length for this model (577 > 512). Running this sequence through the model will result in indexing errors


Summarization Time: 6.41 seconds

Summarization Evaluation:

rouge1: 0.66

rouge2: 0.66

rougeL: 0.66

----------------------------------

Results:

----------------------------------

Translation: El pájaro y la ballena An original story from TheFableCottage.com Once, there was a pájaro that amaba to a ballena and a ballena that amaba to a pájaro. Al pájaro le encantaba la hermosa sonrisa de la ballena. pájaro y la ballena encontraron en la baa. Hablaron y hablaron. Hablaron de la luna, de las olas y de los barcos en el océano. pájaro contó chistes que hicieron rer a la ballena. La ballena cantó her beautiful canciones que hicieron solo porque un pájaro y una ballena se enamoran. El verano se transformó en otoo y el otoo se transformó. transformed into winter. The ocean turned cold and all the birds partied towards the most cálid waters. See me where there are cálid waters, said the bird. but first I would like to be a bird. So said the ballerina. Sing me. And he poured in the water. 

                                                                    

MoviePy - Done.

Speech Recognition Time: 2.47 seconds

Transcribed Text:  Hey there, this is a quick and silly video to allow you to experiment a little bit with the process of transcription on YouTube. All I'm looking for you to do here is to use the YouTube tool to transcribe this message and then click Sync and set the timing so you can get a quick idea about how the whole process works. Well, this wraps up the video. Good luck and I will talk to you about it soon.

Detected Language: en

Summarization Time: 1.87 seconds

Summarization Evaluation:

rouge1: 0.89

rouge2: 0.89

rougeL: 0.89

----------------------------------

Results:

----------------------------------

Translation: Hi there, this is a quick and silly video to allow you to experiment a little bit with the process of transcription on YouTube. All I'm looking for you to do here is to use the YouTube tool to transcribe this message and then click Sync and set the timing so you can get a quick idea about how the whole 

The cell below takes audio or video in any language as input, produces a transcript, translates the transcript to French, and generates a summary of the audio/video.

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from langdetect import detect, LangDetectException
from moviepy.editor import VideoFileClip
from rouge_score import rouge_scorer
import time
from pytube import YouTube  

# Run on the GPU in half precision when CUDA is available; otherwise CPU/float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Loading Models
whisper_model_id = "openai/whisper-large-v3"  # Whisper model
# NOTE: the "rag_" names are historical; this is a plain FLAN-T5 seq2seq model
# used through prompts, with no retrieval step.
rag_model_name = "google/flan-t5-large"

whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
).to(device)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

rag_model = AutoModelForSeq2SeqLM.from_pretrained(rag_model_name).to(device)

rag_tokenizer = AutoTokenizer.from_pretrained(rag_model_name, add_special_tokens=True)

# Pipelines
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    max_new_tokens=128,
    # Whisper consumes 30-second windows. Longer chunks are cut down to 30 s by
    # the feature extractor, silently dropping audio, so chunk at window size.
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

# Prompt-driven text2text pipeline used for both translation and summarization.
rag_pipeline = pipeline(
    "text2text-generation",
    model=rag_model,
    tokenizer=rag_tokenizer,
    device=device,
)

# Rouge for Summarization Evaluation
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def process_media_file(media_file):
    """Transcribe a media file, translate the transcript to French and summarize it.

    Video inputs (``.mp4``, ``.avi``, ``.mov``) first have their audio track
    written to ``extracted_audio.wav``; any other path is handed to the
    Whisper pipeline unchanged. Timings and ROUGE scores are printed.

    Args:
        media_file: Path to a local audio or video file.

    Returns:
        A dict with keys ``original_text``, ``translation`` and ``summary``,
        or ``None`` when any step fails (the error is printed).
    """
    try:
        # Extract audio from video if needed
        if media_file.lower().endswith(('.mp4', '.avi', '.mov')):
            audio_file = "extracted_audio.wav"
            video_clip = VideoFileClip(media_file)
            try:
                if video_clip.audio is None:
                    raise ValueError(f"No audio track found in {media_file}")
                video_clip.audio.write_audiofile(audio_file)
            finally:
                # Always release the reader resources held by the clip.
                video_clip.close()
        else:
            audio_file = media_file

        # Speech Recognition
        start_time = time.time()
        result = whisper_pipe(audio_file)
        end_time = time.time()
        print(f"Speech Recognition Time: {end_time - start_time:.2f} seconds")
        text = result["text"]
        print(f"Transcribed Text: {text}")

        if not text.strip():
            # Nothing was recognized; prompting the model with empty text would
            # only yield invented output.
            print("No speech was transcribed; skipping translation and summarization.")
            return {
                "original_text": text,
                "translation": "",
                "summary": "",
            }

        # Language Detection with Error Handling
        try:
            lang = detect(text)
            print(f"Detected Language: {lang}")
        except LangDetectException:
            lang = "unknown"
            print("Language could not be detected.")

        # Translation using RAG
        if lang == "fr":
            # Already French: re-translating can only reword or shorten it.
            translation_text = text.strip()
        else:
            # NOTE(review): the transcript is translated in a single call capped
            # at max_length=100, so long transcripts yield a cut-off translation.
            start_time = time.time()
            translation = rag_pipeline(f"Translate the Following to French: {text}", max_length=100, num_beams=5)
            end_time = time.time()
            print(f"Translation Time: {end_time - start_time:.2f} seconds")
            translation_text = translation[0]['generated_text']
            # Filter special tokens from translation
            for sentinel in ("<extra_id_0>", "<extra_id_1>", "<extra_id_2>"):  # Add more as needed
                translation_text = translation_text.replace(sentinel, "")

        # Summarization using RAG
        start_time = time.time()
        summary = rag_pipeline(f"Summarize it precisely: {translation_text}", max_length=100, num_beams=5)
        end_time = time.time()
        print(f"Summarization Time: {end_time - start_time:.2f} seconds")
        summary_text = summary[0]['generated_text']

        # Evaluating Summarization
        # RougeScorer.score takes (target, prediction): the translated text is
        # the reference and the summary is the candidate being scored.
        scores = rouge.score(translation_text, summary_text)
        print("Summarization Evaluation:")
        for key, value in scores.items():
            print(f"{key}: {value.fmeasure:.2f}")

        return {
            "original_text": text,
            "translation": translation_text,
            "summary": summary_text,
        }

    except Exception as e:
        # Top-level boundary for the interactive loop: report and signal failure.
        print(f"Error processing file: {e}")
        return None

def process_youtube_link(youtube_link):
    """Download a YouTube video, extract its audio and run the media pipeline on it.

    The highest-resolution progressive MP4 stream is saved as
    ``downloaded_video.mp4``; its audio track is written to
    ``extracted_audio.wav`` and passed to ``process_media_file``.

    Args:
        youtube_link: URL of the YouTube video.

    Returns:
        Whatever ``process_media_file`` returns for the extracted audio, or
        ``None`` when downloading or extraction fails (the error is printed).
    """
    try:
        # Download the video
        yt = YouTube(youtube_link)
        stream = (
            yt.streams.filter(progressive=True, file_extension='mp4')
            .order_by('resolution')
            .desc()
            .first()
        )
        if stream is None:
            # first() yields None when no stream matches the filter.
            raise ValueError("No progressive MP4 stream is available for this video")
        stream.download(filename='downloaded_video.mp4')

        # Extract audio from the downloaded video
        audio_file = "extracted_audio.wav"
        video_clip = VideoFileClip('downloaded_video.mp4')
        try:
            if video_clip.audio is None:
                raise ValueError("Downloaded video has no audio track")
            video_clip.audio.write_audiofile(audio_file)
        finally:
            # Always release the reader resources held by the clip.
            video_clip.close()

        return process_media_file(audio_file)

    except Exception as e:
        # Top-level boundary for the interactive loop: report and signal failure.
        print(f"Error processing YouTube link: {e}")
        return None

def is_youtube_link(media_input):
    """Return True when *media_input* is an http(s) URL on a YouTube host.

    Accepts ``youtube.com``, any of its subdomains (``www.``, ``m.``,
    ``music.`` ...) and ``youtu.be`` short links. Surrounding whitespace is
    ignored. Look-alike hosts such as ``www.youtube.com.example.org`` are
    rejected.
    """
    from urllib.parse import urlparse  # local import keeps the cell self-contained

    try:
        parsed = urlparse(media_input.strip())
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): treat as a file path.
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    host = (parsed.hostname or "").lower()
    return host in ("youtu.be", "youtube.com") or host.endswith(".youtube.com")


# Interactive loop: process one YouTube link or local file per iteration until
# the user answers anything other than "y".
if __name__ == "__main__":
    while True:
        media_input = input("Enter YouTube link or local file path: ").strip()
        if not media_input:
            print("No input provided.")
            results = None
        elif is_youtube_link(media_input):
            results = process_youtube_link(media_input)
        else:
            results = process_media_file(media_input)

        if results is not None:
            print("----------------------------------")
            print("Results:")
            print("----------------------------------")
            print(f"Translation: {results['translation']}")
            print("----------------------------------")
            print(f"Summary: {results['summary']}")

        continue_processing = input("Process another file? (y/n): ")
        if continue_processing.strip().lower() != 'y':
            break


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Enter YouTube link or local file path:  /kaggle/input/audio/sample.mp3
Speech Recognition Time: 1.91 seconds

Transcribed Text:  Going along slushy country roads and speaking to damp audiences in drafty school rooms day after day for a fortnight, he will have to put in an appearance at some place of worship on Sunday morning and he can come to us immediately afterwards.

Detected Language: en

Translation Time: 3.63 seconds

Summarization Time: 3.53 seconds

Summarization Evaluation:

rouge1: 0.97

rouge2: 0.95

rougeL: 0.97

----------------------------------

Results:

----------------------------------

Translation: Pour une semaine, il y aura lieu à travers des rues rurales et à parler à des audiences humides dans des salles d'école à l'insuffisance, et il y aura lieu à s'attendre à l'ouverture d'un église et à s'attendre à nous.

----------------------------------

Summary: Pour une semaine, il y aura lieu à travers des rues rurales et à parler à des audiences humides dans des sal