<a href="https://colab.research.google.com/github/SG7504/GFGKIIT-Wanderers-Hyperthon/blob/main/HyperThonFINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydub
!pip install torch torchvision torchaudio
!pip install transformers
!pip install openai-whisper
!pip install webrtcvad
!pip install noisereduce
!pip install srt
!pip install librosa
!apt-get install ffmpeg
!pip install scipy

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: open

In [2]:
# Mount Google Drive at /content/drive so the input video and the generated
# subtitle file (both referenced under /content/drive/MyDrive later in this
# notebook) can be read and written. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
from pydub import AudioSegment
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import whisper
import srt
from datetime import timedelta
import webrtcvad
import wave

def preprocess_audio_for_vad(input_audio_path, output_audio_path,
                             vad_mode=2, frame_duration_ms=30, preserve_timing=True):
    """
    Preprocess audio for WebRTC VAD and write a speech-only WAV file.

    Steps:
    - Decode the input with pydub and convert it to mono, 16-bit PCM, 16 kHz
    - Split the PCM data into fixed-size frames and classify each with WebRTC VAD
    - Write the result to ``output_audio_path`` as a mono 16-bit 16 kHz WAV

    Args:
        input_audio_path: Path of the audio/video file to decode.
        output_audio_path: Path of the WAV file to create (overwritten if present).
        vad_mode: Value passed to ``Vad.set_mode``. Defaults to 2, the value
            this function has always used.
        frame_duration_ms: Length of each analysed frame; must be 10, 20 or 30.
        preserve_timing: When True (default), frames without speech are replaced
            by silence of the same length, so the output has the same duration
            as the input and timestamps computed on it line up with the original
            media. When False, non-speech frames (and the trailing partial
            frame) are dropped, which shortens the audio and shifts every later
            timestamp.

    Raises:
        ValueError: If ``frame_duration_ms`` is not 10, 20 or 30.
    """
    if frame_duration_ms not in (10, 20, 30):
        raise ValueError("frame_duration_ms must be 10, 20, or 30")

    print("Preprocessing audio for VAD...")

    sample_rate = 16000  # Hz
    sample_width = 2     # bytes per sample (16-bit PCM)

    # Decode and normalise the format. The sample width is set explicitly so
    # that 8/24/32-bit sources are converted as well.
    audio = AudioSegment.from_file(input_audio_path)
    audio = audio.set_channels(1)
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(sample_width)

    # Work on the decoded PCM bytes directly; no intermediate WAV file is
    # written, so nothing is left behind and nothing can collide with
    # output_audio_path.
    pcm = audio.raw_data
    frame_bytes = sample_rate * frame_duration_ms // 1000 * sample_width

    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)

    silent_frame = bytes(frame_bytes)
    usable = len(pcm) - len(pcm) % frame_bytes  # bytes covered by whole frames
    output_frames = []
    for offset in range(0, usable, frame_bytes):
        frame = pcm[offset:offset + frame_bytes]
        if vad.is_speech(frame, sample_rate):
            output_frames.append(frame)
        elif preserve_timing:
            output_frames.append(silent_frame)

    if preserve_timing:
        # The trailing partial frame is too short to classify; keep its
        # duration as silence so the total length is unchanged.
        output_frames.append(bytes(len(pcm) - usable))

    with wave.open(output_audio_path, "wb") as out_wf:
        out_wf.setnchannels(1)
        out_wf.setsampwidth(sample_width)
        out_wf.setframerate(sample_rate)
        out_wf.writeframes(b"".join(output_frames))

    print(f"Preprocessed audio saved to: {output_audio_path}")


# Transcribe audio to text using Whisper
def transcribe_audio_whisper(audio_path, whisper_model, language="ja"):
    """
    Transcribe an audio file with a loaded Whisper model.

    Args:
        audio_path: Path of the audio file handed to ``whisper_model.transcribe``.
        whisper_model: Object exposing ``transcribe(path, language=...)``.
        language: Language code passed to Whisper. Defaults to "ja", the value
            this function previously hard-coded.

    Returns:
        The ``"segments"`` entry of the transcription result.
    """
    print("Transcribing audio...")
    result = whisper_model.transcribe(audio_path, language=language)
    print("Transcription completed.")
    return result["segments"]


# Translate text using M2M-100
def translate_text_m2m100(text, m2m_model, m2m_tokenizer, src_lang="ja", tgt_lang="en"):
    """
    Translate ``text`` from ``src_lang`` to ``tgt_lang`` with an M2M-100 model.

    Args:
        text: Text to translate.
        m2m_model: Loaded M2M-100 generation model.
        m2m_tokenizer: Matching tokenizer. Its ``src_lang`` attribute is set to
            ``src_lang`` as a side effect.
        src_lang: Source language code (default "ja").
        tgt_lang: Target language code (default "en").

    Returns:
        The decoded translation, or "" when ``text`` is empty or whitespace-only
        (the model is not invoked in that case).
    """
    # Nothing to translate; running the model on blank input only yields noise.
    if not text or not text.strip():
        return ""

    m2m_tokenizer.src_lang = src_lang
    # Put the inputs on the same device as the model so this also works when
    # the caller has moved the model to a GPU.
    encoded = m2m_tokenizer(text, return_tensors="pt").to(m2m_model.device)
    generated_tokens = m2m_model.generate(
        **encoded,
        # Forcing the target-language token as the first generated token is how
        # M2M-100 selects the output language.
        forced_bos_token_id=m2m_tokenizer.get_lang_id(tgt_lang),
        max_length=512,
    )
    return m2m_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


# Create an SRT file from transcription and translation
def create_srt(transcriptions, model, tokenizer):
    """
    Build subtitle entries from transcription segments.

    Each segment's text is stripped of surrounding whitespace and translated
    with ``translate_text_m2m100``. The subtitle content is the translation on
    the first line and the original text in parentheses on the second.
    Segments whose text is blank are skipped, so no ``"\\n()"`` entries are
    produced, and indices stay consecutive starting at 1.

    Args:
        transcriptions: Iterable of mappings with "start" and "end" (seconds)
            and "text" keys.
        model: Translation model forwarded to ``translate_text_m2m100``.
        tokenizer: Tokenizer forwarded to ``translate_text_m2m100``.

    Returns:
        A list of ``srt.Subtitle`` objects in segment order.
    """
    subtitles = []
    for segment in transcriptions:
        japanese_text = segment["text"].strip()
        if not japanese_text:
            continue  # nothing was said in this segment

        english_translation = translate_text_m2m100(japanese_text, model, tokenizer)
        subtitles.append(
            srt.Subtitle(
                index=len(subtitles) + 1,
                start=timedelta(seconds=segment["start"]),
                end=timedelta(seconds=segment["end"]),
                content=f"{english_translation}\n({japanese_text})",
            )
        )
    return subtitles


# Main function
def main(input_video_path, output_srt_path, temp_audio_path="temp_audio.wav",
         whisper_model_name="large", m2m_model_name="facebook/m2m100_418M"):
    """
    Run the full pipeline: preprocess audio, transcribe, translate, write SRT.

    Args:
        input_video_path: Path of the media file to subtitle.
        output_srt_path: Path of the SRT file to write (UTF-8).
        temp_audio_path: Scratch WAV file produced by the preprocessing step.
            It is removed once transcription has finished or failed.
        whisper_model_name: Name passed to ``whisper.load_model``
            (default "large").
        m2m_model_name: Checkpoint name passed to the M2M-100
            ``from_pretrained`` calls (default "facebook/m2m100_418M").

    Raises:
        FileNotFoundError: If ``input_video_path`` is not an existing file.
    """
    # Fail fast: check the input before loading the models.
    if not os.path.isfile(input_video_path):
        raise FileNotFoundError(f"Input file not found: {input_video_path}")

    # Load Whisper and M2M-100 models
    print("Loading models...")
    whisper_model = whisper.load_model(whisper_model_name)
    m2m_tokenizer = M2M100Tokenizer.from_pretrained(m2m_model_name)
    m2m_model = M2M100ForConditionalGeneration.from_pretrained(m2m_model_name)
    print("Models loaded successfully.")

    try:
        # Preprocess audio
        preprocess_audio_for_vad(input_video_path, temp_audio_path)

        # Transcribe audio
        transcriptions = transcribe_audio_whisper(temp_audio_path, whisper_model)
    finally:
        # The scratch audio is only needed for transcription; remove it even
        # when one of the steps above raises.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

    # Create subtitles
    print("Creating SRT file...")
    subtitles = create_srt(transcriptions, m2m_model, m2m_tokenizer)
    with open(output_srt_path, "w", encoding="utf-8") as f:
        f.write(srt.compose(subtitles))
    print(f"Subtitle file created: {output_srt_path}")


# Entry point: run the subtitle pipeline when this cell/script is executed directly
if __name__ == "__main__":
    # Source media on the mounted Drive; replace with your actual file path
    VIDEO_PATH = "/content/drive/MyDrive/Hyperthon/test.mp4"
    # Destination of the generated subtitles; replace with the desired output path
    OUTPUT_SRT_PATH = "/content/drive/MyDrive/Hyperthon/outputNew2.srt"

    main(input_video_path=VIDEO_PATH, output_srt_path=OUTPUT_SRT_PATH)


Loading models...


100%|█████████████████████████████████████| 2.88G/2.88G [00:36<00:00, 84.3MiB/s]
  checkpoint = torch.load(fp, map_location=device)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Models loaded successfully.
Preprocessing audio for VAD...
Preprocessed audio saved to: temp_audio.wav
Transcribing audio...
Transcription completed.
Creating SRT file...
Subtitle file created: /content/drive/MyDrive/Hyperthon/outputNew2.srt
