In [1]:
!pip install whisper_timestamped librosa numpy
!sudo apt-get install ffmpeg
!pip install -q pydub ffmpeg

Collecting whisper_timestamped
  Downloading whisper_timestamped-1.15.8-py3-none-any.whl.metadata (1.2 kB)
Collecting dtw-python (from whisper_timestamped)
  Downloading dtw_python-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai-whisper (from whisper_timestamped)
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper->whisper_timestamped)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pydub import AudioSegment

def convert_to_wav(input_path, output_path):
    """
    Converts an audio file (MP3/MP4) to WAV format.

    Parameters:
        input_path (str): Path to the input audio/video file.
        output_path (str): Path to save the converted WAV file.
    """
    try:
        # Load audio/video file
        audio = AudioSegment.from_file(input_path)

        # Export as WAV
        audio.export(output_path, format="wav")
        print(f"Converted successfully: {output_path}")
    except Exception as e:
        print(f"Error during conversion: {e}")

# Example usage
input_file = "/content/drive/MyDrive/Hackathon/sample.mp4"  # Update with your actual file path
output_file = "/content/drive/MyDrive/Hackathon/sample.wav"
convert_to_wav(input_file, output_file)


Converted successfully: /content/drive/MyDrive/Hackathon/sample.wav


In [4]:
import whisper_timestamped as whisper
import librosa
import numpy as np
import torch

def preprocess_audio(file_path):
    """Preprocess audio for optimal Whisper performance"""
    # Load audio with resampling to 16kHz
    audio, sr = librosa.load(file_path, sr=16000)

    # Noise reduction using spectral gating (using preemphasis as a proxy)
    audio_clean = librosa.effects.preemphasis(audio)

    # Normalize audio to -3dB peak
    audio_norm = librosa.util.normalize(audio_clean) * 10**(-3/20)

    # Trim leading/trailing silence
    audio_trimmed, _ = librosa.effects.trim(audio_norm, top_db=20)

    return audio_trimmed, sr

def transcribe_with_timestamps(audio_path):
    """Transcribe audio with word-level timestamps"""
    # Load the Whisper model with appropriate device selection
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("small", device=device)

    # Preprocess audio file
    audio_array, sr = preprocess_audio(audio_path)

    # Define transcription parameters
    transcription_params = {
        "language": "en",
        "beam_size": 5,
        "best_of": 5,
        "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
        "detect_disfluencies": True,
        "vad": True  # Voice activity detection
    }

    # Transcribe with advanced timestamp configuration
    result = whisper.transcribe(
        model,
        audio_array,
        **transcription_params
    )

    # Save the transcription parameters to a file called "params.txt"
    save_params(transcription_params, "/content/drive/My Drive/Hackathon/params.txt")

    return result

def ts_to_srt(seconds):
    """Convert seconds to SRT timestamp format"""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    return f"{hours:02}:{minutes:02}:{seconds:06.3f}".replace(".", ",")

def save_transcription(result, output_format="srt", output_path="transcription.srt"):
    """Save transcription in specified format
       - 'srt' : Save with timestamps in SRT format.
       - 'txt' : Save only the plain text transcript (without timestamps).
    """
    if output_format == "srt":
        with open(output_path, "w") as f:
            for segment in result["segments"]:
                start = segment["start"]
                text = segment["text"].strip()
                # Write SRT line in the format: [HH:MM:SS] Subtitle text
                f.write(f"[{ts_to_srt(start)}] {text}\n")
    elif output_format == "txt":
        with open(output_path, "w") as f:
            for segment in result["segments"]:
                text = segment["text"].strip()
                f.write(f"{text}\n")

def save_params(params, output_path="params.txt"):
    """Save the transcription parameters to a file"""
    with open(output_path, "w") as f:
        for key, value in params.items():
            f.write(f"{key}: {value}\n")

if __name__ == "__main__":
    # Set your audio file path here (ensure it exists)
    audio_file = output_file  # Replace with your file path or use Colab file upload

    # 1. Preprocess and transcribe
    transcription = transcribe_with_timestamps(audio_file)

    # 2. Save results in SRT and plain text formats to the desired paths.
    save_transcription(transcription, output_format="srt", output_path="/content/drive/My Drive/Hackathon/transcription.srt")
    save_transcription(transcription, output_format="txt", output_path="/content/drive/My Drive/Hackathon/transcription.txt")

    print("Transcription complete with word-level timestamps and plain text output")


Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



100%|████████████████████████████████████████| 461M/461M [00:04<00:00, 109MiB/s]
  checkpoint = torch.load(fp, map_location=device)
Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /root/.cache/torch/hub/master.zip
100%|██████████| 56887/56887 [01:38<00:00, 576.62frames/s]


Transcription complete with word-level timestamps and plain text output


In [5]:
import re
from transformers import pipeline

In [6]:
def parse_srt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the content into subtitle blocks
    subtitle_blocks = re.split(r'\n\n', content.strip())

    # Extract text from each subtitle block
    texts = []
    for block in subtitle_blocks:
        lines = block.split('\n')
        if len(lines) >= 3:
            text = ' '.join(lines[2:])
            texts.append(text)

    return ' '.join(texts)

# Load the transcript file from your Google Drive
transcript_file = '/content/drive/MyDrive/Hackathon/transcription.srt'  # Replace with your file path
full_text = parse_srt(transcript_file)

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Split the text into chunks of 1024 tokens (adjust as needed)
max_chunk_length = 1024
chunks = [full_text[i:i+max_chunk_length] for i in range(0, len(full_text), max_chunk_length)]

# Summarize each chunk
summaries = []
for chunk in chunks:
    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summaries.append(summary[0]['summary_text'])

# Combine the summaries
final_summary = ' '.join(summaries)

print("Summary of the transcript:")
print(final_summary)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Summary of the transcript:
CNN's John Defterios talks to a Canadian man about what he searches for on Google. He reveals that he's never been contacted by a government agency about anything. He also reveals that Canada is not as strict as the U.S. on drug laws. YouTube has this kind of weird ecosystem where stuff that channels that can only exist in a certain country. And not in others. Like how there's a bunch of gun content on for America. And it's like, no other country could ev Canada is one of the few places in the world that you could have like a proper, [00:01:21,840] and there is the explosions in fire. The Australian guy. The government thought I was making meth. When you do research, people call it like, [00:02:05,620] it's not really a term that people use, I guess. So usually sub a gram, right? So like maybe up to five grams. "I don't make too much to make, let's say 10 grams. I need to work with like three liters. So you're no longer in the one liter beaker. You're kind of