In [None]:
import subprocess
import sys

# Install into the environment of the running kernel: a bare `pip` found on
# PATH may belong to a different interpreter than the one executing this
# notebook, so pip is invoked as a module of sys.executable.
#
# Package notes:
#   * 'moviepy<2'     - the notebook imports `moviepy.editor`, a module that
#                       MoviePy 2.x no longer provides.
#   * 'python-dotenv' - the distribution that ships the `dotenv` module
#                       (`from dotenv import load_dotenv`).
#
# stdout is silenced to keep the cell quiet. stderr stays attached so that,
# when check_call raises CalledProcessError, pip's own error text is visible.
subprocess.check_call(
    [sys.executable, '-m', 'pip', 'install',
     'moviepy<2', 'demucs', 'pydub', 'python-dotenv', 'openai',
     'noisereduce', 'pyannote.audio', 'openai-whisper'],
    stdout=subprocess.DEVNULL,
)

0

In [None]:
import os
import subprocess

# Extend the current environment instead of replacing it: passing a dict that
# contains only LC_ALL would strip PATH, HOME, proxy settings, etc. from the
# child processes.
apt_env = {**os.environ, 'LC_ALL': 'C.UTF-8'}

# Refreshing the package index is left unchecked, as before.
subprocess.run(['apt-get', 'update'], env=apt_env)
# The install itself has to succeed for later cells; check=True raises
# CalledProcessError on a non-zero exit instead of silently continuing.
subprocess.run(['apt-get', 'install', '-y', 'espeak'], env=apt_env, check=True)

CompletedProcess(args=['apt-get', 'install', '-y', 'espeak'], returncode=0)

In [None]:
# Video processing and editing libraries
import cv2
from moviepy.editor import VideoFileClip, ImageSequenceClip, AudioClip, concatenate_videoclips, ColorClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

# Audio processing libraries
import torch
import torchaudio
import numpy as np
from pydub import AudioSegment
import soundfile as sf

# Models and audio source separation / ASR
import whisper
from demucs.pretrained import get_model
from demucs.apply import apply_model

# Voice activity detection (pyannote)
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook

# OpenAI API and environment variable handling
from openai import OpenAI
from dotenv import load_dotenv
import json
import os

# Memory management
import gc

# Ignore Warning
import warnings
warnings.filterwarnings("ignore")
import librosa

  if event.key is 'enter':

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


# Audio

In [None]:
def get_normalized_audio(video_path):
    """Decode the audio track of a media file into normalized float32 samples.

    Args:
        video_path: Path of the file to decode. It is opened through pydub
            with the container format fixed to "mp4".

    Returns:
        np.ndarray of dtype float32, scaled by the full-scale magnitude of the
        integer sample width (so signed PCM lands in roughly [-1.0, 1.0)).
        Mono audio is returned 1-D with shape (n_samples,); audio with more
        than one channel is returned as (n_samples, n_channels).
    """
    audio_segment = AudioSegment.from_file(video_path, format="mp4")
    audio_samples = np.array(audio_segment.get_array_of_samples())

    # The samples arrive as one flat, channel-interleaved sequence.
    # De-interleave for any channel count (e.g. 5.1 tracks), not only stereo.
    channels = audio_segment.channels
    if channels > 1:
        audio_samples = audio_samples.reshape((-1, channels))

    # Full-scale magnitude of a signed integer sample,
    # e.g. 2 ** (16 - 1) = 32768 for 2-byte samples.
    sample_width = audio_segment.sample_width
    max_val = float(2 ** (8 * sample_width - 1))
    audio_float = audio_samples.astype(np.float32) / max_val

    # Drop the large intermediates right away; only the float copy is needed.
    del audio_samples, audio_segment
    gc.collect()

    return audio_float

# Source clip whose audio track feeds every later stage of the notebook.
video_path = "/content/Why you little ( Bart ).mp4"
audio = get_normalized_audio(video_path=video_path)

# Demucs

In [None]:
def separate_vocals_and_background(audio, device):
    """Split a mix into a vocal stem and a background stem with Demucs.

    Args:
        audio: Waveform as a NumPy array, either (n_samples, n_channels) or
            1-D mono. A mono signal is duplicated onto two channels before
            separation (assumes the pretrained 'htdemucs' model works on
            stereo input - TODO confirm against the Demucs docs).
        device: torch device (or device string) the model and the input
            tensor are moved to.

    Returns:
        (vocals, background): two NumPy arrays laid out like the input, i.e.
        time on the first axis and channels on the second. `background` is
        the sum of every stem that is not named 'vocals'.

    Raises:
        ValueError: if the loaded model exposes no source named 'vocals'.
    """
    samples = np.asarray(audio)
    if samples.ndim == 1:
        # Without a channel axis the transposed tensor below would be 2-D and
        # apply_model could not unpack it as (batch, channels, time).
        samples = np.stack([samples, samples], axis=1)

    # (n_samples, n_channels) -> (1, n_channels, n_samples)
    audio_tensor = torch.tensor(samples.T).float().unsqueeze(0).to(device)
    model = get_model('htdemucs').to(device)
    model.eval()
    with torch.no_grad():
        estimates = apply_model(model, audio_tensor, shifts=1, overlap=0.5)

    # Drop the batch axis: one (channels, time) array per separated stem.
    estimates_np = estimates.squeeze(0).cpu().numpy()

    # Look the stems up by name instead of relying on a fixed index layout.
    stem_names = list(model.sources)
    vocals_index = stem_names.index('vocals')

    vocals = estimates_np[vocals_index].T
    background = sum(
        estimates_np[i] for i in range(len(stem_names)) if i != vocals_index
    ).T

    # Free model and tensors eagerly; later stages load other large models.
    del audio_tensor, estimates, estimates_np, model
    torch.cuda.empty_cache()
    gc.collect()

    return vocals, background

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
vocals, background = separate_vocals_and_background(audio=audio, device=device)

# Save Audio

In [None]:
# Output locations of the two separated stems.
vocals_file_root = "/content/vocals.wav"
background_file_root = "/content/background.wav"

# Both stems are written as WAV at a fixed 44100 Hz sample rate.
sf.write(vocals_file_root, vocals, samplerate=44100)
sf.write(background_file_root, background, samplerate=44100)

# The in-memory vocals and the original mix are not used again below;
# later cells read the vocals back from disk.
del audio
del vocals
gc.collect()

0

# Speech-To-Text

In [None]:
def _clip_segment_to_speech(segment, speech_regions):
    """Clip one transcript segment to the speech region it overlaps the most."""
    best_overlap = 0.0
    best_bounds = None
    for speech_start, speech_end in speech_regions:
        start = max(segment['start'], speech_start)
        end = min(segment['end'], speech_end)
        # Only a strictly positive overlap counts; regions that merely touch
        # the segment would otherwise produce a zero-length result.
        if end - start > best_overlap:
            best_overlap = end - start
            best_bounds = (start, end)

    text = segment['text'].strip()
    if best_bounds is None:
        # No speech region overlaps: keep the segment's own timing (and any
        # extra keys it carries), with the text normalized like the others.
        return {**segment, 'text': text}
    return {'start': best_bounds[0], 'end': best_bounds[1], 'text': text}


def speech_to_text_with_vad(audio_file, device='cuda'):
    """Transcribe an audio file with Whisper and tighten segment times with VAD.

    Each Whisper segment is clipped to the pyannote speech region that
    overlaps it the most; segments with no overlapping speech region keep
    their original timing.

    Args:
        audio_file: Path of the audio file to transcribe.
        device: Device (string or torch.device) used for both the Whisper
            model and the VAD pipeline.

    Returns:
        dict with:
            'segments': list of dicts holding at least 'start', 'end' and
                'text' (text is stripped of surrounding whitespace).
            'output_string': one "[start ~ end] text" line per segment.

    Raises:
        RuntimeError: if the VAD pipeline could not be loaded.
    """
    # The pipeline is moved with an explicit torch.device built from `device`,
    # so the argument is honored instead of always targeting "cuda".
    target_device = torch.device(device)

    with torch.no_grad():
        model = whisper.load_model("medium", device=device)
        # The Hugging Face token is read from the HF_TOKEN environment
        # variable rather than written into the notebook; when the variable
        # is unset the empty string is passed, as before.
        vad_pipeline = Pipeline.from_pretrained(
            "pyannote/voice-activity-detection",
            use_auth_token=os.environ.get("HF_TOKEN", ""),
        )
        if vad_pipeline is None:
            # NOTE(review): pyannote appears to return None when the
            # checkpoint cannot be fetched (e.g. gated model, missing token).
            raise RuntimeError(
                "Could not load 'pyannote/voice-activity-detection'; "
                "check the HF_TOKEN environment variable and model access."
            )
        vad_pipeline.to(target_device)
        result = model.transcribe(audio_file, word_timestamps=True)

    vad_output = vad_pipeline(audio_file)

    # Materialize the speech regions once instead of rebuilding the timeline
    # for every transcript segment.
    speech_regions = [
        (speech.start, speech.end) for speech in vad_output.get_timeline()
    ]

    refined_segments = [
        _clip_segment_to_speech(segment, speech_regions)
        for segment in result['segments']
    ]

    output_string = "".join(
        f"[{segment['start']:05.2f} ~ {segment['end']:05.2f}] {segment['text']}\n"
        for segment in refined_segments
    )

    # Release both models before the next stage.
    del model, vad_pipeline, result, vad_output
    torch.cuda.empty_cache()
    gc.collect()

    return {
        'segments': refined_segments,
        'output_string': output_string
    }

# Pass the device selected for the separation step (CUDA when available,
# CPU otherwise) instead of a hard-coded 'cuda' string, so transcription
# follows the same device choice as the rest of the notebook.
result = speech_to_text_with_vad(vocals_file_root, device)

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.
[00.35 ~ 02.48] You're welcome to watch anything you want on TV.
[03.08 ~ 03.78] TV sucks.
[04.54 ~ 07.79] I know you're upset right now, so I'll pretend you didn't say that.
[09.44 ~ 09.98] You little...
[09.98 ~ 10.04] Yow!
[12.45 ~ 13.38] You little...


# Split Audio into Segments and Export a CSV Index

In [None]:
import os
import pandas as pd
import librosa
import soundfile as sf
from pydub import AudioSegment

# Tag appended to every exported clip name and to the CSV index name.
suffix = 'M'

def split_audio_by_segments(audio_file, segments, output_dir):
    """Cut an audio file into one WAV clip per segment and write a CSV index.

    Clips are named "<audio stem>_<NNN>_<suffix>.wav" (NNN is the 1-based
    segment number) and the index is "<audio stem>_segments_<suffix>.csv",
    where `suffix` is the module-level constant.

    Args:
        audio_file: Path of the audio file to split.
        segments: Iterable of dicts with 'start' and 'end' (seconds) and
            'text'.
        output_dir: Directory receiving the clips and the CSV; created when
            missing.

    Returns:
        Path of the CSV file that was written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    base_filename = os.path.basename(audio_file)
    file_name_without_ext = os.path.splitext(base_filename)[0]

    try:
        audio = AudioSegment.from_file(audio_file)
        use_pydub = True
    except Exception:
        # pydub could not decode the file: fall back to librosa at the file's
        # native sample rate. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        audio_data, sample_rate = librosa.load(audio_file, sr=None)
        use_pydub = False

    csv_columns = ['filename', 'start_time', 'end_time', 'duration', 'text']
    csv_data = []

    for i, segment in enumerate(segments):
        start_time = segment['start']
        end_time = segment['end']
        text = segment['text']

        segment_filename = f"{file_name_without_ext}_{i+1:03d}_{suffix}.wav"
        segment_path = os.path.join(output_dir, segment_filename)

        if use_pydub:
            # pydub slices by milliseconds.
            start_ms = int(start_time * 1000)
            end_ms = int(end_time * 1000)
            segment_audio = audio[start_ms:end_ms]
            segment_audio.export(segment_path, format="wav")
        else:
            # The librosa array is sliced by sample index.
            start_sample = int(start_time * sample_rate)
            end_sample = int(end_time * sample_rate)
            segment_samples = audio_data[start_sample:end_sample]
            sf.write(segment_path, segment_samples, sample_rate)

        csv_data.append({
            'filename': segment_filename,
            'start_time': start_time,
            'end_time': end_time,
            'duration': end_time - start_time,
            'text': text
        })

    # Explicit columns keep the header row even when there are no segments.
    df = pd.DataFrame(csv_data, columns=csv_columns)
    csv_path = os.path.join(output_dir, f"{file_name_without_ext}_segments_{suffix}.csv")
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')

    return csv_path

# Folder (relative to the working directory) that receives clips and CSV.
output_directory = "vocals_segments"
csv_file_path = split_audio_by_segments(
    audio_file=vocals_file_root,
    segments=result["segments"],
    output_dir=output_directory,
)

print(f"Audio segments saved to: {output_directory}")
print(f"CSV file saved to: {csv_file_path}")

Audio segments saved to: vocals_segments
CSV file saved to: vocals_segments/vocals_segments_M.csv


In [None]:
import os
import zipfile

def zip_folder(folder_path, output_path):
    """Write every file under a folder into a deflate-compressed zip archive.

    Entries are stored relative to the folder's parent directory, so the
    archive contains the folder itself as its top-level entry.

    Args:
        folder_path: Directory to archive; a trailing path separator is
            ignored.
        output_path: Path of the zip file to create (overwritten if present).

    Returns:
        True once the archive has been written.
    """
    # normpath drops a trailing separator, so "dir" and "dir/" give the same
    # archive layout (dirname("dir/") would otherwise be "dir" itself).
    folder_path = os.path.normpath(folder_path)
    parent_dir = os.path.dirname(folder_path)
    archive_abspath = os.path.abspath(output_path)

    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive may live inside the folder being zipped;
                # never add the half-written archive to itself.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                arcname = os.path.relpath(file_path, parent_dir)
                zipf.write(file_path, arcname)
    return True

# Bundle the exported clips and their CSV index into a single archive.
folder_to_zip = "/content/vocals_segments"
zip_file_path = "/content/vocals_segments.zip"

zip_folder(folder_path=folder_to_zip, output_path=zip_file_path)

True