In [None]:
# ------------------------ Install Required Packages ------------------------------

# Notebook shell magics: each `!` line is run as a shell command, not as Python.
# Install Whisper straight from the GitHub repository (-q keeps pip quiet).
!pip install -q git+https://github.com/openai/whisper.git
# librosa and numpy are imported by the next cell; ffmpeg-python is not
# imported anywhere in the visible code.
!pip install -q ffmpeg-python librosa numpy
# System-level ffmpeg binary (-y answers the apt confirmation prompt).
# NOTE(review): presumably needed by Whisper for audio decoding - confirm against its README.
!sudo apt-get install -y ffmpeg

In [None]:
# ------------------------ TRANSCRIPTION MODEL (WHISPER AI) ------------------------------

#----------------- IMPORTS ------------------
import datetime

import librosa
import numpy as np
import whisper
from google.colab import files
from IPython.display import Audio, display

#----------------- FILE UPLOAD ------------------
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload result surfaces as a bare
    # StopIteration from next() below, which gives no hint of the cause.
    raise ValueError("No file was uploaded; re-run this cell and choose an audio file.")
file_path = next(iter(uploaded))  # Get the (first) uploaded file name
# A bare `Audio(...)` expression is only rendered when it is the last
# expression of a cell; this cell continues below, so show it explicitly.
display(Audio(file_path))  # Optional: play uploaded audio

#----------------- WHISPER AI MODEL ------------------
def transcribe_audio_with_context(file_path, model_size="base"):
    """Transcribe an audio file with a Whisper model.

    Args:
        file_path: Audio file path handed to ``model.transcribe``.
        model_size: Model name passed to ``whisper.load_model``
            (defaults to ``"base"``).

    Returns:
        A ``(segments, text)`` tuple taken from the ``"segments"`` and
        ``"text"`` entries of the transcription result.
    """
    # The model is loaded afresh on every call; verbose=True is forwarded
    # unchanged to Whisper's transcribe().
    transcription = whisper.load_model(model_size).transcribe(file_path, verbose=True)
    segment_list = transcription["segments"]
    full_text = transcription["text"]
    return segment_list, full_text

#----------------- AUDIO ANALYSIS ------------------
def analyze_audio(file_path):
    """Load an audio file with librosa and run beat tracking on it.

    Args:
        file_path: Audio file path handed to ``librosa.load``.

    Returns:
        A ``(y, sr, beats)`` tuple: the loaded samples, the sample rate
        reported by ``librosa.load``, and the beat positions returned by
        ``librosa.beat.beat_track``.
    """
    samples, sample_rate = librosa.load(file_path)
    strength = librosa.onset.onset_strength(y=samples, sr=sample_rate)
    # beat_track yields (tempo, beats); the tempo estimate is not used here.
    tracked = librosa.beat.beat_track(onset_envelope=strength, sr=sample_rate)
    _tempo, beat_positions = tracked
    return samples, sample_rate, beat_positions

#----------------- HELPER: TIMESTAMP FORMATTER ------------------
def format_timestamp(seconds):
    """Render a number of seconds as a ``timedelta`` string such as ``0:01:15``.

    The value is truncated to a whole number of seconds with ``int()``
    before formatting, so fractional seconds never appear in the result.
    """
    whole_seconds = int(seconds)
    elapsed = datetime.timedelta(seconds=whole_seconds)
    return str(elapsed)

#----------------- SEGMENT TAGGING WITH CONTEXT ------------------
def tag_segments_with_context(
    segments,
    y,
    sr,
    beats,
    *,
    chorus_energy=0.015,
    ost_energy=0.01,
    beat_window=1.0,
):
    """Turn transcription segments into timestamped, tagged transcript lines.

    Segments with text become ``"[start → end] text"``. Segments whose text
    is empty after stripping are classified from the audio instead: loud and
    starting near a beat gives ``[Chorus]``, moderately loud gives ``[OST]``,
    and anything quieter is dropped.

    Args:
        segments: Iterable of mappings with ``'start'``, ``'end'`` (seconds)
            and ``'text'`` keys.
        y: Audio samples, sliced as ``y[int(start * sr):int(end * sr)]``.
        sr: Sample rate used to convert seconds to sample indices and beat
            frames to times.
        beats: Beat positions passed to ``librosa.frames_to_time``.
        chorus_energy: Mean-square energy a text-less segment must exceed
            to be tagged ``[Chorus]``.
        ost_energy: Mean-square energy a text-less segment must exceed to
            be tagged ``[OST]``.
        beat_window: Maximum distance in seconds between the segment start
            and some beat for the segment to count as beat-aligned.

    Returns:
        The kept lines joined with newlines (an empty string if none).
    """
    beat_times = np.asarray(librosa.frames_to_time(beats, sr=sr))
    tagged = []

    for segment in segments:
        start = segment['start']
        end = segment['end']
        text = segment['text'].strip()
        timestamp = f"[{format_timestamp(start)} → {format_timestamp(end)}]"

        if text:
            tagged.append(f"{timestamp} {text}")
            continue

        # Energy is only needed for text-less segments, so compute it here
        # rather than for every segment.
        segment_audio = y[int(start * sr):int(end * sr)]
        # An empty slice (zero-length segment, or one past the end of the
        # audio) would make np.mean return NaN and emit a RuntimeWarning;
        # treat it as silence instead.
        energy = np.mean(np.square(segment_audio)) if len(segment_audio) else 0.0

        # Check for background music or OST based on energy and whether the
        # segment starts within `beat_window` seconds of any detected beat.
        if energy > chorus_energy and np.any(np.abs(beat_times - start) < beat_window):
            tagged.append(f"{timestamp} [Chorus]")
        elif energy > ost_energy:
            tagged.append(f"{timestamp} [OST]")
        # Otherwise: low-energy silence, skipped.

    return "\n".join(tagged)

#----------------- EXECUTION ------------------
# Transcribe the uploaded file; only the per-segment list is kept, the
# full-text string (second tuple element) is discarded.
segments, _ = transcribe_audio_with_context(file_path)
# Load the same file with librosa to get samples, sample rate and beats.
y, sr, beats = analyze_audio(file_path)
# Build the timestamped transcript, one line per kept segment.
final_transcription = tag_segments_with_context(segments, y, sr, beats)

#----------------- OUTPUT ------------------
print("\n Final Transcription with Context:\n")
print(final_transcription)
