In [None]:
!pip install torch
!pip install pydub
!pip install openai-whisper

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━

In [None]:
import os
import whisper
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from pydub.generators import Sine
import torch

# Step 1: Transcribe video using Whisper
def transcribe_video(video_file):
    """Transcribe a media file with Whisper's "base" checkpoint.

    Args:
        video_file: Path of the file handed to the Whisper model's
            ``transcribe`` method.

    Returns:
        A ``(text, segments)`` tuple read from the ``"text"`` and
        ``"segments"`` entries of the transcription result.
    """
    print("Loading Whisper model...")
    whisper_model = whisper.load_model("base")

    print("Transcribing video...")
    transcription = whisper_model.transcribe(video_file)

    full_text = transcription["text"]
    timed_segments = transcription["segments"]
    return full_text, timed_segments

# Step 2: Detect profanity using RoBERTa
def detect_profanity(text, tokenizer, model, threshold=0.6):
    """Return the words of ``text`` that the classifier flags as profane.

    The text is split on whitespace and every word is classified on its own,
    so punctuation stays attached to the word (e.g. ``"dammit!"``). A word is
    flagged when the softmax probability of class index 1 exceeds
    ``threshold`` (index 1 is assumed to be the "offensive" label of the
    model passed in -- confirm against the model card when swapping models).

    Args:
        text: Text to scan.
        tokenizer: Callable invoked as
            ``tokenizer(word, return_tensors="pt", truncation=True)``; its
            result is unpacked as keyword arguments into ``model``.
        model: Callable whose result exposes ``.logits`` with the class
            scores along dimension 1.
        threshold: Minimum class-1 probability (exclusive) for a word to be
            flagged. Defaults to 0.6.

    Returns:
        list[str]: Flagged words in order of appearance, duplicates kept.
    """
    # Cache the verdict per distinct word so repeated words cost one forward pass.
    verdict_by_word = {}
    profane_words = []

    # Inference only: no_grad stops autograd from recording a graph for every word.
    with torch.no_grad():
        for word in text.split():
            if word not in verdict_by_word:
                inputs = tokenizer(word, return_tensors="pt", truncation=True)
                outputs = model(**inputs)
                scores = torch.softmax(outputs.logits, dim=1).tolist()[0]
                verdict_by_word[word] = scores[1] > threshold
            if verdict_by_word[word]:
                profane_words.append(word)

    return profane_words

# Step 3: Generate beep sound
def generate_beep(duration, frequency=1000):
    """Build a sine-wave tone to use as a censor beep.

    Args:
        duration: Tone length in seconds; it is multiplied by 1000 before
            being passed on as the ``duration`` of the audio segment.
        frequency: Value handed to pydub's ``Sine`` generator (default 1000).

    Returns:
        The result of ``Sine(frequency).to_audio_segment(...)``.
    """
    tone = Sine(frequency)
    duration_ms = duration * 1000
    return tone.to_audio_segment(duration=duration_ms)

# Step 4: Add beep to profane word timestamps
def add_beep_to_audio(audio_file, segments, profane_words, output_audio_file, buffer_ms=200):
    """Replace flagged words in an audio file with beeps and export the result as WAV.

    Only segment-level timestamps are used, so each word's position is an
    estimate: the segment's time span is divided evenly among its
    whitespace-separated words. A word is beeped when it appears in
    ``profane_words`` exactly as written in the segment text (punctuation
    included).

    Args:
        audio_file: Path of the audio file to load.
        segments: Iterable of mappings with ``"start"`` and ``"end"``
            (seconds) and ``"text"`` entries.
        profane_words: Collection of words (hashable strings) to beep out.
        output_audio_file: Destination path for the exported WAV file.
        buffer_ms: Extra milliseconds beeped on each side of a word's
            estimated span.
    """
    print("Adding beeps to audio...")
    audio = AudioSegment.from_file(audio_file)
    flagged = set(profane_words)  # constant-time membership checks

    for segment in segments:
        words = segment["text"].split()
        if not words:
            # Empty/whitespace-only segment: nothing to beep, and dividing
            # the span by zero words would raise ZeroDivisionError.
            continue

        segment_start_time = segment["start"] * 1000
        segment_end_time = segment["end"] * 1000
        word_duration = (segment_end_time - segment_start_time) / len(words)

        word_start_time = segment_start_time
        for word in words:
            word_end_time = word_start_time + word_duration
            if word in flagged:
                # Widen the window by the buffer, clamped to the audio bounds.
                beep_start = max(0, word_start_time - buffer_ms)
                beep_end = min(len(audio), word_end_time + buffer_ms)

                # Slice indices are whole milliseconds.
                cut_start = int(beep_start)
                cut_end = int(beep_end)

                # Skip windows that fall entirely past the end of the audio;
                # splicing them would duplicate audio instead of replacing it.
                if cut_end > cut_start:
                    # Size the beep from the span actually removed (after
                    # clamping and truncation) so the track keeps its length
                    # and later timestamps stay aligned.
                    beep = generate_beep(duration=(cut_end - cut_start) / 1000)

                    print(f"Beeping word: {word} from {beep_start}ms to {beep_end}ms")
                    audio = audio[:cut_start] + beep + audio[cut_end:]

            word_start_time = word_end_time

    audio.export(output_audio_file, format="wav")
    print(f"Modified audio saved to {output_audio_file}")


# Main function
def process_video(video_file):
    """Transcribe a video, detect profane words, and write a beeped copy of its audio.

    Steps: transcribe ``video_file``, load the
    ``cardiffnlp/twitter-roberta-base-offensive`` tokenizer and model, flag
    profane words in the transcript, extract the video's audio to
    ``temp_audio.wav``, and write the censored audio to
    ``modified_audio_with_beeps.wav``. The temporary WAV file is removed
    afterwards, even if a later step fails.

    Args:
        video_file: Path of the video to process.

    Raises:
        ValueError: If the video has no audio track.
    """
    transcription_text, segments = transcribe_video(video_file)
    print("Loading profanity detection model...")
    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-offensive")
    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-offensive")
    print("Detecting profane words...")
    profane_words = detect_profanity(transcription_text, tokenizer, model)
    print(f"Profane words detected: {profane_words}")

    audio_file = "temp_audio.wav"
    modified_audio_file = "modified_audio_with_beeps.wav"
    video = VideoFileClip(video_file)
    try:
        try:
            if video.audio is None:
                raise ValueError(f"{video_file} has no audio track to process")
            video.audio.write_audiofile(audio_file, codec="pcm_s16le")
        finally:
            # Release the clip's file readers as soon as the audio is extracted.
            video.close()

        add_beep_to_audio(audio_file, segments, profane_words, modified_audio_file)
    finally:
        # Clean up the temporary WAV even when extraction or beeping fails.
        if os.path.exists(audio_file):
            os.remove(audio_file)

# Run the process
if __name__ == "__main__":
    # Path of the input video; replace with your own file.
    video_file = "RAM.mp4"
    process_video(video_file)


Loading Whisper model...


  checkpoint = torch.load(fp, map_location=device)




Transcribing video...
Loading profanity detection model...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Detecting profane words...
Profane words detected: ['fucking', 'assholes!', 'Goddammit!', 'Goddammit!', 'dumb.', 'fucking', 'motherfucker?', 'fucking', 'Dammit.']
MoviePy - Writing audio in temp_audio.wav




MoviePy - Done.
Adding beeps to audio...
Beeping word: fucking from 25660.0ms to 26270.0ms
Beeping word: assholes! from 31586.666666666668ms to 32440.0ms
Beeping word: Goddammit! from 42520.0ms to 43320.0ms
Beeping word: Goddammit! from 44786.66666666667ms to 45520.00000000001ms
Beeping word: dumb. from 129760.0ms to 130440.0ms
Beeping word: fucking from 144160.0ms to 144840.0ms
Beeping word: motherfucker? from 148968.0ms to 149680.0ms
Beeping word: fucking from 156325.71428571432ms to 156937.1428571429ms
Beeping word: Dammit. from 168680.0ms to 169387ms
Modified audio saved to modified_audio_with_beeps.wav
