In [12]:

# --- Profanity Censorship data preparation script ---

import pandas as pd 

# --- Load the CSV file ---
profanity_df = pd.read_csv("profanity.csv")

# --- Normalize columns ---
# Lower-case and trim both columns so the lookups against the lower-case
# severity labels below are not affected by casing or stray whitespace.
for _column in ("text", "severity_description"):
    profanity_df[_column] = profanity_df[_column].str.lower().str.strip()


# --- Create sets of swear words ---
def _words_with_severity(*levels):
    """Return the set of `text` values whose `severity_description` is one of `levels`."""
    matches = profanity_df["severity_description"].isin(list(levels))
    return set(profanity_df.loc[matches, "text"])


# Each set is a subset of the one above it: "low" censors every severity,
# "strong" censors only the entries labelled "severe".
low_swear_words = _words_with_severity("mild", "strong", "severe")
mid_swear_words = _words_with_severity("strong", "severe")
strong_swear_words = _words_with_severity("severe")

# --- Choose which level to censor ---
swear_words = low_swear_words  # Swap in mid_swear_words or strong_swear_words to censor fewer words




In [13]:

# --- Main Censorship Script ---

import os, re, whisper, subprocess, tempfile, shutil

# --- Config ---
video_path = r"sample_video.mp4"   # video that will be scanned and censored
bleep_path = r"bleep.mp3"          # audio clip laid over every censored word
model_size = "small"               # Whisper checkpoint: tiny, base, small, medium, large

# The result is written next to the input: same stem, "_censored.mp4" suffix.
_video_stem, _video_ext = os.path.splitext(video_path)
output_final = f"{_video_stem}_censored.mp4"

# --- Parameters ---
MAX_PASSES = 10   # upper bound on transcribe-and-censor rounds
PAD = 0.15        # seconds of extra muting on each side of a word (150 ms)
MIN_PROB = 0.8    # confidence threshold a word must exceed to be censored

# --- Load Whisper ---
print("Loading Whisper model...")
model = whisper.load_model(model_size)

# --- Functions: censor a single pass ---
def _find_profanity_intervals(segments):
    """Return one padded (start, end) span, in seconds, per confidently recognised swear word."""
    intervals = []
    for segment in segments:
        # Segments without word-level timing contribute nothing.
        for w in segment.get("words", []):
            clean = re.sub(r"[^a-z]", "", w["word"].lower())
            # Whisper reports per-word confidence under "probability". Reading only
            # "prob" always fell back to 1.0, which silently disabled MIN_PROB.
            # "prob" is still honoured in case another transcriber supplies it.
            confidence = w.get("probability", w.get("prob", 1.0))
            if clean in swear_words and confidence > MIN_PROB:
                start = max(0, w["start"] - PAD)
                end = w["end"] + PAD
                intervals.append((start, end))
                print(f"💢 Found profanity: '{clean}' {start:.2f}-{end:.2f}s")
    return intervals


def _merge_intervals(intervals):
    """Return the spans sorted by start time, with overlapping or touching spans fused."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            previous_start, previous_end = merged[-1]
            merged[-1] = (previous_start, max(previous_end, end))
        else:
            merged.append((start, end))
    return merged


def _build_filter_graph(intervals):
    """Build the FFmpeg filter graph that mutes `intervals` and overlays one bleep on each."""
    # The original track is silenced whenever t falls inside any interval.
    mute_expr = " + ".join(f"between(t,{s:.3f},{e:.3f})" for s, e in intervals)
    chains = [f"[0:a]volume=enable='{mute_expr}':volume=0[base]"]

    labels = []
    for i, (s, e) in enumerate(intervals):
        # round() rather than int(): float noise such as 3119.9999 must not
        # truncate to a delay that starts one millisecond early.
        delay_ms = round(s * 1000)
        # Trim the bleep to the span length, then shift it to the span start
        # (the delay is given once per channel).
        chains.append(
            f"[1:a]atrim=0:{e - s:.3f},adelay={delay_ms}|{delay_ms},volume=1[a{i}]"
        )
        labels.append(f"[a{i}]")

    # NOTE(review): amix scales its inputs by default, which can lower the overall
    # level of the result. Newer FFmpeg releases accept `normalize=0`; confirm the
    # minimum FFmpeg version in use before enabling it.
    chains.append(
        f"[base]{''.join(labels)}amix=inputs={len(intervals) + 1}:duration=longest[aout]"
    )
    return ";".join(chains)


def censor_pass(input_video, output_video):
    """Transcribe `input_video`, bleep every detected swear word, and write `output_video`.

    Words are matched against the module-level `swear_words` set after
    lower-casing and stripping every character outside a-z. A word is censored
    only when its confidence exceeds `MIN_PROB`. Each hit is widened by `PAD`
    seconds on both sides. Overlapping spans are merged, so adjacent hits get a
    single continuous bleep instead of two stacked ones.

    Args:
        input_video: Path of the video to scan.
        output_video: Path the (possibly censored) video is written to.

    Returns:
        The number of swear words found. When it is 0, `input_video` is copied
        to `output_video` unchanged and FFmpeg is not invoked.

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    result = model.transcribe(input_video, word_timestamps=True)
    hits = _find_profanity_intervals(result["segments"])

    if not hits:
        shutil.copy(input_video, output_video)
        return 0

    filter_str = _build_filter_graph(_merge_intervals(hits))

    cmd = [
        "ffmpeg", "-y",
        "-i", input_video,
        "-i", bleep_path,
        "-filter_complex", filter_str,
        "-map", "0:v", "-map", "[aout]",
        # The video stream is copied untouched; only the audio is re-encoded.
        "-c:v", "copy", "-c:a", "aac", output_video
    ]
    subprocess.run(cmd, check=True)
    # Report words found (not merged spans) so callers see the true hit count.
    return len(hits)

# --- Iterative loop ---
# Re-transcribe the censored result until a pass finds nothing left to bleep,
# or until MAX_PASSES rounds have run.
passes = 0
current_input = video_path

print("\nStarting iterative censorship...\n")

# The context manager removes the scratch directory even when a pass raises
# (for example FFmpeg exiting non-zero), so a failed run does not leave
# intermediate videos behind.
with tempfile.TemporaryDirectory() as tmp_dir:
    while passes < MAX_PASSES:
        passes += 1
        temp_output = os.path.join(tmp_dir, f"pass_{passes}.mp4")
        print(f"\n🔁 Pass {passes}: scanning & censoring...")
        hits = censor_pass(current_input, temp_output)

        if hits == 0:
            print(f"\n✅ Video is clean after {passes} pass(es)!")
            # current_input is the file that was just verified clean: the original
            # video on the first pass, otherwise the previous pass's output.
            shutil.copy(current_input, output_final)
            break
        else:
            print(f"{hits} profanities censored; re-checking...")
            current_input = temp_output
    else:
        # The loop ended without a clean pass: keep the most censored version.
        print(f"\n Max passes ({MAX_PASSES}) reached; stopping.")
        shutil.copy(current_input, output_final)

print(f"\nFinal clean video saved to:\n{output_final}")


Loading Whisper model...

Starting iterative censorship...


🔁 Pass 1: scanning & censoring...




💢 Found profanity: 'motherfucking' 3.27-4.35s
💢 Found profanity: 'motherfucking' 5.37-6.55s
💢 Found profanity: 'fucking' 10.55-11.09s
3 profanities censored; re-checking...

🔁 Pass 2: scanning & censoring...

✅ Video is clean after 2 pass(es)!

Final clean video saved to:
SampleVideo_censored.mp4
