In [21]:

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

!pip install -q transformers datasets huggingface_hub torch torchvision torchaudio --upgrade
!pip install -q kagglehub
!pip install -q opencv-python-headless
!pip install -q ultralytics
!pip install -q better-profanity

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m563.4/563.4 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m888.1/888.1 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m7

In [37]:
import os
import torch
import pandas as pd
import zipfile
import kagglehub
import cv2
import numpy as np
from better_profanity import profanity
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from ultralytics import YOLO
from IPython.display import Video

# Check for GPU
if torch.cuda.is_available():
    print("GPU available:", torch.cuda.get_device_name(0))
    device = "cuda"
else:
    print("No GPU found, running on CPU.")
    device = "cpu"


GPU available: Tesla T4


In [36]:
rwf_path = kagglehub.dataset_download("vulamnguyen/rwf2000")

jigsaw_path = "/kaggle/input/jigsaw-toxic-comment-classification-challenge"
jigsaw_df = pd.DataFrame()
try:
    train_zip_path = os.path.join(jigsaw_path, "train.csv.zip")
    extract_dir = "/kaggle/working/jigsaw"
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(train_zip_path, 'r') as z:
        z.extractall(extract_dir)
    jigsaw_df = pd.read_csv(os.path.join(extract_dir, "train.csv"))
    print(f"Jigsaw dataset loaded with {len(jigsaw_df)} comments.")
except Exception as e:
    print(f"Could not load Jigsaw dataset. Error: {e}")

profanity_dataset_path = kagglehub.dataset_download("tushifire/ldnoobw")

✅ Jigsaw dataset loaded with 159571 comments.


In [35]:
text_model_id = "unitary/toxic-bert"
text_tokenizer = AutoTokenizer.from_pretrained(text_model_id)
text_model = AutoModelForSequenceClassification.from_pretrained(text_model_id)
text_pipeline = pipeline(
    "text-classification",
    model=text_model,
    tokenizer=text_tokenizer,
    truncation=True,
    device=0 if torch.cuda.is_available() else -1
)

detection_model = YOLO('yolov8n.pt')
detection_model.to(device)

Device set to use cuda:0


YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

In [34]:
profanity_file_path = os.path.join(profanity_dataset_path, 'en')

try:
    with open(profanity_file_path, 'r') as f:
        bad_words_list = [line.strip() for line in f.readlines()]
    
    profanity.load_censor_words(bad_words_list)
    print(f"Loaded {len(bad_words_list)} profane words into the sanitizer.")

except Exception as e:
    print(f"Error loading profanity list: {e}. Using default list.")
    profanity.load_censor_words()

def sanitize_text(text):
    return profanity.censor(text)

def detect_toxicity(text):
    return text_pipeline(text)

✅ Loaded 403 profane words into the sanitizer.


In [33]:
if not jigsaw_df.empty:
    toxic_samples = jigsaw_df[jigsaw_df['toxic'] == 1].head(3)
    for index, row in toxic_samples.iterrows():
        comment = row['comment_text']
        toxicity_result = detect_toxicity(comment)
        sanitized_comment = sanitize_text(comment)
        
        print(comment)
        print(toxicity_result)
        print(sanitized_comment)
        print("\n") 
else:
    print("Jigsaw DataFrame is empty, skipping text moderation test.")
    sample_text = "This is some fucking bullshit, you are a stupid asshole."
    print(sample_text)
    print(detect_toxicity(sample_text))
    print(sanitize_text(sample_text))

COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
[{'label': 'toxic', 'score': 0.9980192184448242}]
COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK


Hey... what is it..
@ | talk .
What is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?

[{'label': 'toxic', 'score': 0.28589802980422974}]
Hey... what is it..
@ | talk .
What is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?



Bye! 

Don't look, come or think of comming back! Tosser.
[{'label': 'toxic', 'score': 0.8144686222076416}]
Bye! 

Don't look, come or think of comming back! ****.




In [31]:
def censor_fight_scene_individual(input_path, output_path, padding_ratio=0.2,
                                    pixel_size=20, overlay_alpha=0.5):

    if os.path.exists(output_path):
        os.remove(output_path)

    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        print(f"Error: Could not open video at {input_path}")
        return

    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        results = detection_model(frame, classes=[0], verbose=False)
        boxes = [
            [int(i) for i in box.xyxy[0]]
            for result in results for box in result.boxes
        ]

        for box in boxes:
            x1, y1, x2, y2 = box

            box_w, box_h = x2 - x1, y2 - y1
            pad_w, pad_h = int(box_w * padding_ratio), int(box_h * padding_ratio)

            final_x1 = max(0, x1 - pad_w)
            final_y1 = max(0, y1 - pad_h)
            final_x2 = min(width, x2 + pad_w)
            final_y2 = min(height, y2 + pad_h)

            roi = frame[final_y1:final_y2, final_x1:final_x2]
            if roi.size == 0:
                continue

            h, w, _ = roi.shape
            temp = cv2.resize(roi, (pixel_size, pixel_size), interpolation=cv2.INTER_LINEAR)
            censored_roi = cv2.resize(temp, (w, h), interpolation=cv2.INTER_NEAREST)

            if overlay_alpha > 0:
                overlay = np.zeros_like(censored_roi)
                censored_roi = cv2.addWeighted(censored_roi, 1 - overlay_alpha, overlay, overlay_alpha, 0)

            frame[final_y1:final_y2, final_x1:final_x2] = censored_roi

        out.write(frame)

    print(f"Video processing complete. Output saved to {output_path}")
    cap.release()
    out.release()
    return output_path

In [32]:

fight_video_path = "/kaggle/input/rwf2000/RWF-2000/val/Fight/0Ow4cotKOuw_0.avi"
output_video_path_individual = "/kaggle/working/fight_censored1.mp4"

censor_fight_scene_individual(
    fight_video_path,
    output_video_path_individual,
    padding_ratio=0.25,     
    pixel_size=20,
    overlay_alpha=0.6
)

Video(output_video_path_individual, embed=True, width=640)

Video processing complete. Output saved to /kaggle/working/fight_censored1.mp4
