# New section

In [None]:
# Install the third-party packages imported by the cells below (notebook shell command).
!pip install torch transformers pandas scikit-learn youtube-transcript-api beautifulsoup4 requests

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Positive class: every non-null transcript in the harmful corpus gets label 1.
df = pd.read_csv("/content/Harmful.csv")[["transcript"]].dropna().assign(label=1)

# Negative class: ten generic benign sentences, each repeated 1000 times,
# all labelled 0.
# NOTE(review): these negatives are exact duplicates, so the same sentences
# land on both sides of the train/test split below -- held-out numbers for
# the negative class will look better than they are. Confirm this is intended.
safe_text = 1000 * [
    "this video discusses current events",
    "general discussion and opinions",
    "educational tutorial video",
    "news analysis and reporting",
    "public awareness content",
    "interview and discussion",
    "informative session for viewers",
    "general knowledge video",
    "technology explanation",
    "social media discussion",
]
safe_df = pd.DataFrame({"transcript": safe_text, "label": 0})

# Stack positives and negatives into one frame with a fresh 0..n-1 index.
df = pd.concat([df, safe_df], ignore_index=True)

# 80/20 split, stratified on the label so both splits keep the class ratio;
# the fixed seed makes the split reproducible.
all_texts = df["transcript"].tolist()
all_labels = df["label"].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are tokenized with the same settings: inputs longer than 256
# tokens are truncated, and padding is enabled.
tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 256}
train_enc = tokenizer(train_texts, **tokenize_kwargs)
test_enc = tokenizer(test_texts, **tokenize_kwargs)

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping of field name to a per-example sequence (anything
            exposing ``.items()`` whose values can be indexed by example).
        labels: Sequence of labels, one per example; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor, and the example's
        label is added under the ``"labels"`` key.
        """
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so they can be indexed example by example.
train_ds, test_ds = Dataset(train_enc, train_labels), Dataset(test_enc, test_labels)

# Pretrained BERT encoder with a two-class classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Three epochs, batches of 8 per device, a log entry every 100 steps;
# Trainer output goes to ./bert_model.
args = TrainingArguments(
    output_dir="bert_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
)

# NOTE(review): test_ds is registered as the eval set, but nothing in this
# cell calls trainer.evaluate() -- confirm whether evaluation is expected.
trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=test_ds)
trainer.train()

# Persist the tokenizer and the fine-tuned weights side by side so the
# inference cell can reload both from "bert_model".
tokenizer.save_pretrained("bert_model")
model.save_pretrained("bert_model")


In [None]:
import torch
import requests
from transformers import BertTokenizer, BertForSequenceClassification
from youtube_transcript_api import YouTubeTranscriptApi
from bs4 import BeautifulSoup

# Reload the fine-tuned classifier and its tokenizer from the local
# "bert_model" directory and switch the model to evaluation mode.
model = BertForSequenceClassification.from_pretrained("bert_model")
tokenizer = BertTokenizer.from_pretrained("bert_model")
model.eval()

# Word list: one entry per line, whitespace-trimmed and lower-cased; blank
# lines are skipped and undecodable bytes are ignored.
with open("/content/en.txt", encoding="utf-8", errors="ignore") as f:
    bad_words = [term.lower() for term in map(str.strip, f) if term]

def extract_video_id(url):
    """Return the video id embedded in a YouTube URL.

    Supported shapes:
      * ``https://youtu.be/<id>``
      * any URL carrying a ``v=<id>`` query parameter (``/watch?v=<id>``)
      * ``/live/<id>``, ``/shorts/<id>``, ``/embed/<id>`` and ``/v/<id>`` paths

    The URL is parsed structurally rather than by substring search, so other
    query parameters that merely end in ``v=`` (e.g. ``rev=``), URL
    fragments (``#t=30``) and trailing slashes no longer leak into the result.

    Args:
        url: The URL as typed by the user; surrounding whitespace and a
            missing scheme are tolerated.

    Returns:
        The video id as a non-empty string.

    Raises:
        ValueError: If no video id can be found in ``url``.
    """
    from urllib.parse import parse_qs, urlparse

    candidate = url.strip()
    # Without a scheme urlparse() treats the host as part of the path.
    if "://" not in candidate:
        candidate = "https://" + candidate

    parsed = urlparse(candidate)
    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    video_id = ""
    if host == "youtu.be" or host.endswith(".youtu.be"):
        # Short links carry the id as the first path segment.
        video_id = segments[0] if segments else ""
    else:
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            video_id = query_ids[0]
        elif len(segments) >= 2 and segments[0] in ("live", "shorts", "embed", "v"):
            video_id = segments[1]

    if not video_id:
        raise ValueError("Invalid YouTube URL")
    return video_id

def get_text(video_id):
    """Fetch text describing a YouTube video, preferring its transcript.

    The transcript is tried first. If it cannot be retrieved for any reason
    (disabled captions, network error, ...), the watch page is downloaded and
    its ``<title>`` plus ``<meta name="description">`` content are used
    instead.

    Args:
        video_id: The YouTube video id.

    Returns:
        A ``(text, source)`` tuple where ``source`` is ``"Transcript"`` or
        ``"Title + Description"``.

    Raises:
        requests.RequestException: If the fallback page download fails or
            does not answer within the timeout.
    """
    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            # Newer youtube-transcript-api releases: instance API yielding
            # snippet objects with a .text attribute.
            pieces = [snippet.text for snippet in YouTubeTranscriptApi().fetch(video_id)]
        else:
            # Older releases: static helper returning a list of dicts.
            pieces = [seg["text"] for seg in YouTubeTranscriptApi.get_transcript(video_id)]
        return " ".join(pieces), "Transcript"
    except Exception:
        # Deliberate best-effort: any transcript failure falls through to the
        # page-metadata fallback. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

    # Let requests URL-encode the id, and bound the wait so a stalled
    # connection cannot hang the notebook forever.
    response = requests.get(
        "https://www.youtube.com/watch",
        params={"v": video_id},
        timeout=10,
    )
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.get_text() if soup.title else ""
    meta = soup.find("meta", {"name": "description"})
    # .get() tolerates a description tag that has no content attribute.
    description = meta.get("content", "") if meta else ""
    return title + " " + description, "Title + Description"

def find_bad_words(text):
    """Return the entries of the module-level ``bad_words`` list found in ``text``.

    Matching is case-insensitive and requires the entry to appear as a whole
    word or phrase: it must not be directly preceded or followed by another
    word character. This avoids counting innocent words that merely contain a
    listed term (e.g. a short entry inside "class" or "hello").

    Args:
        text: The text to scan.

    Returns:
        A list of matched entries without duplicates, in word-list order, so
        the result is stable from run to run.
    """
    import re

    lowered = text.lower()
    hits = []
    # dict.fromkeys() drops duplicate entries while keeping file order.
    for word in dict.fromkeys(bad_words):
        # Cheap substring test first; the regex only confirms the boundaries.
        if word and word in lowered and re.search(
            rf"(?<!\w){re.escape(word)}(?!\w)", lowered
        ):
            hits.append(word)
    return hits

# Ask for a video, resolve its id, and pull the text to score.
url = input("Enter YouTube video URL: ")
video_id = extract_video_id(url)
text, source = get_text(video_id)

# The classifier sees at most 256 tokens; anything beyond that is truncated.
inputs = tokenizer(text, truncation=True, max_length=256, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(**inputs)
    probs = torch.softmax(output.logits, dim=1)
# Probability assigned to class index 1 (the label given to harmful rows
# during training).
bert_score = probs[0][1].item()

hits = find_bad_words(text)

# Each word-list hit adds 0.04 on top of the model score, capped at 1.0.
severity = min(bert_score + 0.04 * len(hits), 1.0)

# Below 0.3 -> not harmful, below 0.6 -> moderate, otherwise harmful.
label = (
    "NOT HARMFUL" if severity < 0.3
    else "MODERATE" if severity < 0.6
    else "HARMFUL"
)

print("\n========== RESULT ==========")
print(f"Text Source: {source}")
print(f"Final Classification: {label}")
print(f"Severity Score: {round(severity, 2)}")
# Only the first 15 matched words are shown.
print(f"Detected Harmful Words: {hits[:15]}")


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files can be written under it.
drive.mount('/content/drive')


In [None]:
# Directory on the mounted Drive used for saving and reloading the model and tokenizer.
SAVE_PATH = "/content/drive/MyDrive/youtube_harmful_content_filtering/bert_model"


In [None]:
import os
# Create the target directory (and any missing parents); no error if it already exists.
os.makedirs(SAVE_PATH, exist_ok=True)


In [None]:
# Write the in-memory model and tokenizer to SAVE_PATH.
# Relies on `model` and `tokenizer` from an earlier cell still being defined.
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the tokenizer and classifier back from SAVE_PATH, rebinding the
# notebook-level `tokenizer` and `model` names.
tokenizer = BertTokenizer.from_pretrained(SAVE_PATH)
model = BertForSequenceClassification.from_pretrained(SAVE_PATH)
