## Install dependencies

In [None]:
!pip install datasets transformers torch pandas tqdm


## Load dataset

In [None]:
from datasets import load_dataset

# Fetch the YouTube comments dataset from Hugging Face
data = load_dataset("AmaanP314/youtube-comment-sentiment")

# Printing the loaded object lists the available splits
print(data)

# Materialise the train split as a pandas DataFrame
train_split = data['train']
df = train_split.to_pandas()

# Quick summary: row count and column names, then a preview of the first rows
print(f" Dataset loaded: {len(df)} comments")
print(f"Columns: {list(df.columns)}")
df.head()


## Choose text column


In [None]:
# Candidate names for the comment column, tried in priority order
preferred_cols = ('text', 'comment', 'CommentText')
matched_col = next((name for name in preferred_cols if name in df.columns), None)

if matched_col is not None:
    # A known column name exists: use it, coercing every value to str
    comments = df[matched_col].astype(str).tolist()
else:
    # None of the known names exist: report what is available and fall back
    # to the first column of the frame
    print("Available columns:", df.columns.tolist())
    text_col = df.columns[0]  # fallback
    comments = df[text_col].astype(str).tolist()
    print(f" Using column: {text_col}")


## Load model & tokenizer

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1 (CPU)
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print(f" Using device: {'cuda' if device == 0 else 'cpu'}")

# Pretrained toxicity detection model and its matching tokenizer
model_path = "martin-ha/toxic-comment-model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Options forwarded to the pipeline constructor
pipeline_options = dict(
    batch_size=32,      # increase if you have >12GB GPU
    truncation=True,
    padding=True,
)

# Assemble the classification pipeline from the pieces above
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=device,
    **pipeline_options
)


## Run inference with checkpointing

In [None]:
import os, pickle
from tqdm import tqdm

# Classify every entry of `comments` in chunks, persisting progress after each
# chunk so an interrupted run can resume instead of starting over.

checkpoint_file = "toxicity_progress.pkl"
chunk_size = 1000  # number of comments sent to the pipeline between checkpoints
results = []
start_idx = 0

# Resume if checkpoint exists.
# NOTE(security): pickle.load can execute arbitrary code. Only load a checkpoint
# file that this notebook wrote itself; never one obtained from elsewhere.
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'rb') as f:
        checkpoint = pickle.load(f)

    # Refuse to resume against a different dataset: the saved results would be
    # attached to the wrong rows.
    saved_total = checkpoint.get('total_comments')
    if saved_total is not None and saved_total != len(comments):
        raise ValueError(
            f"Checkpoint {checkpoint_file!r} was written for {saved_total} comments "
            f"but {len(comments)} are loaded now; delete the checkpoint to start over."
        )

    results = checkpoint['results']
    # Checkpoints written by earlier versions stored i + 1000 even for the final
    # partial chunk, so clamp the saved index to the data length.
    start_idx = min(checkpoint['last_index'], len(comments))

    # One result is stored per processed comment, so the counts must agree.
    if len(results) != start_idx:
        raise ValueError(
            f"Checkpoint {checkpoint_file!r} is inconsistent: {len(results)} results "
            f"saved but resume index is {start_idx}; delete the checkpoint to start over."
        )
    print(f"📁 Resuming from index {start_idx} ({len(results)} results already processed)")

# Process comments in batches
for i in tqdm(range(start_idx, len(comments), chunk_size), desc="Processing batches"):
    batch = comments[i:i + chunk_size]
    batch_results = pipeline(batch)
    results.extend(batch_results)

    # Index of the first comment that has NOT been processed yet
    next_idx = i + len(batch)

    # Save checkpoint
    checkpoint = {
        'results': results,
        'last_index': next_idx,
        'total_comments': len(comments)
    }
    # Write to a temporary file and rename it over the real one, so an
    # interruption mid-write cannot corrupt the only existing checkpoint.
    tmp_checkpoint_file = checkpoint_file + ".tmp"
    with open(tmp_checkpoint_file, 'wb') as f:
        pickle.dump(checkpoint, f)
    os.replace(tmp_checkpoint_file, checkpoint_file)

    print(f" Checkpoint saved at index {next_idx}")

print(f" Processing complete! Total results: {len(results)}")


## Save labeled dataset

In [None]:
from datetime import datetime

# Split each prediction dict into parallel label / score lists
labels, scores = [], []
for prediction in results:
    labels.append(prediction["label"])
    scores.append(prediction["score"])

# Attach the predictions to the DataFrame as two new columns
df["ToxicLabel"] = labels
df["ToxicScore"] = scores

# Write the labeled frame to a CSV whose name carries the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"youtube_comments_with_toxicity_{timestamp}.csv"
df.to_csv(output_file, index=False)

print(f" Labeled dataset saved as: {output_file}")


## Download CSV

In [None]:
# Trigger a browser download of the CSV when running inside Google Colab.
# The google.colab package is only importable there; anywhere else the import
# fails, so report where the file is instead of aborting the notebook run.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; the CSV is available locally at: {output_file}")
else:
    files.download(output_file)


## Check the downloaded file 


In [2]:
import pandas as pd, os, glob, json, math
from pathlib import Path
from datetime import datetime, timezone


def latest_by_filename(paths):
    """Return the path whose file name sorts last, or None if `paths` is empty.

    Only the final path component is compared, so files found in different
    directories are ordered by the timestamp embedded in their names rather
    than by their directory prefix.
    """
    candidates = [Path(p) for p in paths]
    if not candidates:
        return None
    return max(candidates, key=lambda p: p.name)


# Target file (as you specified)
explicit_path = Path('Notebooks/youtube_comments_with_toxicity_20250914_061551.csv')

# Fallback search pattern(s)
patterns = [
    'Notebooks/youtube_comments_with_toxicity_*.csv',
    'youtube_comments_with_toxicity_*.csv'
]

candidate_file = None
if explicit_path.exists():
    candidate_file = explicit_path
else:
    matched = []
    for pat in patterns:
        matched.extend(glob.glob(pat))
    # Compare by file name only: sorting the full path strings would rank every
    # file in the current directory above every file under Notebooks/.
    candidate_file = latest_by_filename(matched)

if candidate_file is None:
    print('❌ Could not find any toxicity output CSV. Check the path or rerun inference cell.')
else:
    print(f'📄 Using file: {candidate_file}')
    size_mb = candidate_file.stat().st_size / (1024*1024)
    print(f'   Size: {size_mb:.2f} MB')

    df_inspect = pd.read_csv(candidate_file)
    print('\n=== Shape ===')
    print(df_inspect.shape)

    print('\n=== Columns ===')
    print(list(df_inspect.columns))

    print('\n=== Dtypes ===')
    print(df_inspect.dtypes)

    print('\n=== Head (5) ===')
    display(df_inspect.head())

    # A random sample is only shown when there are more than 10 rows
    if len(df_inspect) > 10:
        print('\n=== Random Sample (5) ===')
        display(df_inspect.sample(5, random_state=42))

    # Basic stats for score column
    if 'ToxicScore' in df_inspect.columns:
        print('\n=== ToxicScore Stats ===')
        print(df_inspect['ToxicScore'].describe())

    # Label distribution
    if 'ToxicLabel' in df_inspect.columns:
        print('\n=== ToxicLabel Distribution ===')
        print(df_inspect['ToxicLabel'].value_counts())
        print('\nPercentages:')
        print((df_inspect['ToxicLabel'].value_counts(normalize=True) * 100).round(2).astype(str) + '%')

    # Null counts
    print('\n=== Null Values Per Column ===')
    print(df_inspect.isna().sum())

    # Save a lightweight JSON summary
    summary = {
        'file': str(candidate_file),
        'rows': int(df_inspect.shape[0]),
        'cols': int(df_inspect.shape[1]),
        'columns': list(df_inspect.columns),
        'label_counts': df_inspect['ToxicLabel'].value_counts().to_dict() if 'ToxicLabel' in df_inspect.columns else None,
        'score_stats': df_inspect['ToxicScore'].describe().to_dict() if 'ToxicScore' in df_inspect.columns else None,
        # datetime.utcnow() is deprecated; take an aware UTC time and keep the
        # trailing 'Z' form for the timestamp.
        'generated_at': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    }
    with open('toxicity_dataset_summary.json', 'w') as f:
        json.dump(summary, f, indent=2)
    print('\n📝 Wrote summary -> toxicity_dataset_summary.json')

📄 Using file: youtube_comments_with_toxicity_20250914_061551.csv
   Size: 310.80 MB

=== Shape ===
(1032225, 14)

=== Columns ===
['CommentID', 'VideoID', 'VideoTitle', 'AuthorName', 'AuthorChannelID', 'CommentText', 'Sentiment', 'Likes', 'Replies', 'PublishedAt', 'CountryCode', 'CategoryID', 'ToxicLabel', 'ToxicScore']

=== Dtypes ===
CommentID           object
VideoID             object
VideoTitle          object
AuthorName          object
AuthorChannelID     object
CommentText         object
Sentiment           object
Likes                int64
Replies              int64
PublishedAt         object
CountryCode         object
CategoryID           int64
ToxicLabel          object
ToxicScore         float64
dtype: object

=== Head (5) ===


Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID,ToxicLabel,ToxicScore
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1,non-toxic,0.998745
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17,non-toxic,0.996063
2,UgxB0jh2Ur41mcXr5IB4AaABAg,papg2tsoFzg,Welcome to Javascript Course,@Abdulla-ip8qr,UCWBK35w5Swy1iF5xIbEyw3A,waiting next video will be?,Neutral,1,0,2020-07-06 13:18:16,IN,27,non-toxic,0.997976
3,UgwMOh95MfK0GuXLLrF4AaABAg,31KTdfRH6nY,Building web applications in Java with Spring ...,@finnianthehuman,UCwQ2Z03nOcMxWozBb_Cv66w,Thanks for the great video.\n\nI don't underst...,Neutral,0,1,2024-09-18 12:04:12,US,27,non-toxic,0.999134
4,UgxJuUe5ysG8OSbABAl4AaABAg,-hV6aeyPHPA,After a new engine her car dies on her way hom...,@ryoutubeplaylistb6137,UCTTcJ0tsAKQokmHB2qVb1qQ,Good person helping good people.\nThis is how ...,Positive,3,1,2025-01-10 19:39:03,US,2,non-toxic,0.998069



=== Random Sample (5) ===


Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID,ToxicLabel,ToxicScore
986142,UgwiO7VtWRufcVOG1QF4AaABAg_aug,DplxIq0mc_Y,C pointers explained👉,AugmentedUser,AugmentedCID,This video was quite hard to follow and unders...,Negative,0,0,2025-02-02 11:53:15,CA,27,non-toxic,0.99889
865071,UgxYhC49FiRv9aY25Ih4AaABAg,p5P6nlQUt1U,SHAPIRO: 5 Reasons Why CNN Is Fake News,@Patricia-f4j,UCvXQidulAHfALeF3Qy3SrVg,Love your show ❤️ 🤍💙 and appreciate your coura...,Positive,0,0,2024-02-15 20:56:21,US,25,non-toxic,0.99868
991085,UgyNHq3yG5DBx3csprV4AaABAg,txKBWtvV99Y,3 Mini Python Projects - For Intermediates,@ΧρήστοςΨυρούκης,UC-ol9fM3UsDwdu9BT9OPdOQ,what color theme do you use \nPlease tell me,Neutral,0,0,2022-03-25 16:04:30,US,27,non-toxic,0.998435
105074,UgwSB6eAK9Qv_bOnLqV4AaABAg,uwNhtP6svxI,Mugshot Comes Back To Haunt Congressman Matt G...,@missbrown2041,UCbvvZm9DMqvLeZz0oCZ2FSw,The one's who CRY ABOUT ANYTHING THAT THEY HAV...,Negative,1,0,2022-03-11 19:54:11,US,25,non-toxic,0.988574
698649,UgxUIVRsgz_KM3yhxVB4AaABAg,EVb91vPC7hs,PBBM GALIT NA KAY DIGONG! DUTERTE BAKIT PIKON ...,@antoniodalumpines158,UCA25u3uBfq_p2A8vm48aFgg,Mr. Duterte.. magpahinga ka na please.. gayahi...,Neutral,16,1,2025-01-21 12:43:46,PH,25,non-toxic,0.68067



=== ToxicScore Stats ===
count    1.032225e+06
mean     9.798927e-01
std      6.221688e-02
min      5.000129e-01
25%      9.935626e-01
50%      9.980566e-01
75%      9.988497e-01
max      9.992611e-01
Name: ToxicScore, dtype: float64

=== ToxicLabel Distribution ===
ToxicLabel
non-toxic    984277
toxic         47948
Name: count, dtype: int64

Percentages:
ToxicLabel
non-toxic    95.35%
toxic         4.65%
Name: proportion, dtype: object

=== Null Values Per Column ===
CommentID            0
VideoID              0
VideoTitle           0
AuthorName         631
AuthorChannelID      0
CommentText          0
Sentiment            0
Likes                0
Replies              0
PublishedAt          0
CountryCode          0
CategoryID           0
ToxicLabel           0
ToxicScore           0
dtype: int64

📝 Wrote summary -> toxicity_dataset_summary.json


  'generated_at': datetime.utcnow().isoformat() + 'Z'


### Inspect saved toxicity-labeled CSV
The cell above loads the CSV you generated (`Notebooks/youtube_comments_with_toxicity_20250914_061551.csv`) and reports:
- File existence & size
- Row / column counts
- Column names & dtypes
- First & random sample rows
- Label distribution (`ToxicLabel`)
- Basic score statistics
- Null value counts

If the exact timestamped file isn't found, it will try to pick the most recent matching `youtube_comments_with_toxicity_*.csv` in the current directory or `Notebooks/` folder.