In [1]:
!pip install tensorflow
!pip install -U scikit-learn
!pip install pandas
!pip install matplotlib

Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.2


In [2]:
import os # to work with different file paths
import pandas as pd # to read tabular data
import tensorflow as tf # deep learning framework
import numpy as np

In [3]:
df = pd.read_csv(os.path.join(r'/content/train.csv')) # os.path.join -> gives full file path

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [6]:
df.iloc[0]

id                                                0000997932d777bf
comment_text     Explanation\nWhy the edits made under my usern...
toxic                                                            0
severe_toxic                                                     0
obscene                                                          0
threat                                                           0
insult                                                           0
identity_hate                                                    0
Name: 0, dtype: object

In [7]:
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [8]:
# ***** 1.PREPROCESS -: *****
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
aiohttp                          3.9.3
aiosignal                        1.3.1
alabaster                        0.7.16
albumentations                   1.3.1
altair                           4.2.2
annotated-types                  0.6.0
anyio                            3.7.1
appdirs                          1.4.4
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array_record                     0.5.1
arviz                            0.15.1
astropy                          5.3.4
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.1.0
attrs                            23.2.0
audioread                        3.0.1
autograd                         1.6.2
Babel                            2.14.0
backcall                         0.2.0
beautifulsoup4                   4.12.3


In [9]:
from tensorflow.keras.layers import TextVectorization # TextVectorization layer to tokenize our text

In [10]:
 # TextVectorization  is a preprocessing layer which maps text features to integer sequences

In [11]:
# splitting our dataset into comments and features(labels)
X = df['comment_text']
y = df[df.columns[2:]].values # df.columns[2:] --> slicing ; .values --> to convert it into numpy array, this gives us a file format that we can actually pass through our tensorflow deep learning model

In [12]:
# each comment now has a vector which represents whether or not it falls into one of these categories or not
df[df.columns[2:]].values

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [13]:
MAX_FEATURES = 200000 # number of words in our dictionary, so inside our text vectorization layer we can specify how many words we want to store

In [14]:
# intializing our textVectorization layer
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')
# output_sequence_length --> specifies what is the max length (in tokens) of our sentences going to be


In [15]:
# teaching our vectorizer our vocabulary
vectorizer.adapt(X.values) # adapt ll learn all the words that are inside of our dictionary

In [16]:
vectorized_text = vectorizer(X.values) # going through each and every word and tokenizing it

In [17]:
vectorizer('Hello Alice , Life has been great' )[:6]

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([ 288, 8293,  306,   43,   58,  275])>

In [18]:
vectorized_text # 159571 is the max no of examples i.e len(x) and 1800 (max no of words)  max output_sequence_length

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [19]:
# creating a tensorflow data pipeline
#MCSBP - map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps  prevent bottlenecks

In [20]:
dataset.as_numpy_iterator().next() # gets one batch as text in its vectorized form and labels

(array([[  71,  345,   52, ...,    0,    0,    0],
        [  27,   48,   15, ...,    0,    0,    0],
        [1976,    2, 2640, ...,    0,    0,    0],
        ...,
        [ 102,  247,   72, ...,    0,    0,    0],
        [1961,   13,  737, ...,    0,    0,    0],
        [  94,   13,  351, ...,    0,    0,    0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 1, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

In [21]:
batch_x,batch_y=dataset.as_numpy_iterator().next()

In [22]:
batch_x.shape # 16 samples and each 1800 max words

(16, 1800)

In [23]:
len(dataset) # 9974 batches, actually data is 9974*16 = 159584

9974

In [24]:
len(dataset)*.7

6981.799999999999

In [25]:
int(len(dataset)*.7)

6981

In [26]:
# create our training (70%), validation(20%) and testing(10%) partions
train = dataset.take(int(len(dataset)*.7)) # takin 70% of the length of our dataset i.e assigning 70% to out training partion
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)) # .skip--> specifies that we want to skip a particular partion
# as our training partion is 70% of all our data , so first we are going to skip that 70% then we are going to take 20% as our validation partition
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1)) # skipping 90 % as this is training and validation data together

In [27]:
len(val)

1994

In [None]:
train_generator=train.as_numpy_iterator()

In [None]:
train_generator.next() # progresssively steps through our batches
# while training our deep learning model ll pass through a batch , do a forward pass, a backward pass , go and update the gradients and then it ll go to the next batch

(array([[   39,   154,   114, ...,     0,     0,     0],
        [  124,     7,   100, ...,     0,     0,     0],
        [    7,    20,  1501, ...,     0,     0,     0],
        ...,
        [   23,    14,     9, ...,     0,     0,     0],
        [    8,    55,   105, ...,     0,     0,     0],
        [    8,    67, 34219, ...,     0,     0,     0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

In [None]:
# ***** 2. CREATE SEQUENTIAL MODEL-: *****
# building our deep learning model
# the first layer in the model is the embedding layer, during training it may learn what words are positive,negative or might be subjective or objective
# embedding layer--> almost like a personality test for a word, it ll tell us all about that word which makes it useful for deep learning
from tensorflow.keras.models import Sequential # using sequential api for deep learning
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding # importing layers that ll be needed to build our deep learning model
# starting with LSTM layers
# Bidirectional layers is going to be our modifier & is going to allow us to pass the features or values from our LSTM o/p across the board as we r passing through our sequences
# dropout is a method of regularization
# dense layer is fully connected layer
# we don't need to pass through a pre existing embedding our deep learning neural network is going to learn all of the features that it needs inside of that particular embedding

In [None]:
model = Sequential() # instantiate our model

# Create the embedding layer
model.add(Embedding(MAX_FEATURES+1, 32))
# MAX_FEATURES+1-->we are passing through how many words that many different embeddings ll be required i.e one embedding per word and our embeddings ll be 2000001 in length & 32 values in long
#32--> features we are going to have in that embedding

# Bidirectional LSTM Layer
# LSTM layer is going to have 32 different LSTM units
# specify an activation of tanh--> as GPU acceleration that is req for an lstm layout needs to be tanh (dictated by tensorflow)
# Bidirectional --> allows us to pass info backwards & forwards across our LSTM layers ex> " I don't hate u" , bidirectional is useful for sentences coz words prior to a current word ll still have meaning, might even modify the meaning
# if our neural network is looking at it purely from left to right, it might see hate as the last value and interpret that as a negative statement but coz "don't" is the prev modifier that modifies the o/p meaning thus imaplementing bidirectional is helpful specially when implementing NLP
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu')) # final dense layer with 128 units & an activation of 'relu'

# Final layer
model.add(Dense(6, activation='sigmoid')) # o/p is going to be btw 0 & 1 as we are using sigmoid activation fn

In [None]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
# training
# Each epoch represents one pass through the entire training dataset.
# here we are going through the total dataset 1 time as epochs=1
history = model.fit(train, epochs=1, validation_data=val)

 205/6981 [..............................] - ETA: 3:20:25 - loss: 0.1961

KeyboardInterrupt: 

In [None]:
history.history

In [None]:
from matplotlib import pyplot as plt # to visualize our loss metrics

In [None]:
plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot()
plt.show()

In [None]:
# ***** MAKE PREDICTIONS-: *****
batch=test.as_numpy_iterator().next()
input_text = vectorizer('I love you')

In [None]:
input_text

In [None]:
np.expand_dims(input_text,0) # the i/p shape our model is expecting

In [None]:
df.columns[2:]

In [None]:
model.predict(np.array([input_text])) # model.predict(np.expand_dims(input_text,0))

In [None]:
test.as_numpy_iterator().next()

In [None]:
batch_x,batch_y = test.as_numpy_iterator().next()

In [None]:
model.predict(batch_x) # multiple comments being passed at a time

In [None]:
(model.predict(batch_x) > 0.5).astype(int)

In [None]:
# ***** EVALUATING OUR MODEL-: *****
# since we have multiple binary o/p we can use binary classification metrics like precision and recall
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# instantiate each of the metrics
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator(): # loop through eaach batch inside our data pipeline
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true) # passing our tokenized comments

    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat) # making the updates based on the current batch of data
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()},Accuracy:{acc.result().numpy()}')

In [None]:

# For Fetching Comments
from googleapiclient.discovery import build


In [None]:

API_KEY = 'AIzaSyCVHgZKcD0vkshrEJRvBKRD0Ub56BLC_HI'

youtube = build('youtube', 'v3', developerKey=API_KEY) # initializing Youtube API

# Getting the channelId of the video uploader
video_response = youtube.videos().list(
    part='snippet',
    id=video_id
).execute()

# Splitting the response for channelID
video_snippet = video_response['items'][0]['snippet']
uploader_channel_id = video_snippet['channelId']

In [None]:
import re

def extract_video_and_channel_ids(youtube_link):
    # Extract video ID from the YouTube link
    video_id_match = re.search(r'(?<=v=)[^&#]+', youtube_link)
    video_id = video_id_match.group() if video_id_match else None

    # Fetch channel ID associated with the video
    video_response = youtube.videos().list(
        part='snippet',
        id=video_id
    ).execute()

    # Extract channel ID from the video response
    video_snippet = video_response['items'][0]['snippet']
    uploader_channel_id = video_snippet['channelId']

    return video_id, uploader_channel_id


In [None]:
# Define a function to fetch comments from YouTube and classify them as toxic or not
def fetch_and_classify_comments(youtube_link):
    # Extract video ID and channel ID from the YouTube link
    video_id, uploader_channel_id = extract_video_and_channel_ids(youtube_link)

    # Define a function to check if a comment is toxic
    def is_toxic(comment):
        vectorized_comment = vectorizer([comment])
        results = model.predict(vectorized_comment)

        toxic_labels = []
        # Check if any of the 6 labels is true (indicating toxicity)
        for idx, col in enumerate(df.columns[2:]):
            if results[0][idx] > 0.5:
                toxic_labels.append(col)

        # If any toxic label is identified, return True
        if toxic_labels:
            return True
        else:
            return False

    # Initialize list to store toxic comments
    toxic_comments = []

    # Fetch comments from YouTube
    print("Fetching Comments...")
    nextPageToken = None
    while len(toxic_comments) < 600:
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=1000,  # You can fetch up to 1000 comments per request
            pageToken=nextPageToken
        )
        response = request.execute()
        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']
            # Check if the comment is not from the video uploader
            if comment['authorChannelId']['value'] != uploader_channel_id:
                if is_toxic(comment['textDisplay']):
                    toxic_comments.append(comment['textDisplay'])
        nextPageToken = response.get('nextPageToken')

        if not nextPageToken:
            break

    # Output toxic comments
    return toxic_comments

# Function to take YouTube link from the user and fetch & classify toxic comments
def fetch_and_classify_toxic_comments_from_user_input():
    youtube_link = input("Enter the YouTube video link: ")
    toxic_comments = fetch_and_classify_comments(youtube_link)
    print("Toxic Comments:")
    for idx, comment in enumerate(toxic_comments):
        print(f"Toxic Comment {idx+1}: {comment}")

# Call the function to fetch and classify toxic comments from user input
fetch_and_classify_toxic_comments_from_user_input()




In [None]:
# fn for interface
def get_toxic_comments(youtube_link):
    toxic_comments = fetch_and_classify_comments(youtube_link)
    return "\n".join(toxic_comments)


In [None]:
# TEST & Interface using Gradio API

In [None]:
!pip install gradio==3.43.1

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
model.save('toxicityDetection.h5')

In [None]:
model = tf.keras.models.load_model('toxicityDetection.h5')

In [None]:
input_str = vectorizer('I love you, you are sooo inspiring')

In [None]:
res = model.predict(np.expand_dims(input_str,0))

In [None]:
res

In [None]:
iface = gr.Interface(
    fn=get_toxic_comments,
    inputs="text",
    outputs="text",
    title="YouTube Toxic Comments Extractor",
    description="Enter a YouTube video link to extract and display all the toxic comments on the video.",
)

iface.launch()