<a href="https://colab.research.google.com/github/Umayr7/ToxicCommentClassification/blob/main/ToxicClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
### Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import SMOTE
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#DATA PREPROCESSING UTILITY

In [None]:
### Preprocess the text data
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r'\d+', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    return " ".join(words)

In [None]:
### Removing rows where comment is empty
def remove_empty_rows(data):
    data = data.dropna(subset=['comment_text'])
    return data

In [None]:
### Removing duplicates
def remove_duplicates(data):
  data = data.drop_duplicates()
  return data

In [None]:
### Removing missing values
def remove_missing_values(data):
  data = data.dropna()
  return data

In [None]:
### Removing id column from data
def remove_id_column(data):
  data = data.drop('id', axis=1)
  return data

# Utility For Saving Preprocessed Data

In [None]:
### Save the cleaned dataset to a new CSV file
def save_preprocessed_data(data):
  cleaned_csv_path = '/content/drive/MyDrive/preprocessed_data_toxic.csv'
  data.to_csv(cleaned_csv_path, index=False)

# Loading Data

In [None]:
pathToDataset = 'https://raw.githubusercontent.com/Umayr7/ToxicCommentClassification/main/train.csv'

### Loading Data from Git Repo
data = pd.read_csv(pathToDataset)
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
### Load the cleaned dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
cleaned_csv_path = '/content/drive/MyDrive/preprocessed_data_toxic.csv'
data = pd.read_csv(cleaned_csv_path)

data = data.dropna(subset=['comment_text'])

data.head()

Mounted at /content/drive


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,daww matches background colour im seemingly st...,0,0,0,0,0,0
2,hey man im really trying edit war guy constant...,0,0,0,0,0,0
3,cant make real suggestions improvement wondere...,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


# Applying Preprocessing

In [None]:
### Removing duplicates
data = remove_duplicates(data)
len(data)

159571

In [None]:
### Removing missing values
data = remove_missing_values(data)
len(data)

159571

In [None]:
### Cleaning data
data['comment_text'] = data['comment_text'].apply(clean_text)

display(data)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww matches background colour im seemingly st...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0,0,0,0,0,0
3,0001b41b1c6bb37e,cant make real suggestions improvement wondere...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page thats,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,second time asking view completely contradicts...,0,0,0,0,0,0
159567,ffea4adeee384e90,ashamed horrible thing put talk page,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres actual article prostitution...,0,0,0,0,0,0
159569,fff125370e4aaaf3,looks like actually put speedy first version d...,0,0,0,0,0,0


In [None]:
### Removing id column from data
data = remove_id_column(data)
data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,daww matches background colour im seemingly st...,0,0,0,0,0,0
2,hey man im really trying edit war guy constant...,0,0,0,0,0,0
3,cant make real suggestions improvement wondere...,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


In [None]:
### Saving Clean Data
save_preprocessed_data(data)

# Feature And Labels

In [7]:
### Prepare the data for training
X = data['comment_text'].values
y = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [8]:
### Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization

In [9]:
### Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=20000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

#Logistic Regression

In [10]:
### Train LogReg using OneVsRest strategy
logreg = OneVsRestClassifier(LogisticRegression(solver='liblinear', max_iter=1000))
logreg.fit(X_train_tfidf, y_train)

In [11]:
### Predict on the test set
y_pred = logreg.predict(X_test_tfidf)

In [12]:
### Print classification report
for i, column in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    print(f"Classification report for {column}:\n")
    print(classification_report(y_test[:, i], y_pred[:, i]))
    print("\n")

# Calculate and print overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.4f}')

Classification report for toxic:

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28907
           1       0.92      0.60      0.73      2999

    accuracy                           0.96     31906
   macro avg       0.94      0.80      0.85     31906
weighted avg       0.96      0.96      0.95     31906



Classification report for severe_toxic:

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     31596
           1       0.52      0.21      0.30       310

    accuracy                           0.99     31906
   macro avg       0.76      0.60      0.65     31906
weighted avg       0.99      0.99      0.99     31906



Classification report for obscene:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30196
           1       0.92      0.61      0.74      1710

    accuracy                           0.98     31906
   macro avg 

# SVM

In [13]:
### Train SVM using OneVsRest strategy
svm = LinearSVC()
multi_target_svm = OneVsRestClassifier(svm)
multi_target_svm.fit(X_train_tfidf, y_train)

In [14]:
### Evaluate the model on the test set
y_pred = multi_target_svm.predict(X_test_tfidf)

In [15]:
# Print classification report
for i, column in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    print(f"Classification report for {column}:\n")
    print(classification_report(y_test[:, i], y_pred[:, i]))
    print("\n")

# Calculate and print overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.4f}')

Classification report for toxic:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     28907
           1       0.86      0.69      0.76      2999

    accuracy                           0.96     31906
   macro avg       0.91      0.84      0.87     31906
weighted avg       0.96      0.96      0.96     31906



Classification report for severe_toxic:

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     31596
           1       0.52      0.26      0.35       310

    accuracy                           0.99     31906
   macro avg       0.76      0.63      0.67     31906
weighted avg       0.99      0.99      0.99     31906



Classification report for obscene:

              precision    recall  f1-score   support

           0       0.98      0.99      0.99     30196
           1       0.88      0.69      0.77      1710

    accuracy                           0.98     31906
   macro avg 

# LSTM

In [16]:
### Tokenize the text
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

In [17]:
### Getting tokenized data
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [18]:
### Pad sequences to ensure uniform input size
max_length = 150
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

In [19]:
### Define the model
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='sigmoid'))

In [20]:
### Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
### Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

In [22]:
### Train the model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 4: early stopping


In [23]:
### Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9880


In [24]:
### Make predictions
predictions = model.predict(X_test_pad)
print(predictions)

[[9.9230498e-01 7.5363033e-02 8.8584620e-01 8.1108063e-03 7.1321714e-01
  4.1462786e-02]
 [2.5530005e-04 5.2090554e-06 8.3661260e-05 1.3305640e-05 7.6906654e-05
  3.3449509e-05]
 [1.8146931e-04 1.3852444e-06 2.7108390e-05 1.3434012e-06 2.5909256e-05
  3.5783999e-06]
 ...
 [2.0406672e-04 6.6437942e-06 8.7820488e-05 8.8759161e-06 5.2953546e-05
  2.2785174e-05]
 [1.1210339e-02 1.7145739e-04 1.8142135e-03 1.9799365e-04 1.8320786e-03
  2.1418223e-04]
 [8.3111745e-04 2.6940537e-05 1.2759569e-04 9.1199574e-05 2.2043182e-04
  1.7400627e-04]]


#BERT

In [None]:
### Installing necessary libraries
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install scikit-learn
!pip install nltk

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
### BERT IMPORTS
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments

In [None]:
pathToDataset = 'https://raw.githubusercontent.com/Umayr7/ToxicCommentClassification/main/train.csv'

In [None]:
### Loading Data from Git Repo
data2 = pd.read_csv(pathToDataset)
data2.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
### Loading Dataset
from google.colab import drive
drive.mount('/content/drive')
data2 = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')

Mounted at /content/drive


In [None]:
data2['cleaned_comment_text'] = data2['comment_text'].apply(clean_text)

In [None]:
### Save the cleaned dataset to a new CSV file
cleaned_csv_path = '/content/drive/MyDrive/NLP/cleaned_toxic_comments.csv'
data2.to_csv(cleaned_csv_path, index=False)

In [None]:
### Load the cleaned dataset
cleaned_csv_path = '/content/drive/MyDrive/NLP/cleaned_toxic_comments.csv'
data2 = pd.read_csv(cleaned_csv_path)

In [None]:
len(data2)

159571

In [None]:
### Split the data into train and test sets
# X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(data2['cleaned_comment_text'], data2['toxic'], test_size=0.2, random_state=42)

X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(data2['cleaned_comment_text'].astype(str).tolist(),
                                                                        data2[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist(),
                                                                        test_size=0.2,
                                                                        random_state=42)

In [None]:
# Loading BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize the data
train_encodings = tokenizer(X_train_BERT, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(X_test_BERT, truncation=True, padding=True, max_length=512)

In [None]:
class ToxicCommentsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['labels'] = torch.tensor(self.labels[idx])
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = ToxicCommentsDataset(train_encodings, y_train_BERT)
test_dataset = ToxicCommentsDataset(test_encodings, y_test_BERT)

In [None]:
### Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./drive/MyDrive/NLP/results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./drive/MyDrive/NLP/logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)



In [None]:
# Initialize Trainer
from transformers import Trainer, TrainingArguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0353,0.042165
2,0.0368,0.042144
3,0.0211,0.046488


TrainOutput(global_step=23937, training_loss=0.04031553095609775, metrics={'train_runtime': 8275.1451, 'train_samples_per_second': 46.279, 'train_steps_per_second': 2.893, 'total_flos': 1.0076673349450138e+17, 'train_loss': 0.04031553095609775, 'epoch': 3.0})

In [None]:
# Evaluate the model
trainer.evaluate()

{'eval_loss': 0.046488262712955475,
 'eval_runtime': 200.4085,
 'eval_samples_per_second': 159.25,
 'eval_steps_per_second': 9.955,
 'epoch': 3.0}

In [None]:
# Get predictions
# preds_output = trainer.predict(test_dataset)
# preds = np.argmax(preds_output.predictions, axis=1)
preds_output = trainer.predict(test_dataset)
preds = (preds_output.predictions > 0.5).astype(int)

In [None]:
# Calculate metrics
# accuracy = accuracy_score(y_test_BERT, preds)
# precision = precision_score(y_test_BERT, preds)
# recall = recall_score(y_test_BERT, preds)
# f1 = f1_score(y_test_BERT, preds)
# roc_auc = roc_auc_score(y_test_BERT, preds_output.predictions[:, 1])

accuracy = accuracy_score(y_test_BERT, preds)
precision = precision_score(y_test_BERT, preds, average='micro')
recall = recall_score(y_test_BERT, preds, average='micro')
f1 = f1_score(y_test_BERT, preds, average='micro')
roc_auc = roc_auc_score(y_test_BERT, preds_output.predictions, average='micro')

In [None]:
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

Accuracy: 0.9241
Precision: 0.8290
Recall: 0.7079
F1-Score: 0.7637
ROC-AUC: 0.9893
