<a href="https://colab.research.google.com/github/Umayr7/ToxicCommentClassification/blob/main/ToxicClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk

# Resources needed by clean_text: the stopword corpus plus the Punkt tokenizer data.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt', so both
# are requested; nltk.download reports (rather than raises) if a resource is unavailable.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Data Preprocessing Utility

In [None]:
### Preprocess the text data
# Compiled once at definition time instead of on every call.
# re.escape is required: unescaped, the "\]" inside string.punctuation is read as an
# escaped bracket, so the backslash character itself was never removed.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r'\d+')
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading the corpus on first use."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text):
    """Normalize one comment for bag-of-words style models.

    Steps, in order: lowercase, strip every character in ``string.punctuation``,
    remove runs of digits, tokenize with ``word_tokenize``, drop English stopwords,
    and join the remaining tokens with single spaces.

    Args:
        text: The raw comment. Non-string values (e.g. ``None`` or a pandas NaN)
            are treated as an empty comment.

    Returns:
        The cleaned comment as a single space-separated string ("" if nothing is left).
    """
    if not isinstance(text, str):
        # Missing values arrive as float NaN / None when applied over a DataFrame column.
        return ""
    text = text.lower()
    text = _PUNCTUATION_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    # Set lookup; previously the stopword list was re-read from disk for every token.
    stop_words = _english_stopwords()
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

In [None]:
### Removing rows where comment is empty
def remove_empty_rows(data):
    """Return the rows of `data` whose `comment_text` value is not missing.

    The input frame is left untouched; the original index labels are kept.
    """
    has_comment = data['comment_text'].notna()
    return data[has_comment]

In [None]:
### Removing duplicates
def remove_duplicates(data):
  """Return `data` with fully identical rows collapsed to their first occurrence."""
  is_repeat = data.duplicated()
  return data[~is_repeat]

In [None]:
### Removing missing values
def remove_missing_values(data):
  """Return only the rows of `data` that have no missing value in any column."""
  is_complete = data.notna().all(axis=1)
  return data[is_complete]

# Utility For Saving Preprocessed Data

In [None]:
### Save the cleaned dataset to a new CSV file
def save_preprocessed_data(data, cleaned_csv_path='/content/drive/MyDrive/preprocessed_data_toxic.csv'):
  """Write `data` to a CSV file without the index column.

  Args:
    data: DataFrame to persist.
    cleaned_csv_path: Destination file. Defaults to the Google Drive location this
      helper always wrote to, so existing one-argument calls behave as before.
  """
  data.to_csv(cleaned_csv_path, index=False)

# Loading Data

In [None]:
# Raw-file URL of the training CSV hosted in the project's GitHub repository.
pathToDataset = 'https://raw.githubusercontent.com/Umayr7/ToxicCommentClassification/main/train.csv'

### Loading Data from Git Repo
# pandas fetches the URL directly; the cell then previews the first five rows.
data = pd.read_csv(filepath_or_buffer=pathToDataset)
data.head(n=5)

In [None]:
### Load the cleaned dataset
import os

cleaned_csv_path = '/content/drive/MyDrive/NLP/cleaned_toxic_comments.csv'
# On a fresh kernel Drive is not mounted yet and the cleaned file is only written by a
# later cell, so an unconditional read would stop "Run All" with FileNotFoundError.
# Reuse the cached file when it exists; otherwise keep the raw `data` loaded above.
if os.path.exists(cleaned_csv_path):
    data = pd.read_csv(cleaned_csv_path)
else:
    print(f"No cached cleaned dataset at {cleaned_csv_path}; keeping the raw data.")

# Applying Preprocessing

# Logistic Regression

In [None]:
### DO NOT RUN THIS CELL FOR NOW
### Download stopwords if necessary
# NOTE(review): these are the same two downloads the imports cell at the top of the
# notebook already performs; this cell looks redundant — confirm before removing.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
### Load the dataset
# Mounts Google Drive at /content/drive (Colab only) and reads train.csv from it.
# This rebinds `data`, replacing whatever the earlier loading cells assigned.
from google.colab import drive
drive.mount('/content/drive')
data = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### Checking data length before preprocessing
# Row count of the raw frame, used as the baseline for the clean-up steps below.
data.shape[0]

159571

In [None]:
### Removing duplicates
# Same operation via the helper defined in the preprocessing utilities section.
data = remove_duplicates(data)
len(data)

159571

In [None]:
### Removing missing values
# Same operation via the helper defined in the preprocessing utilities section.
data = remove_missing_values(data)
len(data)

159571

In [None]:
### Clean the comment_text column
# Build a new frame with the cleaned text appended as the last column,
# running clean_text over every raw comment.
data = data.assign(cleaned_comment_text=data['comment_text'].apply(clean_text))

display(data)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comment_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww matches background colour im seemingly st...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really trying edit war guy constant...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,cant make real suggestions improvement wondere...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page thats
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,second time asking view completely contradicts...
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,ashamed horrible thing put talk page
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,spitzer umm theres actual article prostitution...
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,looks like actually put speedy first version d...


In [None]:
### Applying SMOTE to resolve class imbalancing

# The split now happens BEFORE vectorizing and oversampling. Fitting TF-IDF and SMOTE on
# the full dataset and splitting afterwards leaks information into the test set: the
# vocabulary/IDF weights see test comments, and the "test" rows include synthetic
# samples interpolated from training rows, which inflates every reported metric.
X = data['cleaned_comment_text'].fillna('').astype(str)  # guard against NaN text
y = data['toxic']

### Data split (stratified so both splits keep the original toxic / non-toxic ratio)
X_train_text, X_test_text, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize the text data using TF-IDF: fit on the training split only,
# then reuse the learned vocabulary for the test split.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Apply SMOTE to balance the classes of the training split only;
# the test split keeps its real class distribution.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train_tfidf, y_train_raw)
X_train, y_train = X_balanced, y_balanced

In [None]:
### Fitting Logistic Regression
# Baseline classifier: iteration cap raised to 1000, seed fixed at 42.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X=X_train, y=y_train)

In [None]:
### Validating model on Test Data
# Scores for ROC-AUC come from the second predict_proba column (index 1);
# hard labels for the threshold-based metrics come from predict.
y_prob = log_reg.predict_proba(X_test)[:, 1]
y_pred = log_reg.predict(X_test)

In [None]:
### Evaluation Metrics
# The four label-based metrics share the (y_true, y_pred) signature, so compute them together.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the predicted scores rather than the hard labels.
roc_auc = roc_auc_score(y_test, y_prob)

In [None]:
# Report every metric with four decimal places, one per line.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("ROC-AUC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.9294
Precision: 0.9169
Recall: 0.9444
F1-Score: 0.9305
ROC-AUC: 0.9765


# BERT

In [None]:
### Installing necessary libraries
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install scikit-learn
!pip install nltk

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
### BERT IMPORTS
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments

In [None]:
# Raw-file URL of train.csv in the project's GitHub repository
# (same value the "Loading Data" section assigns earlier in the notebook).
pathToDataset = 'https://raw.githubusercontent.com/Umayr7/ToxicCommentClassification/main/train.csv'

In [None]:
### Loading Data from Git Repo
# Reads the CSV at `pathToDataset` into `data2` and previews the first rows.
data2 = pd.read_csv(pathToDataset)
data2.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
### Loading Dataset
# Mounts Google Drive (Colab only) and reads train.csv from it.
# This rebinds `data2`, replacing the frame loaded from the GitHub URL just above.
from google.colab import drive
drive.mount('/content/drive')
data2 = pd.read_csv('/content/drive/MyDrive/NLP/train.csv')

Mounted at /content/drive


In [None]:
# Append the cleaned text as a new last column by running clean_text over each raw comment.
data2 = data2.assign(cleaned_comment_text=data2['comment_text'].apply(clean_text))

In [None]:
### Save the cleaned dataset to a new CSV file
# Writes `data2` (including the cleaned_comment_text column) to Drive without the index,
# so the slow cleaning step does not have to be repeated.
cleaned_csv_path = '/content/drive/MyDrive/NLP/cleaned_toxic_comments.csv'
data2.to_csv(cleaned_csv_path, index=False)

In [None]:
### Load the cleaned dataset
# Reloads the file written by the previous cell.
# NOTE(review): read_csv parses empty fields as NaN by default, so comments whose cleaned
# text is an empty string presumably come back as NaN here — downstream code must handle that.
cleaned_csv_path = '/content/drive/MyDrive/NLP/cleaned_toxic_comments.csv'
data2 = pd.read_csv(cleaned_csv_path)

In [None]:
# Sanity check: the row count should be unchanged by the save/load round-trip.
data2.shape[0]

159571

In [None]:
### Split the data into train and test sets
# X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(data2['cleaned_comment_text'], data2['toxic'], test_size=0.2, random_state=42)

# Comments whose cleaned text is empty come back from the CSV round-trip as NaN.
# Map them to '' before astype(str); otherwise they become the literal token "nan"
# and BERT is trained on a word the author never wrote.
# Targets are the six label columns, one multi-hot row per comment.
X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(
    data2['cleaned_comment_text'].fillna('').astype(str).tolist(),
    data2[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Loading BERT tokenizer
# Uses the 'bert-base-uncased' checkpoint, the same one the classification model is loaded from below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize the data
# Both splits are encoded with identical settings: truncate to at most 512 tokens and pad.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(X_train_BERT, **tokenizer_options)
test_encodings = tokenizer(X_test_BERT, **tokenizer_options)

In [None]:
class ToxicCommentsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-comment label rows.

    `encodings` is a mapping from field name to a per-sample sequence; `labels` is a
    sequence with one label row per sample. Each item is a dict of tensors holding
    every encoding field plus a float 'labels' tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # Labels are stored as float tensors.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Create dataset objects
# One dataset per split, each pairing that split's encodings with its label rows.
train_dataset, test_dataset = (
    ToxicCommentsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train_BERT),
        (test_encodings, y_test_BERT),
    )
)

In [None]:
### Create dataloaders
# Batches of 16; only the training loader shuffles.
# NOTE(review): neither loader is referenced again in the visible notebook — the Trainer
# below is given the datasets directly. Confirm whether these are still needed.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
# Load the BERT model for sequence classification
# Six independent toxicity labels per comment. problem_type is set explicitly so the
# multi-label loss no longer depends on the model inferring it from the float label
# dtype produced by ToxicCommentsDataset.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    problem_type="multi_label_classification",
)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Define training arguments
# Hyper-parameters collected in one place, then handed to TrainingArguments.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_options = dict(
    output_dir='./drive/MyDrive/NLP/results',   # checkpoints
    logging_dir='./drive/MyDrive/NLP/logs',     # training logs
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",                # evaluate once per epoch
)
training_args = TrainingArguments(**training_options)



In [None]:
# Initialize Trainer
from transformers import Trainer, TrainingArguments

# The held-out split doubles as the evaluation set used for the per-epoch validation loss.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer = Trainer(**trainer_options)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0353,0.042165
2,0.0368,0.042144
3,0.0211,0.046488


TrainOutput(global_step=23937, training_loss=0.04031553095609775, metrics={'train_runtime': 8275.1451, 'train_samples_per_second': 46.279, 'train_steps_per_second': 2.893, 'total_flos': 1.0076673349450138e+17, 'train_loss': 0.04031553095609775, 'epoch': 3.0})

In [None]:
# Evaluate the model
# No dataset is passed, so the Trainer falls back to the eval_dataset it was constructed with.
trainer.evaluate()

{'eval_loss': 0.046488262712955475,
 'eval_runtime': 200.4085,
 'eval_samples_per_second': 159.25,
 'eval_steps_per_second': 9.955,
 'epoch': 3.0}

In [None]:
# Get predictions
# preds_output = trainer.predict(test_dataset)
# preds = np.argmax(preds_output.predictions, axis=1)
preds_output = trainer.predict(test_dataset)
# `predictions` holds raw logits, not probabilities. Thresholding logits at 0.5 is the
# same as requiring a probability above ~0.62, which silently lowers recall. Convert to
# per-label probabilities with a sigmoid first, then apply the 0.5 cut-off.
probs = torch.sigmoid(torch.as_tensor(preds_output.predictions)).numpy()
preds = (probs > 0.5).astype(int)

In [None]:
# Calculate metrics
# accuracy = accuracy_score(y_test_BERT, preds)
# precision = precision_score(y_test_BERT, preds)
# recall = recall_score(y_test_BERT, preds)
# f1 = f1_score(y_test_BERT, preds)
# roc_auc = roc_auc_score(y_test_BERT, preds_output.predictions[:, 1])

# Precision/recall/F1/ROC-AUC are micro-averaged over all label columns.
micro_average = {"average": "micro"}
accuracy = accuracy_score(y_test_BERT, preds)
precision, recall, f1 = (
    metric(y_test_BERT, preds, **micro_average)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC-AUC is scored on the model's raw prediction scores rather than the thresholded labels.
roc_auc = roc_auc_score(y_test_BERT, preds_output.predictions, **micro_average)

In [None]:
# Report every metric with four decimal places, one per line.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("ROC-AUC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.9241
Precision: 0.8290
Recall: 0.7079
F1-Score: 0.7637
ROC-AUC: 0.9893
