In [None]:
# Install required packages
!pip install transformers nltk emoji contractions beautifulsoup4 torch torchvision
!pip install -q pandas numpy scikit-learn


Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m

In [None]:
import time
import pandas as pd
import numpy as np
import tqdm
from tqdm import tqdm  # Import tqdm correctly

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import emoji
import contractions
from bs4 import BeautifulSoup
import unicodedata
from google.colab import drive
import os
import logging
import random
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Set up logging
# Configure the root logger once so INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions and classes defined below
# (data preparation, model save/load, main).
logger = logging.getLogger(__name__)

In [None]:
# Download required NLTK resources
# NOTE(review): 'punkt' and 'averaged_perceptron_tagger' are not referenced by
# the code visible in this notebook -- presumably kept for word_tokenize / POS
# tagging; confirm before removing.
nltk.download('punkt')
# English stop-word list read in TextPreprocessor.__init__.
nltk.download('stopwords')
# WordNet data backing the WordNetLemmatizer created in TextPreprocessor.__init__.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Seed every random number generator used in this notebook so runs are repeatable
def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed_value: Integer seed applied to each generator (default 42).

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed_value)

set_seed()

In [None]:
class TextPreprocessor:
    """Clean, tokenize and lemmatize raw tweet text.

    `clean_text` normalises the raw string (HTML, emojis, Unicode,
    contractions, URLs, mentions, hashtags, punctuation, whitespace);
    `tokenize_and_lemmatize` drops English stop words and lemmatizes the
    remaining tokens; `full_preprocessing` chains the two.
    """

    # Punctuation tokens that are always kept by `tokenize_and_lemmatize`.
    _KEPT_PUNCTUATION = frozenset({'!', '?', '.', ','})

    # Typographic quotes -> ASCII quotes. Applied before the ASCII fold so that
    # apostrophes survive long enough for contraction expansion
    # (e.g. "don\u2019t" -> "don't" -> "do not").
    _QUOTE_TABLE = str.maketrans({
        '\u2018': "'",
        '\u2019': "'",
        '\u201c': '"',
        '\u201d': '"',
    })

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                            reduce_len=True,
                                            strip_handles=True)
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Return a lower-cased, ASCII-only, whitespace-normalised copy of `text`.

        Args:
            text: Raw text; any value for which `pd.isna` is true yields "".

        Returns:
            The cleaned string. Only word characters, whitespace and the
            punctuation marks `.,!?` remain.
        """
        if pd.isna(text):
            return ""

        text = str(text)

        # Remove HTML
        text = BeautifulSoup(text, 'html.parser').get_text()

        # Handle emojis (must run before the ASCII fold below, which would
        # otherwise simply delete them)
        text = emoji.demojize(text)

        # Map curly quotes to ASCII so contractions can still be recognised
        text = text.translate(self._QUOTE_TABLE)

        # Fold accented characters to their ASCII base letters ("café" ->
        # "cafe") and drop whatever non-ASCII characters remain. Previously all
        # non-ASCII characters were stripped *before* this normalisation, which
        # deleted accented letters outright and made this step a no-op.
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

        # Replace contractions
        text = contractions.fix(text)

        # Basic cleaning
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\brt\b|\bcc\b', '', text)
        text = re.sub(r'@\w+', '', text)
        # Keep the hashtag word, drop only the '#'
        text = re.sub(r'#(\w+)', r'\1', text)

        # Remove special characters but keep sentence structure
        text = re.sub(r'[^\w\s.,!?]', ' ', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize_and_lemmatize(self, text):
        """Tokenize `text`, drop stop words, lemmatize, and re-join with spaces.

        The punctuation marks `! ? . ,` are always kept.
        """
        tokens = self.tweet_tokenizer.tokenize(text)
        kept = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words or token in self._KEPT_PUNCTUATION
        ]
        return ' '.join(kept)

    def full_preprocessing(self, text):
        """Apply `clean_text` followed by `tokenize_and_lemmatize`."""
        text = self.clean_text(text)
        text = self.tokenize_and_lemmatize(text)
        return text


In [None]:
class ToxicityDataset(Dataset):
    """Torch dataset that pairs each text with its label and tokenizes on access.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of integer class labels, aligned with `texts`.
        tokenizer: Object exposing `encode_plus` (a BERT tokenizer in this notebook).
        max_length: Length every sample is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the `input_ids`, `attention_mask` and `labels` tensors for sample `idx`."""
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        # Tensors are flattened to 1-D so the DataLoader can stack them per batch
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item


In [None]:
class ToxicityClassifier:
    """Fine-tunes and serves a three-class BERT toxicity classifier.

    Bundles a `bert-base-uncased` tokenizer, a `BertForSequenceClassification`
    model with three output labels, and the `TextPreprocessor` applied to raw
    text in `predict`.
    """

    def __init__(self, model_path=None, device=None):
        """Create the classifier.

        Args:
            model_path: Directory previously written by `save_model`. When it
                exists the checkpoint is loaded; otherwise a fresh
                `bert-base-uncased` model with a new classification head is
                created (and a warning is logged if a path was given).
            device: torch device to run on; defaults to CUDA when available.
        """
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = None
        self.preprocessor = TextPreprocessor()

        if model_path and os.path.exists(model_path):
            self.load_model(model_path)
        else:
            if model_path:
                # Without this, a mistyped path silently yields an untrained
                # model whose predictions are meaningless.
                logger.warning(
                    f"Model path {model_path} not found; "
                    "using an untrained bert-base-uncased classification head"
                )
            self.model = BertForSequenceClassification.from_pretrained(
                'bert-base-uncased',
                num_labels=3
            ).to(self.device)

    def save_model(self, save_path):
        """Save the model and tokenizer.

        Writes `model.pt` (state dict + model config) and the tokenizer files
        into `save_path`, creating the directory if needed.
        """
        os.makedirs(save_path, exist_ok=True)
        model_dict = {
            'state_dict': self.model.state_dict(),
            'config': self.model.config
        }
        torch.save(model_dict, os.path.join(save_path, 'model.pt'))
        self.tokenizer.save_pretrained(save_path)
        logger.info(f"Model saved to {save_path}")

    def load_model(self, load_path):
        """Load a checkpoint written by `save_model` and put the model in eval mode.

        Args:
            load_path: Directory containing `model.pt`.
        """
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only
        # runtime (and vice versa).
        # weights_only=False is required because the checkpoint stores the
        # pickled model config next to the tensors.
        # SECURITY: this unpickles arbitrary objects -- only load checkpoints
        # you created yourself.
        model_dict = torch.load(
            os.path.join(load_path, 'model.pt'),
            map_location=self.device,
            weights_only=False,
        )
        # Build the architecture from the saved config; every weight is then
        # overwritten by the (strict) state-dict load, so fetching the
        # pretrained weights again would be wasted work.
        self.model = BertForSequenceClassification(model_dict['config'])
        self.model.load_state_dict(model_dict['state_dict'])
        self.model.to(self.device)
        self.model.eval()
        logger.info(f"Model loaded from {load_path}")

    def train(self, train_data, val_data, num_epochs=3, batch_size=16, learning_rate=2e-5,
              save_path='best_model'):
        """Train the model with progress monitoring.

        Args:
            train_data: DataFrame with 'processed_tweet' and 'label' columns.
            val_data: DataFrame with the same columns, used for validation.
            num_epochs: Number of passes over `train_data`.
            batch_size: Batch size for both training and validation.
            learning_rate: Initial AdamW learning rate (linearly decayed to 0).
            save_path: Directory the best checkpoint (by weighted validation
                F1) is written to.

        Returns:
            List with one dict per epoch holding 'epoch', 'training_loss',
            'val_f1', 'val_accuracy' and 'epoch_time'.

        After training, `self.model` holds the weights of the best epoch: if a
        later epoch scored worse, the best checkpoint is reloaded from
        `save_path`.
        """
        # Create datasets
        train_dataset = ToxicityDataset(
            train_data['processed_tweet'].values,
            train_data['label'].values,
            self.tokenizer
        )
        val_dataset = ToxicityDataset(
            val_data['processed_tweet'].values,
            val_data['label'].values,
            self.tokenizer
        )

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        # One scheduler step per optimizer step
        total_steps = len(train_dataloader) * num_epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        best_val_f1 = 0
        best_epoch = None  # 1-based epoch whose checkpoint sits in save_path
        training_stats = []

        print(f"Starting training with {num_epochs} epochs...")
        print(f"Total training samples: {len(train_dataset)}")
        print(f"Total validation samples: {len(val_dataset)}")
        print(f"Batch size: {batch_size}")
        print("-" * 60)

        for epoch in range(num_epochs):
            print(f'\nEpoch {epoch + 1}/{num_epochs}')
            epoch_start_time = time.time()

            # Training (evaluate() below switches to eval mode, so re-enable
            # train mode at the start of every epoch)
            self.model.train()
            total_loss = 0

            # Progress bar for training batches
            progress_bar = tqdm(train_dataloader, desc="Training", leave=True)

            for batch_idx, batch in enumerate(progress_bar):
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                # Passing labels makes the model compute the loss itself
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                batch_loss = loss.item()
                total_loss += batch_loss

                # Update progress bar with current loss
                progress_bar.set_postfix({
                    'loss': f'{batch_loss:.4f}',
                    'avg_loss': f'{total_loss/(batch_idx+1):.4f}'
                })

                loss.backward()
                # Clip the gradient norm to 1.0 before the optimizer step
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

            avg_train_loss = total_loss / len(train_dataloader)

            # Validation
            print("\nRunning validation...")
            val_preds, val_true = self.evaluate(val_dataloader)
            val_report = classification_report(val_true, val_preds, output_dict=True)
            val_f1 = val_report['weighted avg']['f1-score']
            val_accuracy = accuracy_score(val_true, val_preds)

            # Calculate epoch time
            epoch_time = time.time() - epoch_start_time

            # Store statistics
            epoch_stats = {
                'epoch': epoch + 1,
                'training_loss': avg_train_loss,
                'val_f1': val_f1,
                'val_accuracy': val_accuracy,
                'epoch_time': epoch_time
            }
            training_stats.append(epoch_stats)

            # Print epoch summary
            print(f"\nEpoch {epoch + 1} Summary:")
            print(f"Average training loss: {avg_train_loss:.4f}")
            print(f"Validation F1 Score: {val_f1:.4f}")
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            print(f"Epoch completed in {epoch_time:.2f} seconds")

            # Save best model
            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                best_epoch = epoch + 1
                self.save_model(save_path)
                print(f'New best model saved with F1: {val_f1:.4f}')

            print("-" * 60)

        # Print training summary
        print("\nTraining completed!")
        print("\nTraining Statistics Summary:")
        for stat in training_stats:
            print(f"\nEpoch {stat['epoch']}:")
            print(f"Training Loss: {stat['training_loss']:.4f}")
            print(f"Validation F1: {stat['val_f1']:.4f}")
            print(f"Validation Accuracy: {stat['val_accuracy']:.4f}")
            print(f"Epoch Time: {stat['epoch_time']:.2f} seconds")

        # The in-memory weights are those of the final epoch. If an earlier
        # epoch validated better, restore it so that callers evaluate and
        # persist the best model rather than the last one.
        if best_epoch is not None and best_epoch != num_epochs:
            logger.info(f"Restoring best checkpoint from epoch {best_epoch}")
            self.load_model(save_path)

        return training_stats

    def evaluate(self, dataloader):
        """Evaluate the model with progress bar.

        Args:
            dataloader: Yields batches with 'input_ids', 'attention_mask' and
                'labels' tensors.

        Returns:
            Tuple `(predictions, actual_labels)` of plain Python lists of
            class ids. The model is left in eval mode.
        """
        self.model.eval()
        predictions = []
        actual_labels = []

        progress_bar = tqdm(dataloader, desc="Evaluating", leave=True)

        with torch.no_grad():
            for batch in progress_bar:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels']

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                # Predicted class = index of the largest logit
                _, preds = torch.max(outputs.logits, dim=1)
                predictions.extend(preds.cpu().tolist())
                actual_labels.extend(labels.tolist())

        return predictions, actual_labels

    def predict(self, text):
        """Predict toxicity class for a single text.

        The raw text goes through the same `full_preprocessing` that is
        applied to the training tweets.

        Returns:
            The predicted class id as a Python int.
        """
        self.model.eval()

        # Preprocess text
        processed_text = self.preprocessor.full_preprocessing(text)

        encoding = self.tokenizer.encode_plus(
            processed_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

        return preds.item()



In [None]:
def prepare_data(base_path, train_frac=0.7, val_frac=0.15, shuffle=False, random_state=42):
    """Prepare and preprocess the data.

    Reads `labeled_data.csv` from `base_path`, derives a class label from the
    annotator vote counts, preprocesses the tweets and splits the rows into
    train / validation / test sets.

    Args:
        base_path: Directory containing `labeled_data.csv`. The file must have
            'hate_speech', 'offensive_language', 'neither' and 'tweet' columns.
        train_frac: Fraction of rows used for training.
        val_frac: Fraction of rows used for validation; the remainder is the
            test set.
        shuffle: When True, rows are shuffled (seeded by `random_state`)
            before splitting. The default keeps the file order.
        random_state: Seed for the optional shuffle.

    Returns:
        Tuple `(train_data, val_data, test_data)` of DataFrames, each carrying
        the added 'label' and 'processed_tweet' columns.

    Raises:
        ValueError: If the fractions are negative or sum to more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError("train_frac and val_frac must be non-negative and sum to at most 1")

    # Read and combine data
    logger.info("Reading and preprocessing data...")

    labeled_data = pd.read_csv(os.path.join(base_path, 'labeled_data.csv'))

    # Create label column (0: hate speech, 1: offensive language, 2: neither).
    # Vectorised form of: hate speech only when it strictly out-votes both
    # other classes; otherwise offensive when it strictly out-votes 'neither';
    # otherwise neither. np.select takes the first matching condition.
    hate_votes = labeled_data['hate_speech']
    offensive_votes = labeled_data['offensive_language']
    neither_votes = labeled_data['neither']
    labeled_data['label'] = np.select(
        [
            (hate_votes > offensive_votes) & (hate_votes > neither_votes),
            offensive_votes > neither_votes,
        ],
        [0, 1],
        default=2,
    )

    # Preprocess tweets
    preprocessor = TextPreprocessor()
    labeled_data['processed_tweet'] = labeled_data['tweet'].apply(preprocessor.full_preprocessing)

    if shuffle:
        labeled_data = labeled_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Split data (positional, contiguous slices)
    train_size = int(train_frac * len(labeled_data))
    val_size = int(val_frac * len(labeled_data))

    # .copy() so callers can modify a split without pandas chained-assignment warnings
    train_data = labeled_data.iloc[:train_size].copy()
    val_data = labeled_data.iloc[train_size:train_size + val_size].copy()
    test_data = labeled_data.iloc[train_size + val_size:].copy()

    return train_data, val_data, test_data



In [None]:
def main():
    """Run the whole pipeline: mount Drive, prepare data, train, evaluate, save, demo."""
    # Google Drive holds both the dataset and the saved model
    drive.mount('/content/drive')

    base_path = '/content/drive/MyDrive/Twitter_Dataset_for_Toxicity_Classification'
    os.makedirs(base_path, exist_ok=True)

    # Load the CSV, label it, preprocess the tweets and split it
    train_data, val_data, test_data = prepare_data(base_path)

    # Fresh classifier, fine-tuned on the training split
    classifier = ToxicityClassifier()
    logger.info("Training model...")
    classifier.train(train_data, val_data)

    # Held-out evaluation
    logger.info("\nEvaluating on test set...")
    test_texts = test_data['processed_tweet'].values
    test_labels = test_data['label'].values
    test_dataloader = DataLoader(
        ToxicityDataset(test_texts, test_labels, classifier.tokenizer),
        batch_size=16,
    )
    test_preds, test_true = classifier.evaluate(test_dataloader)

    # Persist the trained model next to the dataset on Drive
    model_dir = os.path.join(base_path, 'best_model')
    classifier.save_model(model_dir)

    # Report test metrics
    print('\nTest Results:')
    print('Accuracy:', accuracy_score(test_true, test_preds))
    print('\nClassification Report:')
    print(classification_report(test_true, test_preds))

    # Single-sentence smoke test of the prediction path
    example_text = "You fucking asshole fuck YOU!!!!!!"
    prediction = classifier.predict(example_text)
    print(f"\nPrediction for '{example_text}': {prediction}")

if __name__ == "__main__":
    main()

Mounted at /content/drive


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training with 3 epochs...
Total training samples: 17348
Total validation samples: 3717
Batch size: 16
------------------------------------------------------------

Epoch 1/3


Training: 100%|██████████| 1085/1085 [06:06<00:00,  2.96it/s, loss=0.0086, avg_loss=0.3320]



Running validation...


Evaluating: 100%|██████████| 233/233 [00:28<00:00,  8.26it/s]



Epoch 1 Summary:
Average training loss: 0.3320
Validation F1 Score: 0.9200
Validation Accuracy: 0.9266
Epoch completed in 394.87 seconds
New best model saved with F1: 0.9200
------------------------------------------------------------

Epoch 2/3


Training: 100%|██████████| 1085/1085 [06:15<00:00,  2.89it/s, loss=0.0258, avg_loss=0.2418]



Running validation...


Evaluating: 100%|██████████| 233/233 [00:28<00:00,  8.25it/s]



Epoch 2 Summary:
Average training loss: 0.2418
Validation F1 Score: 0.9237
Validation Accuracy: 0.9306
Epoch completed in 403.66 seconds
New best model saved with F1: 0.9237
------------------------------------------------------------

Epoch 3/3


Training: 100%|██████████| 1085/1085 [06:15<00:00,  2.89it/s, loss=1.4182, avg_loss=0.1955]



Running validation...


Evaluating: 100%|██████████| 233/233 [00:28<00:00,  8.25it/s]



Epoch 3 Summary:
Average training loss: 0.1955
Validation F1 Score: 0.9269
Validation Accuracy: 0.9298
Epoch completed in 403.40 seconds
New best model saved with F1: 0.9269
------------------------------------------------------------

Training completed!

Training Statistics Summary:

Epoch 1:
Training Loss: 0.3320
Validation F1: 0.9200
Validation Accuracy: 0.9266
Epoch Time: 394.87 seconds

Epoch 2:
Training Loss: 0.2418
Validation F1: 0.9237
Validation Accuracy: 0.9306
Epoch Time: 403.66 seconds

Epoch 3:
Training Loss: 0.1955
Validation F1: 0.9269
Validation Accuracy: 0.9298
Epoch Time: 403.40 seconds


Evaluating: 100%|██████████| 233/233 [00:28<00:00,  8.22it/s]



Test Results:
Accuracy: 0.9233458848843464

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.38      0.44       178
           1       0.95      0.97      0.96      2954
           2       0.89      0.88      0.88       586

    accuracy                           0.92      3718
   macro avg       0.78      0.74      0.76      3718
weighted avg       0.92      0.92      0.92      3718


Prediction for 'You fucking asshole fuck YOU!!!!!!': 1


In [None]:
!pip install gradio --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.7/56.7 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.8/319.8 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.3/73.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gradio as gr

# Directory holding the checkpoint saved by main()
_MODEL_PATH = '/content/drive/MyDrive/Twitter_Dataset_for_Toxicity_Classification/best_model'

# Class id -> human-readable label
_LABEL_MAPPING = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Neither"
}

# Holds the single classifier instance once it has been created
_classifier_cache = {}


def _get_classifier():
  """Return the shared classifier, creating it on first use."""
  if 'classifier' not in _classifier_cache:
    _classifier_cache['classifier'] = ToxicityClassifier(model_path=_MODEL_PATH)
  return _classifier_cache['classifier']


def predict_toxicity(text):
  """Classify `text` and return its label name.

  The classifier is built once and reused for every request; constructing it
  on each call would reload the tokenizer and the model checkpoint every time.

  Returns:
    "Hate Speech", "Offensive Language" or "Neither"; "Unknown" if the
    classifier returns an unmapped class id.
  """
  # Make prediction using the shared classifier
  prediction = _get_classifier().predict(text)

  # Map prediction to label
  return _LABEL_MAPPING.get(prediction, "Unknown")

# Build the Gradio UI: a two-line text box in, the predicted label out
comment_box = gr.Textbox(lines=2, placeholder="Enter your comment here...")
iface = gr.Interface(
    fn=predict_toxicity,
    inputs=comment_box,
    outputs="text",
    title="Toxicity Classification",
    description="Enter a comment to check its toxicity level.",
)

# Start serving the interface
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a67a0f89d48df4b5d1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


