In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm

In [19]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per item.

    Args:
        texts: Sequence of raw texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` with the keyword arguments
            used in ``__getitem__``.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so that ``self.texts[idx]`` is
        # always positional. A pandas Series indexes by *label*, so a shuffled
        # split (non-contiguous index) would raise KeyError or return the
        # wrong row for the positional indices a DataLoader produces.
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

        if len(self.texts) != len(self.labels):
            raise ValueError("Texts and labels must have the same length")

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            tokenizer output flattened) and a ``labels`` tensor of dtype long.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension; flatten drops
        # it so the DataLoader can stack samples into a batch itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [20]:
def train_model(model, train_loader, val_loader, device, epochs=3):
    """Fine-tune ``model`` and print loss/accuracy after every epoch.

    Args:
        model: Module that accepts ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returns an object with ``loss``
            and ``logits`` attributes.
        train_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        val_loader: Iterable of validation batches in the same format.
        device: Device the batch tensors are moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Metrics are printed to stdout.
    """
    optimizer = AdamW(model.parameters(), lr=2e-5)

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')

        # Training
        model.train()
        train_loss = 0
        train_steps = 0

        for batch in tqdm(train_loader, desc='Training'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_steps += 1

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = train_loss / max(train_steps, 1)
        print(f'Average training loss: {avg_train_loss:.4f}')

        # Validation
        model.eval()
        val_loss = 0
        val_steps = 0
        correct_predictions = 0
        samples_seen = 0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                val_loss += loss.item()

                predictions = torch.argmax(outputs.logits, dim=1)
                correct_predictions += (predictions == labels).sum().item()
                # Count the samples actually evaluated: the last batch may be
                # shorter than batch_size, and the validation loader need not
                # share the training loader's batch size.
                samples_seen += labels.size(0)
                val_steps += 1

        avg_val_loss = val_loss / max(val_steps, 1)
        accuracy = correct_predictions / max(samples_seen, 1)
        print(f'Average validation loss: {avg_val_loss:.4f}')
        print(f'Validation accuracy: {accuracy:.4f}')

In [21]:
def main():
    """Pick the compute device and print which one is in use."""
    # Prefer CUDA whenever PyTorch reports it as available; otherwise use CPU.
    if torch.cuda.is_available():
        backend = 'cuda'
    else:
        backend = 'cpu'
    device = torch.device(backend)
    print(f'Using device: {device}')

In [22]:
# Load the raw dataset CSV into the notebook-level DataFrame `df`.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on a machine that has exactly this directory layout; a configurable data
# directory would make the notebook portable.
import pandas as pd
df = pd.read_csv(
    'C:/Users/Nilofar/Desktop/twon-hate-speech-detection/Data/dataset.csv')

In [15]:
# Discard every row that has a missing value in any column (rebinding `df`),
# then preview the first five remaining rows.
df = df.dropna(axis=0, how='any')
df.head(5)

Unnamed: 0.1,Unnamed: 0,post_id,tweet_text,key_features,target,label,annotator_1_label,annotator_1_target,annotator_2_label,annotator_2_target,annotator_3_label,annotator_3_target
3,3,1179088797964763136_twitter,<user> i am bit confused coz chinese ppl can n...,"['ching', 'chong']",Asian,hatespeech,hatespeech,Asian,offensive,Asian,hatespeech,Asian
4,4,1179085312976445440_twitter,this bitch in whataburger eating a burger with...,"['bitch', 'i', 'hate', 'white', 'bitches']","Caucasian, Women",hatespeech,hatespeech,"Caucasian, Women",hatespeech,"Women, Caucasian",offensive,"Women, Caucasian"
5,5,13851720_gab,laura loomer raped me while screaming at me in...,"['disgusting', 'kike', 'language', 'exterminat...",Jewish,hatespeech,hatespeech,Jewish,hatespeech,Jewish,hatespeech,Jewish
6,6,24198545_gab,and this is why i end up with nigger trainee d...,"['nigger', 'trainee', 'who', 'can', 'not', 'sp...",African,hatespeech,hatespeech,African,hatespeech,African,offensive,African
7,7,14567516_gab,nogs jews and dykes how enriching,"['nogs', 'jews', 'and', 'dykes']","African, Homosexual, Jewish",offensive,offensive,"African, Jewish, Homosexual",hatespeech,"African, Jewish, Homosexual",offensive,Jewish


In [24]:
# Report whether PyTorch's CUDA backend is available in this kernel.
torch.cuda.is_available()

True

In [26]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt


class CustomDataset(Dataset):
    """Tokenizing dataset over parallel sequences of texts and labels.

    Both inputs are copied into plain Python lists at construction time, so
    items are always addressed by position. Each item is a dict holding the
    flattened ``input_ids`` and ``attention_mask`` produced by the tokenizer
    plus the label as a ``torch.long`` tensor.
    """

    @staticmethod
    def _to_list(values):
        """Return ``values`` as a list, using its own ``tolist`` if it has one."""
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Lists index positionally whatever container type was passed in.
        self.texts = self._to_list(texts)
        self.labels = self._to_list(labels)

        # Every text needs exactly one label.
        assert len(self.texts) == len(self.labels), \
            "Texts and labels must have the same length"

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode and return the sample stored at position ``idx``."""
        size = len(self.texts)
        if idx >= size:
            raise IndexError(
                f"Index {idx} out of bounds for dataset of size {size}")

        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # Flatten each tokenizer tensor to 1-D before handing it back.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


def train_model(model, train_loader, val_loader, device, le, epochs=3):
    """Fine-tune ``model``, report validation metrics and checkpoint the best epoch.

    After every epoch the validation loss, accuracy and a per-class
    classification report are printed. Whenever the validation accuracy
    exceeds the best seen so far, the model's ``state_dict`` is written to
    ``best_model.pt`` in the current working directory.

    Args:
        model: Module that accepts ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returns an object with ``loss``
            and ``logits`` attributes.
        train_loader: Iterable of batches; each batch is a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        val_loader: Iterable of validation batches in the same format.
        device: Device the batch tensors are moved to before the forward pass.
        le: Fitted label encoder; only ``le.classes_`` is read, to name the
            rows of the classification report.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None.
    """
    optimizer = AdamW(model.parameters(), lr=2e-5)
    best_accuracy = 0
    # Pin the report to the full class list. Without an explicit ``labels``
    # argument the report infers the classes from the data, which no longer
    # matches ``target_names`` when a class is absent from a validation split.
    # Assumes labels are the integer codes 0..n-1 in the order of
    # ``le.classes_`` (what LabelEncoder.fit_transform produces).
    report_labels = list(range(len(le.classes_)))

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')

        # Training
        model.train()
        train_loss = 0
        train_steps = 0

        for batch in tqdm(train_loader, desc='Training'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_steps += 1

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = train_loss / max(train_steps, 1)
        print(f'Average training loss: {avg_train_loss:.4f}')

        # Validation
        model.eval()
        val_loss = 0
        val_steps = 0
        all_predictions = []
        all_true_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                val_loss += loss.item()

                predictions = torch.argmax(outputs.logits, dim=1)
                all_predictions.extend(predictions.cpu().numpy())
                all_true_labels.extend(labels.cpu().numpy())
                val_steps += 1

        # Calculate metrics
        avg_val_loss = val_loss / max(val_steps, 1)
        if all_true_labels:
            accuracy = (np.array(all_predictions) ==
                        np.array(all_true_labels)).mean()
        else:
            # No validation samples: report 0 instead of a NaN mean.
            accuracy = 0.0

        print(f'Average validation loss: {avg_val_loss:.4f}')
        print(f'Validation accuracy: {accuracy:.4f}')
        if all_true_labels:
            print('\nClassification Report:')
            print(classification_report(all_true_labels, all_predictions,
                                        labels=report_labels,
                                        target_names=le.classes_))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'New best model saved with accuracy: {accuracy:.4f}')


def main():
    """Run the full fine-tuning pipeline: load data, train, and export the model.

    Steps: read the CSV, drop rows missing ``tweet_text`` or ``label``, encode
    the labels, make a stratified 80/20 split, fine-tune
    ``bert-base-uncased`` via ``train_model``, then save the model and
    tokenizer to ``fine_tuned_bert`` and the label names to
    ``label_classes.npy``.

    The exported model carries the weights of the best validation epoch:
    ``train_model`` writes ``best_model.pt`` whenever validation accuracy
    improves, and that checkpoint is reloaded before exporting.

    Any exception is caught, reported with its traceback, and not re-raised.
    """
    import os  # local import: only needed for the checkpoint handling below

    best_checkpoint = 'best_model.pt'  # file name written by train_model

    try:
        # Check if CUDA is available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f'Using device: {device}')

        # Load and preprocess data
        print("Loading dataset...")
        df = pd.read_csv(
            'C:/Users/Nilofar/Desktop/twon-hate-speech-detection/Data/dataset.csv')

        # Data validation
        required_columns = ['tweet_text', 'label']
        if not all(col in df.columns for col in required_columns):
            raise ValueError(
                f"Dataset must contain columns: {required_columns}")

        # Remove any NaN values
        print("Cleaning dataset...")
        df = df.dropna(subset=['tweet_text', 'label'])

        # Reset index after dropping NaN values
        df = df.reset_index(drop=True)

        print(f"Dataset shape after cleaning: {df.shape}")

        # Convert labels to numerical values
        le = LabelEncoder()
        df['label'] = le.fit_transform(df['label'])

        # Print class distribution
        print("\nClass Distribution:")
        for i, label in enumerate(le.classes_):
            count = len(df[df['label'] == i])
            print(f"{label}: {count} samples")

        # Split the dataset
        print("\nSplitting dataset...")
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            df['tweet_text'],
            df['label'],
            test_size=0.2,
            random_state=42,
            stratify=df['label']
        )

        # Initialize tokenizer and model
        print("Initializing BERT model and tokenizer...")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Size the classification head from the data instead of assuming
        # three classes.
        num_labels = len(le.classes_)
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_labels
        )

        # Create datasets
        print("Creating datasets...")
        train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
        val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

        # Create dataloaders
        print("Creating dataloaders...")
        train_loader = DataLoader(
            train_dataset,
            batch_size=16,
            shuffle=True
        )
        val_loader = DataLoader(
            val_dataset,
            batch_size=16
        )

        # Move model to GPU if available
        model.to(device)

        # Remove a checkpoint left behind by an earlier run, so any file found
        # after training is guaranteed to come from this run.
        if os.path.exists(best_checkpoint):
            os.remove(best_checkpoint)

        # Train the model
        print("\nStarting training...")
        train_model(model, train_loader, val_loader, device, le)

        # After training the model holds the last epoch's weights, which are
        # not necessarily the best ones; restore the best checkpoint so the
        # exported model matches the best reported validation accuracy.
        if os.path.exists(best_checkpoint):
            print("\nRestoring best checkpoint...")
            model.load_state_dict(
                torch.load(best_checkpoint, map_location=device))

        # Save the final model and tokenizer
        print("\nSaving model and tokenizer...")
        model.save_pretrained('fine_tuned_bert')
        tokenizer.save_pretrained('fine_tuned_bert')
        np.save('label_classes.npy', le.classes_)

        print("Training completed successfully!")

    except Exception as e:
        print(f"\nError occurred: {str(e)}")
        print("\nFull error details:")
        import traceback
        traceback.print_exc()


# Entry point: start the training pipeline when this code is run as the main
# module (which is also the case when the cell is executed in a notebook).
if __name__ == '__main__':
    main()

Using device: cuda
Loading dataset...
Cleaning dataset...
Dataset shape after cleaning: (20148, 12)

Class Distribution:
hatespeech: 6234 samples
normal: 8153 samples
offensive: 5761 samples

Splitting dataset...
Initializing BERT model and tokenizer...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Creating datasets...
Creating dataloaders...





Starting training...

Epoch 1/3


Training: 100%|██████████| 1008/1008 [1:17:15<00:00,  4.60s/it]


Average training loss: 0.8189


Validation: 100%|██████████| 252/252 [00:45<00:00,  5.58it/s]


Average validation loss: 0.7526
Validation accuracy: 0.6739

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.73      0.76      0.74      1247
      normal       0.66      0.81      0.73      1631
   offensive       0.60      0.39      0.48      1152

    accuracy                           0.67      4030
   macro avg       0.67      0.65      0.65      4030
weighted avg       0.67      0.67      0.66      4030

New best model saved with accuracy: 0.6739

Epoch 2/3


Training: 100%|██████████| 1008/1008 [2:02:36<00:00,  7.30s/it]  


Average training loss: 0.6548


Validation: 100%|██████████| 252/252 [01:49<00:00,  2.30it/s]


Average validation loss: 0.7476
Validation accuracy: 0.6792

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.78      0.71      0.74      1247
      normal       0.73      0.73      0.73      1631
   offensive       0.52      0.57      0.54      1152

    accuracy                           0.68      4030
   macro avg       0.68      0.67      0.67      4030
weighted avg       0.69      0.68      0.68      4030

New best model saved with accuracy: 0.6792

Epoch 3/3


Training: 100%|██████████| 1008/1008 [1:25:58<00:00,  5.12s/it]


Average training loss: 0.4856


Validation: 100%|██████████| 252/252 [01:19<00:00,  3.16it/s]


Average validation loss: 0.8852
Validation accuracy: 0.6516

Classification Report:
              precision    recall  f1-score   support

  hatespeech       0.73      0.74      0.74      1247
      normal       0.76      0.62      0.68      1631
   offensive       0.48      0.60      0.53      1152

    accuracy                           0.65      4030
   macro avg       0.66      0.65      0.65      4030
weighted avg       0.67      0.65      0.66      4030


Saving model and tokenizer...
Training completed successfully!
