# Required libraries

In [17]:
import torch
from torch.utils.data import DataLoader

from tqdm import tqdm

from datasets import Dataset
from typing import Dict, Optional, List
import logging

from torch.optim import AdamW

from transformers import get_scheduler

import nltk
from nltk.corpus import wordnet
from transformers import MarianMTModel, MarianTokenizer, T5ForConditionalGeneration, T5Tokenizer
import random
import logging
from tqdm import tqdm

# File to import model and dataset
from HateXplain_dataset import HateXplainDataset
from HateXplain_model import HateXplainModel
from utils import f1, acc, custom_collate_fn, train_cycle

# For Test the code rapidly

In [18]:
SUBSAMPLE = True

# Load HateXplain dataset

In [19]:
"""load the dataset"""
# Create an instance of the class
hatexplain = HateXplainDataset()

# Load the dataset (this will also process the labels)
HateXplain_dataset = hatexplain.load_HateXplain()

Attempting to load HateXplain dataset...
Processed 15383 examples from train split
Processed 1924 examples from test split
Dataset loaded successfully


In [20]:
# Print statistics about the dataset
hatexplain.dataset = HateXplain_dataset
hatexplain._processed_dataset = HateXplain_dataset
hatexplain.print_dataset_statistics()


Dataset Statistics:

TRAIN Split Statistics:
------------------------------
Total examples: 15383

Label Distribution:
Hate Speech/Offensive: 9132 (59.36%)
Normal: 6251 (40.64%)

Target Group Distribution:
None: 5509 (35.81%)
African: 2335 (15.18%)
Homosexual: 1161 (7.55%)
Islam: 1156 (7.51%)
Jewish: 1011 (6.57%)
Women: 898 (5.84%)
Other: 883 (5.74%)
Arab: 603 (3.92%)
Refugee: 529 (3.44%)
Caucasian: 386 (2.51%)
Asian: 269 (1.75%)
Men: 236 (1.53%)
Hispanic: 187 (1.22%)
Disability: 53 (0.34%)
Christian: 52 (0.34%)
Heterosexual: 31 (0.20%)
Minority: 24 (0.16%)
Economic: 23 (0.15%)
Indian: 15 (0.10%)
Hindu: 11 (0.07%)
none: 7 (0.05%)
Indigenous: 2 (0.01%)
Bisexual: 1 (0.01%)
Buddhism: 1 (0.01%)

Text Length Statistics:
Average length: 23.47 words
Maximum length: 165 words
Minimum length: 2 words

Hate Speech Distribution by Target Group:
African: 2125 hate speech examples (91.01% of this target)
None: 1099 hate speech examples (19.95% of this target)
Jewish: 939 hate speech examples (92.8

In [21]:
# process the dataset by removing small groups (less than 100 samples on train set)
HateXplain_processed_dataset = hatexplain.remove_small_group()

Removing target groups with fewer than {'train': 100, 'test': 20} occurrences for training data...
Processing train split...
Removed 220 examples from train split.
Remaining examples in train split: 15163
Processing test split...
Removed 25 examples from test split.
Remaining examples in test split: 1899


In [22]:
if SUBSAMPLE:
    #Take only 100 samples for rapid training
    HateXplain_processed_dataset['train'] = HateXplain_processed_dataset['train'].shuffle(seed=42).select(range(100))
    HateXplain_processed_dataset['test'] = HateXplain_processed_dataset['test'].shuffle(seed=42).select(range(20))

# Load Pre-trained model bert-base-uncased-hatexplain

In [23]:
model_HateXplain_BERT = HateXplainModel(
    model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain",
    num_labels=3,
    device=None,
    max_length=512,
    f1=f1,
    acc=acc
)

Using device: cpu


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Tokenize HateXplain dataset

In [24]:
'''
 apply the function to all the elements in the dataset (individually or in batches)
 https://huggingface.co/docs/datasets/v1.11.0/package_reference/main_classes.html?highlight=dataset%20map#datasets.Dataset.map
 batch mode is very powerful. It allows you to speed up processing
 more info here: https://huggingface.co/docs/datasets/en/about_map_batch
'''

tokenized_HateXplain = {
    split: HateXplain_processed_dataset[split].map(
        lambda x: model_HateXplain_BERT.tokenize_function(x, 'text'),
        batched=True
    )
    for split in ['train', 'test']
}

print(tokenized_HateXplain)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

{'train': Dataset({
    features: ['text', 'labels', 'target_group', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
}), 'test': Dataset({
    features: ['text', 'labels', 'target_group', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 20
})}


### Create Dataloader to evaluate the model with the dataset

In [25]:
"""
Dataloader only for the test set
because the model is already trained.
"""
eval_dataset = tokenized_HateXplain['test']


In [26]:
# create a DataLoader for your test dataset so you can iterate over batches of data:
test_dataloader = DataLoader(
    eval_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=custom_collate_fn
)


### Test the model on the test dataset

In [27]:
# test on dataset
model_HateXplain_BERT.evaluate_BERT_with_bias(test_dataloader)

eval Loss: 0.5343,  ACC: 0.8750, F1-weighted: 0.7949
Overall Accuracy: 0.8000
Overall F1 Score: 0.7090
Accuracy for target group 'Men': 1.0000
F1 Score for target group 'Men': 1.0000
Confusion Matrix for 'Men':
[[2]]

Accuracy for target group 'Asian': 1.0000
F1 Score for target group 'Asian': 1.0000
Confusion Matrix for 'Asian':
[[1]]

Accuracy for target group 'None': 0.8571
F1 Score for target group 'None': 0.4615
Confusion Matrix for 'None':
[[6 1]
 [0 0]]

Accuracy for target group 'Homosexual': 0.6667
F1 Score for target group 'Homosexual': 0.4000
Confusion Matrix for 'Homosexual':
[[0 0]
 [1 2]]

Accuracy for target group 'African': 0.6667
F1 Score for target group 'African': 0.5556
Confusion Matrix for 'African':
[[1 0 0]
 [0 1 0]
 [1 0 0]]

Accuracy for target group 'Arab': 0.0000
F1 Score for target group 'Arab': 0.0000
Confusion Matrix for 'Arab':
[[0 1]
 [0 0]]

Accuracy for target group 'Refugee': 1.0000
F1 Score for target group 'Refugee': 1.0000
Confusion Matrix for 'Ref



# Data augmentation

In [28]:
class DataAugmentor:
    def __init__(
        self,
        device: Optional[str] = None,
        use_back_translation: bool = False,
        use_synonym_replacement: bool = False,
        use_random_deletion: bool = False,
        use_paraphrasing: bool = False
    ):
        """
        Initialize the DataAugmentor with various augmentation techniques.

        Args:
            device: Device to run models on ('cuda' or 'cpu')
            use_back_translation: Whether to use back-translation
            use_synonym_replacement: Whether to use synonym replacement
            use_paraphrasing: Whether to use paraphrasing
        """
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')

        # Download required NLTK data
        try:
            nltk.data.find('corpora/wordnet')
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('wordnet')
            nltk.download('punkt')

        # Initialize back-translation models if enabled
        if use_back_translation:
            try:
                print("Initializing translation models...")
                self.en_fr_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr').to(self.device)
                self.fr_en_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-fr-en').to(self.device)
                self.en_fr_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
                self.fr_en_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-fr-en')
                print("Translation models initialized successfully")
            except Exception as e:
                print(f"Error initializing translation models: {str(e)}")
                use_back_translation = False

        # Initialize paraphrasing model if enabled
        if use_paraphrasing:
            try:
                print("Initializing T5 model...")
                self.t5_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(self.device)
                self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
                print("T5 model initialized successfully")
            except Exception as e:
                print(f"Error initializing T5 model: {str(e)}")
                use_paraphrasing = False

        self.use_back_translation = use_back_translation
        self.use_synonym_replacement = use_synonym_replacement
        self.use_random_deletion = use_random_deletion
        self.use_paraphrasing = use_paraphrasing

    def get_synonyms(self, word: str) -> List[str]:
        """Get synonyms for a word using WordNet."""
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ')
                if synonym != word and synonym not in synonyms:
                    synonyms.append(synonym)
        return synonyms

    def synonym_replacement(self, text: str, n: int = 1) -> str:
        """Replace n random words with their synonyms."""
        words = text.split()
        n = min(n, len(words))
        new_words = words.copy()

        random_word_list = list(set([word for word in words if len(word) > 3]))
        random.shuffle(random_word_list)

        num_replaced = 0
        for random_word in random_word_list:
            synonyms = self.get_synonyms(random_word)
            if len(synonyms) > 0:
                synonym = random.choice(synonyms)
                new_words = [synonym if word == random_word else word for word in new_words]
                num_replaced += 1
            if num_replaced >= n:
                break

        return ' '.join(new_words)

    def back_translate(self, text: str) -> str:
        """Translate text to French and back to English."""
        try:
            # English to French
            inputs = self.en_fr_tokenizer(text, return_tensors="pt", padding=True).to(self.device)
            translated = self.en_fr_model.generate(**inputs)
            french = self.en_fr_tokenizer.decode(translated[0], skip_special_tokens=True)

            # French to English
            inputs = self.fr_en_tokenizer(french, return_tensors="pt", padding=True).to(self.device)
            translated = self.fr_en_model.generate(**inputs)
            english = self.fr_en_tokenizer.decode(translated[0], skip_special_tokens=True)

            return english
        except Exception as e:
            print(f"Error in back translation: {str(e)}")
            return text

    def paraphrase(self, text: str) -> str:
        """Generate a paraphrase using T5."""
        try:
            input_text = f"paraphrase: {text}"
            inputs = self.t5_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(self.device)
            outputs = self.t5_model.generate(
                inputs,
                max_length=512,
                num_beams=4,
                no_repeat_ngram_size=2,
                early_stopping=True
            )
            return self.t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Error in paraphrasing: {str(e)}")
            return text

    def random_deletion(self, text: str, p: float = 0.1) -> str:
        """
        Randomly delete words from the text with probability p.

        Args:
            text: The input text
            p: Probability of deleting each word

        Returns:
            Text with randomly deleted words
        """
        try:
            # Tokenize the text
            words = text.split()

            # Keep words with probability 1-p
            new_words = [word for word in words if random.random() > p]

            # If all words were deleted, keep at least one word
            if len(new_words) == 0:
                new_words = [random.choice(words)]

            return ' '.join(new_words)
        except Exception as e:
            print(f"Error in random deletion: {str(e)}")
        return text

    def augment_text(self, text: str) -> List[str]:
        """Apply all enabled augmentation techniques to the text."""
        augmented_texts = []

        if self.use_back_translation:
            augmented_texts.append(self.back_translate(text))

        if self.use_synonym_replacement:
            # Use both regular synonym replacement
            augmented_texts.append(self.synonym_replacement(text, n=2))

        if self.use_random_deletion:
            # Use random deletion
            augmented_texts.append(self.random_deletion(text, p=0.1))

        if self.use_paraphrasing:
            augmented_texts.append(self.paraphrase(text))

        return augmented_texts

    def augment_dataset(
        self,
        dataset: Dict[str, List],
        target_groups,
        texts_to_augment,
        indices_to_augment
    ) -> Dict[str, List]:
        """
        Augment the dataset for specified target groups.

        Args:
            dataset: Dictionary containing 'train' and 'test' datasets
            target_groups: List of target groups to augment
            augmentation_factor: Number of augmented versions to create per example

        Returns:
            Dictionary containing augmented datasets
        """
        augmented_data = {
            'train': {
                'text': [],
                'labels': [],
                'target_group': []
            },
        }

        target_groups_name = list(target_groups.keys())
        # Process each split
        print(f"\nProcessing {'train'} split...")

        # Process augmentation in batches
        batch_size = 8
        for i in tqdm(range(0, len(texts_to_augment), batch_size), desc="Augmentation batches"):
            batch_texts = texts_to_augment[i:i + batch_size]
            batch_indices = indices_to_augment[i:i + batch_size]

            for text, idx in zip(batch_texts, batch_indices):
                augmentation_factor = target_groups[dataset['train']['target_group'][idx][0]]
                for _ in range(augmentation_factor):
                    augmented_texts = self.augment_text(text)
                    for aug_text in augmented_texts:
                        augmented_data['train']['text'].append(aug_text)
                        augmented_data['train']['labels'].append(dataset['train']['labels'][idx])
                        augmented_data['train']['target_group'].append(dataset['train']['target_group'][idx])
        # Print statistics
        original_count = len(dataset['train']['text'])
        print(f"Original examples: {original_count}")
        print(f"Total examples: {len(augmented_data['train']['text'])}")
        # Print target group distribution
        if target_groups_name:
            print("\nTarget group distribution:")
            for group in target_groups_name:
                count = augmented_data['train']['target_group'].count([group])
                print(f"{group}: {count} examples")

        return augmented_data

In [29]:
target_groups_to_augment = {
    'Men': 2, # low F1 score
    'Refugee': 2, # medium F1 score
    'Caucasian': 2 # low F1 score
}
target_group_name = list(target_groups_to_augment.keys())
print(target_group_name)

['Men', 'Refugee', 'Caucasian']


In [30]:
# Then, augment examples for target groups
print("Generating augmented examples...")
texts_to_augment = []
indices_to_augment = []
# Collect texts that need augmentation
for i in range(len(HateXplain_processed_dataset['train']['text'])):
    if len(HateXplain_processed_dataset['train']['target_group'][i]) == 0:
        continue
    if HateXplain_processed_dataset['train']['target_group'][i][0] in target_group_name:
        texts_to_augment.append(HateXplain_processed_dataset['train']['text'][i])
        indices_to_augment.append(i)
print(len(indices_to_augment))

Generating augmented examples...
11


### Train and eval with Back translation

In [31]:
# Create new datasets with augmented data

# 1. Back-translation only

model_HateXplain_back_trans = HateXplainModel(
    model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain",
    num_labels=3,
    device=None,
    max_length=512,
    f1=f1,
    acc=acc
)

model_BERT_back_trans = model_HateXplain_back_trans.model

print("\nGenerating back-translation dataset...")
back_trans_augmentor = DataAugmentor(
    use_back_translation=True,
    use_synonym_replacement=False,
    use_random_deletion=False,
    use_paraphrasing=False
    )

'''back_trans_dataset = back_trans_augmentor.augment_dataset(
    dataset=HateXplain_dataset,
    target_groups=target_groups_to_augment,
)'''
back_trans_dataset = back_trans_augmentor.augment_dataset(
    dataset=HateXplain_processed_dataset,
    target_groups=target_groups_to_augment,
    texts_to_augment=texts_to_augment,
    indices_to_augment=indices_to_augment
)
augmented_back_trans = Dataset.from_dict(back_trans_dataset['train'])
tokenized_augmented_back_trans_train = augmented_back_trans.map(
    lambda x: model_HateXplain_back_trans.tokenize_function(x, "text"),
    batched=True
)
dataloader_augmented_back_trans_train = DataLoader(
    tokenized_augmented_back_trans_train,
    batch_size=16,
    shuffle=True,
    collate_fn=custom_collate_fn
)

Using device: cpu


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Generating back-translation dataset...
Initializing translation models...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mikae\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mikae\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Translation models initialized successfully

Processing train split...


Augmentation batches: 100%|██████████| 2/2 [01:35<00:00, 47.82s/it]


Original examples: 100
Total examples: 22

Target group distribution:
Men: 2 examples
Refugee: 10 examples
Caucasian: 10 examples


Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [16]:
# train and evaluate the model
num_epochs = 3
num_training_steps = len(dataloader_augmented_back_trans_train)

optimizer_back_trans = AdamW(model_BERT_back_trans.parameters(), lr=5e-5)

lr_scheduler_back_trans = get_scheduler(
    name="linear", optimizer=optimizer_back_trans, num_warmup_steps=1, num_training_steps=num_training_steps
)

train_cycle(
      model_HateXplain_back_trans,
      optimizer_back_trans,
      lr_scheduler_back_trans,
      dataloader_augmented_back_trans_train,
      test_dataloader,
      n_epochs=num_epochs,
      num_training_steps=num_training_steps
)

Epoch 0 of 2


  0%|          | 0/2 [00:00<?, ?it/s]

Batch type: <class 'dict'>


 50%|█████     | 1/2 [01:18<01:18, 78.05s/it]

Batch type: <class 'dict'>


100%|██████████| 2/2 [01:37<00:00, 48.96s/it]


train Loss: 0.8755,  ACC: 0.7708, F1-weighted: 0.5983
Epoch 1 of 2


  0%|          | 0/2 [00:00<?, ?it/s]

Batch type: <class 'dict'>


 50%|█████     | 1/2 [00:58<00:58, 58.01s/it]

Batch type: <class 'dict'>


100%|██████████| 2/2 [01:19<00:00, 39.73s/it]


train Loss: 0.5070,  ACC: 0.8333, F1-weighted: 0.8125
Epoch 2 of 2


  0%|          | 0/2 [00:00<?, ?it/s]

Batch type: <class 'dict'>


 50%|█████     | 1/2 [01:50<01:50, 110.14s/it]

Batch type: <class 'dict'>


100%|██████████| 2/2 [02:12<00:00, 66.27s/it] 


train Loss: 0.4307,  ACC: 0.8854, F1-weighted: 0.7735
eval Loss: 0.5259,  ACC: 0.8125, F1-weighted: 0.7095
Overall Accuracy: 0.7000
Overall F1 Score: 0.5671
Accuracy for target group 'Men': 1.0000
F1 Score for target group 'Men': 1.0000
Confusion Matrix for 'Men':
[[2]]

Accuracy for target group 'Asian': 0.0000
F1 Score for target group 'Asian': 0.0000
Confusion Matrix for 'Asian':
[[0 1]
 [0 0]]

Accuracy for target group 'None': 0.8571
F1 Score for target group 'None': 0.4615
Confusion Matrix for 'None':
[[6 1]
 [0 0]]

Accuracy for target group 'Homosexual': 0.3333
F1 Score for target group 'Homosexual': 0.1667
Confusion Matrix for 'Homosexual':
[[0 0 0]
 [0 0 0]
 [1 1 1]]

Accuracy for target group 'African': 0.6667
F1 Score for target group 'African': 0.5556
Confusion Matrix for 'African':
[[1 0 0]
 [0 1 0]
 [1 0 0]]

Accuracy for target group 'Arab': 0.0000
F1 Score for target group 'Arab': 0.0000
Confusion Matrix for 'Arab':
[[0 1]
 [0 0]]

Accuracy for target group 'Refugee': 



### Train and eval with random deletion

In [None]:
# 3. random deletion only

model_HateXplain_rand_del = HateXplainModel(
    model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain",
    num_labels=3,
    device=None,
    max_length=512,
    f1=f1,
    acc=acc
)

model_BERT_rand_del = model_HateXplain_rand_del.model

print("\nGenerating WSD-based synonym replacement dataset...")
deletion_augmentor = DataAugmentor(
    use_back_translation=False,
    use_synonym_replacement=False,
    use_random_deletion=True,
    use_paraphrasing=False)

deletion_dataset = deletion_augmentor.augment_dataset(
    dataset=HateXplain_processed_dataset,
    target_groups=target_groups_to_augment,
    texts_to_augment=texts_to_augment,
    indices_to_augment=indices_to_augment
)
augmented_deletion = Dataset.from_dict(deletion_dataset['train'])
tokenized_augmented_deletion_train = augmented_deletion.map(
    lambda x: model_HateXplain_rand_del.tokenize_function(x, "text"),
    batched=True
)
dataloader_augmented_deletion_train = DataLoader(
    tokenized_augmented_deletion_train,
    batch_size=16,
    shuffle=True,
    collate_fn=custom_collate_fn
)

In [None]:
# train and evaluate the model
num_epochs = 3
num_training_steps = len(dataloader_augmented_deletion_train)

optimizer_rand_del = AdamW(model_BERT_rand_del.parameters(), lr=5e-5)

lr_scheduler_rand_del = get_scheduler(
    name="linear", optimizer=optimizer_rand_del, num_warmup_steps=1, num_training_steps=num_training_steps
)

train_cycle(
      model_HateXplain_rand_del,
      optimizer_rand_del,
      lr_scheduler_rand_del,
      dataloader_augmented_deletion_train,
      test_dataloader,
      n_epochs=num_epochs,
      num_training_steps=num_training_steps
)

### Train and eval with Synonym

In [None]:
# 2. Synonym replacement only

model_HateXplain_synonym = HateXplainModel(
    model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain",
    num_labels=3,
    device=None,
    max_length=512,
    f1=f1,
    acc=acc
)

model_BERT_synonym = model_HateXplain_synonym.model

print("\nGenerating synonym replacement dataset...")
synonym_augmentor = DataAugmentor(
    use_back_translation=False,
    use_synonym_replacement=True,
    use_random_deletion=False,
    use_paraphrasing=False
    )

synonym_dataset = synonym_augmentor.augment_dataset(
    dataset=HateXplain_processed_dataset,
    target_groups=target_groups_to_augment,
    texts_to_augment=texts_to_augment,
    indices_to_augment=indices_to_augment
)
augmented_synonym = Dataset.from_dict(synonym_dataset['train'])
tokenized_augmented_synonym_train = augmented_synonym.map(
    lambda x: model_HateXplain_synonym.tokenize_function(x, "text"),
    batched=True
)
dataloader_augmented_synonym_train = DataLoader(
    tokenized_augmented_synonym_train,
    batch_size=16,
    shuffle=True,
    collate_fn=custom_collate_fn
)

In [None]:
# train and evaluate the model
num_epochs = 3
num_training_steps = len(dataloader_augmented_synonym_train)

optimizer_synonym = AdamW(model_BERT_synonym.parameters(), lr=5e-5)

lr_scheduler_synonym = get_scheduler(
    name="linear", optimizer=optimizer_synonym, num_warmup_steps=1, num_training_steps=num_training_steps
)

train_cycle(
      model_HateXplain_synonym,
      optimizer_synonym,
      lr_scheduler_synonym,
      dataloader_augmented_synonym_train,
      test_dataloader,
      n_epochs=num_epochs,
      num_training_steps=num_training_steps
)

### Train and eval with paraphrase

In [None]:
# 4. Paraphrasing only

model_HateXplain_paraphrase = HateXplainModel(
    model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain",
    num_labels=3,
    device=None,
    max_length=512,
    f1=f1,
    acc=acc
)

model_BERT_paraphrase = model_HateXplain_paraphrase.model

print("\nGenerating paraphrasing dataset...")
paraphrase_augmentor = DataAugmentor(
    use_back_translation=False,
    use_synonym_replacement=False,
    use_random_deletion=False,
    use_paraphrasing=True)

paraphrase_dataset = paraphrase_augmentor.augment_dataset(
    dataset=HateXplain_processed_dataset,
    target_groups=target_groups_to_augment,
    texts_to_augment=texts_to_augment,
    indices_to_augment=indices_to_augment
)
augmented_paraphrase = Dataset.from_dict(paraphrase_dataset['train'])
tokenized_augmented_paraphrase_train = augmented_paraphrase.map(
    lambda x: model_HateXplain_paraphrase.tokenize_function(x, "text"),
    batched=True
)
dataloader_augmented_paraphrase_train = DataLoader(
    tokenized_augmented_paraphrase_train,
    batch_size=16,
    shuffle=True,
    collate_fn=custom_collate_fn
)

In [None]:
# train and evaluate the model
num_epochs = 3
num_training_steps = len(dataloader_augmented_paraphrase_train)

optimizer_paraphrase = AdamW(model_BERT_paraphrase.parameters(), lr=5e-5)

lr_scheduler_paraphrase = get_scheduler(
    name="linear", optimizer=optimizer_paraphrase, num_warmup_steps=1, num_training_steps=num_training_steps
)

train_cycle(
      model_HateXplain_paraphrase,
      optimizer_paraphrase,
      lr_scheduler_paraphrase,
      dataloader_augmented_paraphrase_train,
      test_dataloader,
      n_epochs=num_epochs,
      num_training_steps=num_training_steps
)

### Train and eval with all combined

In [None]:
# 5. All techniques combined

model_HateXplain_combined = HateXplainModel(
    model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain",
    num_labels=3,
    device=None,
    max_length=512,
    f1=f1,
    acc=acc
)

model_BERT_combined = model_HateXplain_combined.model

print("\nGenerating combined augmentation dataset...")
combined_augmentor = DataAugmentor(
    use_back_translation=True,
    use_synonym_replacement=True,
    use_random_deletion=True,
    use_paraphrasing=True
)

combined_dataset = combined_augmentor.augment_dataset(
    dataset=HateXplain_dataset,
    target_groups=target_groups_to_augment,
    texts_to_augment=texts_to_augment,
    indices_to_augment=indices_to_augment
)
augmented_combined = Dataset.from_dict(combined_dataset['train'])
tokenized_augmented_combined_train = augmented_combined.map(
    lambda x: model_HateXplain_combined.tokenize_function(x, "text"),
    batched=True
)
dataloader_augmented_combined_train = DataLoader(
    tokenized_augmented_combined_train,
    batch_size=16,
    shuffle=True,
    collate_fn=custom_collate_fn
)

In [None]:
# train and evaluate the model
num_epochs = 3
num_training_steps = len(dataloader_augmented_combined_train)

optimizer_combined = AdamW(model_BERT_combined.parameters(), lr=5e-5)

lr_scheduler_combined = get_scheduler(
    name="linear", optimizer=optimizer_combined, num_warmup_steps=1, num_training_steps=num_training_steps
)

train_cycle(
      model_HateXplain_combined,
      optimizer_combined,
      lr_scheduler_combined,
      dataloader_augmented_combined_train,
      test_dataloader,
      n_epochs=num_epochs,
      num_training_steps=num_training_steps
)