In [None]:
from detect_adversarial import *
from splitter import *
from viper import *
import pandas as pd
from tqdm import tqdm
from nltk.metrics import edit_distance

# target model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# substitute model
from substitute import *

In [None]:
# Target classifier: tokenizer and sequence-classification weights are loaded
# from the same checkpoint id, so the id is spelled out once.
# Alternative checkpoint (currently disabled): "Hate-speech-CNERG/dehatebert-mono-english"
TARGET_MODEL_NAME = "Hate-speech-CNERG/english-abusive-MuRIL"

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(TARGET_MODEL_NAME)

In [None]:
# Substitute model; it is handed to substitute_split() when candidate
# replacements are scored during deobfuscation.
SUBSTITUTION_MODEL_KIND = "robust"
subst_model = load_substitution_model(SUBSTITUTION_MODEL_KIND)

In [None]:
def deobfuscate_adversarial_example(adv):
    """Rebuild a cleaned-up version of an adversarial text, token by token.

    The text is split with ``split_text_to_strings``. A token accepted by
    ``is_valid_word_light`` is kept verbatim. Any other token is broken up with
    ``split_string_to_substrings``; for every split, ``substitute_split``
    (using the module-level ``subst_model``) proposes ``(substitution, score)``
    pairs, and the replacement is chosen as follows:

    * the highest-scoring substitution accepted by ``is_word_exists``
      (the earliest one wins on ties);
    * otherwise the highest-scoring substitution overall, even though it is
      not a known word;
    * otherwise (no candidates at all) the original token.

    Args:
        adv: The adversarial text to clean.

    Returns:
        The chosen tokens concatenated with no separator; any spacing in the
        result has to come from the tokens themselves.
    """
    rebuilt = []

    for token in tqdm(split_text_to_strings(adv), desc="Deobfuscating words"):
        # Tokens that already look valid are passed through untouched.
        if is_valid_word_light(token):
            rebuilt.append(token)
            continue

        # (score, substitution) pairs whose substitution is a known word.
        valid_candidates = []
        # Best-scoring candidate seen so far, valid or not; starts as the token itself.
        fallback, fallback_score = token, float('-inf')

        for piece in split_string_to_substrings(token):
            for candidate, score in (substitute_split(piece, subst_model) or ()):
                if score > fallback_score:
                    fallback, fallback_score = candidate, score
                if is_word_exists(candidate):
                    valid_candidates.append((score, candidate))

        if valid_candidates:
            # max() keeps the first maximal entry, so ties go to the earliest candidate.
            _, chosen = max(valid_candidates, key=lambda pair: pair[0])
        else:
            # No valid clean word found: use the most likely candidate,
            # which is not necessarily a valid word.
            chosen = fallback
        rebuilt.append(chosen)

    # Glue the cleaned tokens back into a single string.
    return ''.join(rebuilt)

In [None]:
# Dataset CSV, given relative to the notebook's working directory.
file_path = "data/HateSpeechDatasetBalanced_filtered.csv"
# Later cells read the "Content" and "Label" columns from this frame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Draw a single row at random (no seed is set, so each run picks a different row)
# and pull out its text and label.
random_example = df.sample(n=1)
content, label = (random_example[column].iloc[0] for column in ("Content", "Label"))
print(content)

# Perturb the text with viper; 0.1 is viper's second argument
# (presumably the perturbation probability - confirm against viper's definition).
adversarial = viper(content, 0.1)
print(adversarial)

# Try to undo the perturbation.
clean = deobfuscate_adversarial_example(adversarial)
print(clean)

In [None]:
def predict_hateful(text):
    """Classify ``text`` with the target model and return the predicted class index.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to classify.

    Returns:
        int: Index of the highest logit. Elsewhere in this notebook ``1`` is
        treated as the "hateful" prediction and ``0`` as the benign one.
    """
    # Local import keeps this cell self-contained; torch is already required by
    # the "pt" tensors requested from the tokenizer.
    import torch

    # truncation=True clips inputs that exceed the tokenizer's configured maximum
    # length instead of letting an over-long sequence fail inside the model.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)

    # Pure inference: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded)

    return output.logits.argmax(dim=1).item()

In [None]:
# Full pipeline
def full_pipline(p, examples, file_path="data/HateSpeechDatasetBalanced_filtered.csv",
                 skip_indices=(16, 38)):
    """Run the attack -> classify -> deobfuscate -> classify evaluation and print a report.

    For each evaluated row the original text is perturbed with ``viper``,
    cleaned with ``deobfuscate_adversarial_example``, and all three variants
    (original, adversarial, cleaned) are classified with ``predict_hateful``.

    Args:
        p: Passed to ``viper`` as its second argument (presumably the
            perturbation probability - confirm against viper's definition).
        examples: Maximum number of rows to evaluate.
        file_path: CSV file with a ``Content`` (text) and a ``Label`` column.
        skip_indices: Row index values to leave out of the evaluation. The
            default keeps the two rows the notebook has always skipped.

    Returns:
        None. Results are printed: average edit distances, the number of
        adversarial examples (original predicted 1, adversarial predicted 0),
        how many of those the cleaned text restores to 1, and the accuracy of
        the model on each text variant.
    """
    df = pd.read_csv(file_path)
    skipped = set(skip_indices)

    # Counts of predictions that agree with the ground-truth label.
    orig_correct = 0
    adv_correct = 0
    clean_correct = 0

    n_examples = 0
    total_orig_adv_dist = 0
    total_orig_clean_dist = 0
    adversarials = 0
    success = 0
    for i, row in df.iterrows():
        if i in skipped:
            continue
        if n_examples == examples:
            break
        print(i)
        n_examples += 1
        orig = row["Content"]
        # Assumes Label uses the same 0/1 encoding as the model's class indices
        # (1 = hateful) - verify against the dataset.
        label = int(row["Label"])

        adv = viper(orig, p)
        total_orig_adv_dist += edit_distance(orig, adv)

        # Accuracy means agreement with the label; summing the raw predictions
        # would only measure how often the model outputs class 1.
        orig_pred = predict_hateful(orig)
        orig_correct += int(orig_pred == label)
        adv_pred = predict_hateful(adv)
        adv_correct += int(adv_pred == label)

        clean = deobfuscate_adversarial_example(adv)
        total_orig_clean_dist += edit_distance(orig, clean)
        clean_pred = predict_hateful(clean)
        clean_correct += int(clean_pred == label)

        # The attack worked if it flipped a "hateful" prediction to "not hateful".
        if orig_pred == 1 and adv_pred == 0:
            adversarials += 1
            # The defence worked if the cleaned text is flagged again.
            if clean_pred == 1:
                success += 1

                print(f"Original:    {orig}")
                print(f"Adversarial: {adv}")
                print(f"Clean:       {clean}")

    if n_examples == 0:
        # Nothing was evaluated (examples == 0, empty file, or every row skipped);
        # the averages below would otherwise divide by zero.
        print("No examples were evaluated; nothing to report.")
        return

    avrg_orig_adv_dist = total_orig_adv_dist / n_examples
    avrg_orig_clean_dist = total_orig_clean_dist / n_examples
    # Positive value: on average, cleaning moved the text closer to the original.
    cleaning_average_distance = avrg_orig_adv_dist - avrg_orig_clean_dist

    print(f"Average original-adversarial edit distance: {avrg_orig_adv_dist}")
    print(f"Average original-clean edit distance: {avrg_orig_clean_dist}")
    print(f"Cleaning average distance: {cleaning_average_distance}")

    print(f"Adversarial examples generated: {adversarials}")
    print(f"Successful deobfuscations: {success}")

    print(f"Model original accuracy: {orig_correct / n_examples}")
    print(f"Model adversarial accuracy: {adv_correct / n_examples}")
    print(f"Model clean accuracy: {clean_correct / n_examples}")


In [None]:
# Evaluate the full pipeline on up to 100 rows with p = 0.2.
full_pipline(p=0.2, examples=100)