In [1]:
# IMPORTS 
import os
import numpy as np
import random as rn
import pandas as pd
from hate_bert_helper import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# PARAMETERS SETUP

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(torch.cuda.get_device_name(0))

rn.seed(501)
np.random.seed(501)
torch.manual_seed(501)
torch.cuda.manual_seed(501)


MAX_LEN = 128 # max lengrh of a sentence, fed into the network
hatebert_model_path = "./models/hate_bert"

NVIDIA GeForce GTX 980 Ti


# Experiments setup

In [3]:
dataset_definitions = [
    #TOXIC
    [(1, "toxic", "other"),
    (1, "severe_toxic", "other")],
    #HATE
    [(1, "identity_hate", "other"),
    (2, "hate_speech", "neither"),
    (3, "hate", "none"),
    (7, "hateful", "normal"),
    (10, "hateful", "none"),
    (11, "hateful", "none"),
    (12, "hate", "noHate"),
    (16, "hateful", "non-hateful"),
    (18, "hateful", "normal"),
    (21, "hatespeech", "normal"),
    (25, "hate", "nothate")],
    #ABUSIVE
    [(7, "abusive", "normal"),
    (18, "abusive", "normal")],
    #AGGRESSIVE
    [(17, "covertly-aggressive", "non-aggressive"),
    (17, "overtly-aggressive", "non-aggressive")],
    #OFFENSIVE
    [(2, "offensive_language", "neither"),
    (3, "offensive", "none"),
    (15, "offensive", "non-offensive"),
    (21, "offensive", "normal")],
    #SEXISM
    [(4, "sexism", "none"),
    (9, "sexist", "none"),
    (29, "sexism", "none"),
    (30, "sexism", "neither")],
    #CYBERBULLYING
    [(6, "cyberbullying", "none"),
    (28, "cyberbullying", "none")],
    #SPAM
    [(7, "spam", "normal"),
    (18, "spam", "normal")],
    #RELIGIOUS
    [(9, "religious", "none")],
    #HARRASMENT
    [(19, "harrasment", "non-harrasment")],
    #OBSCENE
    [(1, "obscene", "other")],
    #INSULT
    [(1, "insult", "other")],
    #HOMOPHOBIA
    [(9, "homophobic", "none")],
    #RACIST
    [(9, "racist", "none")],
    #VULGAR
    [(27, "vulgar", "non-vulgar")],
    #THREAT
    [(1, "threat", "other")],
    #PROFANE
    [(3, "profane", "none")]
]

In [4]:
DATASETS_PATH = "/home/slavkoz/Projects/ResearchProjects/offensive-language-organization/full_classification_dataset.csv"
datasets = pd.read_csv(DATASETS_PATH)
datasets

Unnamed: 0,corpus_id,text,label
0,30,Cisco had to deal with a fat cash payout to th...,neither
1,30,"@MadamPlumpette I'm decent at editing, no worr...",neither
2,30,@girlziplocked will read. gotta go afk for a b...,neither
3,30,guys. show me the data. show me your github. t...,neither
4,30,@tpw_rules nothings broken. I was just driving...,neither
...,...,...,...
869279,30,via @weaselzippers: Feminazi Blog Reminds Libe...,sexism
869280,30,I used to have pet bunnies. :) I named them PO...,neither
869281,30,@alex SO GROSS. feeling the urge to shower in ...,neither
869282,30,Purpose of this group is to share the types of...,neither


In [5]:
for dataset_definition_group in dataset_definitions:
    for definition in dataset_definition_group: 
        corpus_id = definition[0]
        pos_label = definition[1]
        neg_label = definition[2]
        dataset = extract_dataset(datasets, corpus_id, pos_label, neg_label)
        print(f"Training corpus {corpus_id}-{pos_label} ...", end = "")
        model_path = "./models/finetuned_model_"+str(corpus_id)+"_"+str(pos_label)
        if os.path.exists(model_path):
            print(" ... model already exists!")
        else:
            # Training model
            train_and_save(device, 
                       MAX_LEN, 
                       hatebert_model_path, 
                       model_path, 
                       dataset["text"].values, 
                       dataset["label"].values, 
                       pos_label)

Training corpus 1-toxic ... ... model already exists!
Training corpus 1-severe_toxic ... ... model already exists!
Training corpus 1-identity_hate ... ... model already exists!
Training corpus 2-hate_speech ... ... model already exists!
Training corpus 3-hate ... ... model already exists!
Training corpus 7-hateful ... ... model already exists!
Training corpus 10-hateful ... ... model already exists!
Training corpus 11-hateful ... ... model already exists!
Training corpus 12-hate ... ... model already exists!
Training corpus 16-hateful ... ... model already exists!
Training corpus 18-hateful ... ... model already exists!
Training corpus 21-hatespeech ... ... model already exists!
Training corpus 25-hate ... ... model already exists!
Training corpus 7-abusive ... ... model already exists!
Training corpus 18-abusive ... ... model already exists!
Training corpus 17-covertly-aggressive ... ... model already exists!
Training corpus 17-overtly-aggressive ... ... model already exists!
Training

In [6]:
#%%capture std_output_single_domain --no-stderr
# Check within single domains

#for dataset_definitions_list in dataset_definitions[0:8]:
#    run_pairwise_analysis(device, MAX_LEN, datasets, dataset_definitions_list)
            
#with open('std_output_single_domain.txt', 'w') as out:
#   out.write(std_output_single_domain.stdout)

In [None]:
#%%capture std_output_across_domain
sys.stdout = open("std_output_across_domain.txt", "w")
# Check across domains

print("Evaluation: \n")
sys.stdout.flush()

dataset_definitions_list = [
    #TOXIC
    (1, "severe_toxic", "other"),
    #HATE
    (2, "hate_speech", "neither"),
    #ABUSIVE
    (7, "abusive", "normal"),
    #AGGRESSIVE
    (17, "covertly-aggressive", "non-aggressive"),
    #OFFENSIVE
    (15, "offensive", "non-offensive"),
    #SEXISM
    (29, "sexism", "none"),
    #CYBERBULLYING
    (6, "cyberbullying", "none"),
    #SPAM
    (18, "spam", "normal"),
    #RELIGIOUS
    (9, "religious", "none"),
    #HARRASMENT
    (19, "harrasment", "non-harrasment"),
    #OBSCENE
    (1, "obscene", "other"),
    #INSULT
    (1, "insult", "other"),
    #HOMOPHOBIA
    (9, "homophobic", "none"),
    #RACIST
    (9, "racist", "none"),
    #VULGAR
    (27, "vulgar", "non-vulgar"),
    #THREAT
    (1, "threat", "other"),
    #PROFANE
    (3, "profane", "none")
]

run_pairwise_analysis(device, MAX_LEN, datasets, dataset_definitions_list)

sys.stdout.close()
            
#with open('std_output_across_domain.txt', 'w') as out:
#   out.write(std_output_across_domain.stdout)
#with open('std_output_across_domain.err', 'w') as out:
#   out.write(std_output_across_domain.stderr)