In [None]:
# This code was originally obtained from https://github.com/paul-rottger/hatecheck-experiments and has been modified.

In [None]:
import pickle
import re
import string
from html import unescape
from pathlib import Path

import pandas as pd
import wordsegment as ws
from sklearn.model_selection import train_test_split
#import preprocessor
#import emoji

pd.set_option('display.max_colwidth', None)
ws.load() # load vocab for word segmentation

# Notebook-level seed constant.
# NOTE(review): no cell visible here reads this value (the Davidson
# train_test_split calls pass random_state=123) - confirm whether it is still needed.
random_seed = 42

In [None]:
def regex_match_segmentation(match):
    """Return the matched text split into words joined by single spaces.

    Intended as a replacement callback for ``re.sub``: the whole match is
    handed to ``ws.segment`` and the resulting pieces are space-joined.
    """
    matched_text = match.group(0)
    words = ws.segment(matched_text)
    return ' '.join(words)
# Define function for cleaning text
def clean_text(text):
    """Normalise one raw text string.

    Steps, in order:
      1. decode HTML entities;
      2. lowercase;
      3. replace ``@name`` and ``u/name`` mentions with ``[USER]`` and
         ``http...`` URLs with ``[URL]`` (emojis are NOT handled here);
      4. split ``#hashtags`` into words via ``regex_match_segmentation``;
      5. strip leading ``!`` characters, then leading ``:`` characters;
      6. replace newlines, tabs and ``[linebreak]`` markers with spaces.

    Args:
        text (str): the raw text.

    Returns:
        str: the cleaned text.
    """
    # convert HTML codes
    text = unescape(text)

    # lowercase text (done before the upper-case special tokens are inserted)
    text = text.lower()

    # replace mentions and URLs with special tokens
    text = re.sub(r"@[A-Za-z0-9_-]+", '[USER]', text)
    # \b stops the "u/name" pattern from firing inside ordinary words;
    # without it "you/me" was rewritten to "yo[USER]"
    text = re.sub(r"\bu/[A-Za-z0-9_-]+", '[USER]', text)
    text = re.sub(r"http\S+", '[URL]', text)

    # find and split hashtags into words
    text = re.sub(r"#[A-Za-z0-9]+", regex_match_segmentation, text)

    # remove punctuation at beginning of string (quirk in Davidson data);
    # kept as two sequential strips: first all leading "!", then all leading ":"
    text = text.lstrip("!")
    text = text.lstrip(":")

    # remove newline and tab characters
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('[linebreak]', ' ')

    return text

## CAD

In [None]:
# Load the three CAD splits (tab-separated files).
cad_train = pd.read_csv("../cad_naacl2021/data/cad_v1_1_train.tsv", sep="\t")
cad_valid = pd.read_csv("../cad_naacl2021/data/cad_v1_1_dev.tsv", sep="\t")
cad_test = pd.read_csv("../cad_naacl2021/data/cad_v1_1_test.tsv", sep="\t")

dfs = [cad_train, cad_valid, cad_test]

for dd in dfs:
    dd['text'] = dd['text'].astype(str).apply(clean_text)
    # 'labels' is a comma-separated string; turn it into a list of label names
    dd['labels'] = dd['labels'].apply(lambda x: x.split(','))

# Independent copies per task so each can carry its own binary 'label' column.
cad_train_hateful = cad_train.copy()
cad_valid_hateful = cad_valid.copy()
cad_test_hateful = cad_test.copy()

cad_train_abusive = cad_train.copy()
cad_valid_abusive = cad_valid.copy()
cad_test_abusive = cad_test.copy()

hate_dfs = [cad_train_hateful, cad_valid_hateful, cad_test_hateful]
abuse_dfs = [cad_train_abusive, cad_valid_abusive, cad_test_abusive]

# Hate task: positive only when 'IdentityDirectedAbuse' is among the labels.
for dd in hate_dfs:
    dd['label'] = dd['labels'].apply(lambda x: 1 if 'IdentityDirectedAbuse' in x else 0)
    # Drop in place: `dd = dd.drop(...)` would only rebind the loop variable
    # and leave the frame stored in the list (and later written to CSV) untouched.
    dd.drop(columns=['labels'], inplace=True)

# Abuse task: negative when 'Neutral' or 'CounterSpeech' is among the labels, else positive.
for dd in abuse_dfs:
    dd['label'] = dd['labels'].apply(lambda x: 0 if ('Neutral' in x) or ('CounterSpeech' in x) else 1)
    dd.drop(columns=['labels'], inplace=True)

In [None]:
# Rows per binary label in the CAD abuse training split.
cad_train_abusive.groupby('label').size().to_frame('counts').reset_index()

In [None]:
# Rows per binary label in the CAD hate training split.
cad_train_hateful.groupby('label').size().to_frame('counts').reset_index()

In [None]:
# Write every CAD split to disk. The target folder is created first because
# DataFrame.to_csv raises when asked to write into a directory that does not exist.
cad_outputs = {
    'Data/CAD_abuse/train.csv': cad_train_abusive,
    'Data/CAD_abuse/valid.csv': cad_valid_abusive,
    'Data/CAD_abuse/test.csv': cad_test_abusive,
    'Data/CAD_hate/train.csv': cad_train_hateful,
    'Data/CAD_hate/valid.csv': cad_valid_hateful,
    'Data/CAD_hate/test.csv': cad_test_hateful,
}

for out_path, frame in cad_outputs.items():
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(out_path)

## Founta

For this dataset, we've already created a train-valid-test split while prepping the data for ILM. 

In [None]:
# Load the existing Founta train/valid/test splits (first CSV column is the index).
founta_train = pd.read_csv("Data/Founta/train.csv", index_col=0)
founta_valid = pd.read_csv("Data/Founta/valid.csv", index_col=0)
founta_test = pd.read_csv("Data/Founta/test.csv", index_col=0)

dfs = [founta_train, founta_valid, founta_test]

for dd in dfs:
    # remove rows labelled 'spam' (dropped by index label)
    dd.drop(dd[dd.label == 'spam'].index, inplace=True)
    dd['text'] = dd['text'].astype(str).apply(clean_text)
    # Drop in place: `dd = dd.drop(...)` would only rebind the loop variable
    # and leave the frame stored in the list untouched.
    dd.drop(columns=['count_label_votes'], inplace=True)

# Independent copies per task so each can carry its own binary label mapping.
founta_train_abusive = founta_train.copy()
founta_valid_abusive = founta_valid.copy()
founta_test_abusive = founta_test.copy()

founta_train_hateful = founta_train.copy()
founta_valid_hateful = founta_valid.copy()
founta_test_hateful = founta_test.copy()

abuse_dfs = [founta_train_abusive, founta_valid_abusive, founta_test_abusive]
hate_dfs = [founta_train_hateful, founta_valid_hateful, founta_test_hateful]

# Assign the replaced column back instead of calling replace(inplace=True) on
# `dd.label`: the chained in-place form acts on a temporary Series and does not
# update the frame under pandas Copy-on-Write.
for dd in abuse_dfs:
    dd['label'] = dd['label'].replace({'hateful': 1, "abusive": 1, "normal": 0})

for dd in hate_dfs:
    dd['label'] = dd['label'].replace({'hateful': 1, "abusive": 0, "normal": 0})


In [None]:
# Rows per binary label in the Founta abuse training split.
founta_train_abusive.groupby('label').size().to_frame('counts').reset_index()

In [None]:
# Rows per binary label in the Founta hate training split.
founta_train_hateful.groupby('label').size().to_frame('counts').reset_index()

In [None]:
# Write every Founta split to disk. The target folder is created first because
# DataFrame.to_csv raises when asked to write into a directory that does not exist.
founta_outputs = {
    'Data/Founta_abuse/train.csv': founta_train_abusive,
    'Data/Founta_abuse/valid.csv': founta_valid_abusive,
    'Data/Founta_abuse/test.csv': founta_test_abusive,
    'Data/Founta_hate/train.csv': founta_train_hateful,
    'Data/Founta_hate/valid.csv': founta_valid_hateful,
    'Data/Founta_hate/test.csv': founta_test_hateful,
}

for out_path, frame in founta_outputs.items():
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(out_path)

## Davidson

In [None]:
# Load the Davidson data and normalise column names to 'text' / 'label'.
davidson2017 = pd.read_csv('./Data/davidson2017.csv', index_col=0)
davidson2017 = davidson2017.rename(columns={"class": "label", "tweet": "text"}, errors='ignore')
# .copy() so the column assignments below act on an independent frame, not a slice
davidson2017 = davidson2017[['text', 'label']].copy()
davidson2017['text'] = davidson2017['text'].astype(str).apply(clean_text)
# Assign the replaced column back instead of calling replace(inplace=True) on
# `davidson2017.label`: the chained in-place form acts on a temporary Series and
# does not update the frame under pandas Copy-on-Write, which would leave the
# numeric codes in place and make the string-keyed mappings below match nothing.
davidson2017['label'] = davidson2017['label'].replace({0: "hateful", 1: "offensive", 2: "neither"})

# 80/10/10 train/valid/test split, stratified on the three-way label.
# random_state=123 is kept unchanged so the splits stay identical.
davidson_train, davidson_valtest = train_test_split(davidson2017,
                                                    test_size=0.2,
                                                    stratify=davidson2017.label,
                                                    random_state=123)
davidson_valid, davidson_test = train_test_split(davidson_valtest,
                                                 test_size=0.5,
                                                 stratify=davidson_valtest.label,
                                                 random_state=123)

# Independent copies per task so each can carry its own binary label mapping.
d_train_offense = davidson_train.copy()
d_valid_offense = davidson_valid.copy()
d_test_offense = davidson_test.copy()

d_train_hate = davidson_train.copy()
d_valid_hate = davidson_valid.copy()
d_test_hate = davidson_test.copy()

dfs_offense = [d_train_offense, d_valid_offense, d_test_offense]
dfs_hate = [d_train_hate, d_valid_hate, d_test_hate]

# Offense task: 'hateful' and 'offensive' are positive.
for dd in dfs_offense:
    dd['label'] = dd['label'].replace({'hateful': 1, 'offensive': 1, 'neither': 0})

# Hate task: only 'hateful' is positive.
for dd in dfs_hate:
    dd['label'] = dd['label'].replace({'hateful': 1, 'offensive': 0, 'neither': 0})

In [None]:
# Rows per binary label in the Davidson offense training split.
d_train_offense.groupby('label').size().to_frame('counts').reset_index()

In [None]:
# Rows per binary label in the Davidson hate training split.
d_train_hate.groupby('label').size().to_frame('counts').reset_index()

In [None]:
# Write every Davidson split to disk. The target folder is created first because
# DataFrame.to_csv raises when asked to write into a directory that does not exist.
davidson_outputs = {
    "Data/Davidson_abuse/train.csv": d_train_offense,
    "Data/Davidson_abuse/valid.csv": d_valid_offense,
    "Data/Davidson_abuse/test.csv": d_test_offense,
    "Data/Davidson_hate/train.csv": d_train_hate,
    "Data/Davidson_hate/valid.csv": d_valid_hate,
    "Data/Davidson_hate/test.csv": d_test_hate,
}

for out_path, frame in davidson_outputs.items():
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(out_path)