In [11]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [12]:
df_dict = dict()

PATH = "../0_data/main/0_raw"

for file in sorted(os.listdir(PATH)):
    if "ipynb" not in file:
        print(re.sub('\.csv$', '', file))
        df_dict[re.sub('\.csv$', '', file)] = pd.read_csv(f"{PATH}/{file}")

  print(re.sub('\.csv$', '', file))
  df_dict[re.sub('\.csv$', '', file)] = pd.read_csv(f"{PATH}/{file}")


bas19_es
dyn21_en
for19_pt
fou18_en
has19_hi
has20_hi
has21_hi
ken20_en
ous19_ar
ous19_fr
san20_it


## Reformat columns
Need separate logic for different datasets. 1 is for hateful, 0 for non-hateful.

In [13]:
# Dynabench 2021 / English
df_dict["dyn21_en"].label.replace({"hate":1, "nothate":0}, inplace=True)

# Founta 2018 / English
df_dict["fou18_en"].label.replace({'hateful': 1, "abusive": 0, "normal": 0, "spam": 0}, inplace = True)

# Kennedy 2020 / English
df_dict["ken20_en"].rename(columns={"label_hate_maj": "label"}, inplace=True)

# Fortuna 2019 / Portuguese
df_dict["for19_pt"].rename(columns={"hatespeech_comb": "label"}, inplace=True)

# Basile 2019 / Spanish
df_dict["bas19_es"].rename(columns={"HS": "label"}, inplace=True)

# Sanguinetti 2020 / Italian
df_dict["san20_it"].rename(columns={"hs": "label"}, inplace=True)

# Ousidhoum 2019 / Arabic & French
for d in ["ous19_ar", "ous19_fr"]:
    df_dict[d]["label"] = df_dict[d].sentiment.apply(lambda x: 1 if "hateful" in x else 0)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    df_dict[d]["text"] = df_dict[d].tweet.apply(lambda x: x.replace("@url", "http"))
    
# HASOC 19, 20 and 21 / Hindi
for d in ["has19_hi", "has20_hi", "has21_hi"]:
    df_dict[d]["label"] = df_dict[d].task_2.apply(lambda x: 1 if x=="HATE" else 0)
    
# drop redundant columns
for dataset in df_dict:
    if "split" in df_dict[dataset].columns:
        df_dict[dataset] = df_dict[dataset][["text", "label", "split"]]
    else:
        df_dict[dataset] = df_dict[dataset][["text", "label"]]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dict["dyn21_en"].label.replace({"hate":1, "nothate":0}, inplace=True)
  df_dict["dyn21_en"].label.replace({"hate":1, "nothate":0}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dict["fou18_en"].label.replace({'hateful': 1, "abusive": 0, "normal": 0, "spam": 

## Clean text

In [14]:
def clean_text(text):
    text = unescape(text)
    text = re.sub(r"@[A-Za-z0-9_-]+",'@user',text) # format expected by XLM-T
    text = re.sub(r"http\S+",'http',text) # format expected by XLM-T
    text = re.sub(r"\n",' ',text)
    text = re.sub(r"\t",' ',text)
    text = text.replace("[URL]", "http") # format expected by XLM-T
    text = text.strip()
    return text

for dataset in df_dict:
    df_dict[dataset]["text"] = df_dict[dataset].text.apply(lambda x: clean_text(x))

## Boost proportion of hate in English datasets to match Dynabench
We are doing a first phase of fine-tuning on up to 20k entries in Dynabench. Dynabench has ca. 53% hate. The other English datasets have a lower proportion of hate so we drop non-hate to make the proportion more comparable at 20k entries.

In [15]:
# boost Kennedy 2020 / English to have 50% hate (up from ca. 30%)
df_dict["ken20_en"] = pd.concat([df_dict["ken20_en"][df_dict["ken20_en"].label==1], df_dict["ken20_en"][df_dict["ken20_en"].label==0].sample(11596, random_state=123)]).sample(frac=1, random_state=123)

# boost Founta 2018 / English to have 22% hate, which is max possible (up from ca. 5%)
df_dict["fou18_en"] = pd.concat([df_dict["fou18_en"][df_dict["fou18_en"].label==1], df_dict["fou18_en"][df_dict["fou18_en"].label==0].sample(17600, random_state=123)]).sample(frac=1, random_state=123)


## Show descriptive stats

In [16]:
def descriptive_stats(df):
    n_total = df.shape[0]
    n_hate = df.label.sum()
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_total, n_hate, n_hate/n_total))
    return df.label.sum()/len(df), len(df)

for dataset in df_dict:
    print(dataset.upper())
    descriptive_stats(df_dict[dataset])
    display(df_dict[dataset].sample(5))
    print()

BAS19_ES
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
2438,@user Me flipaüòç Me tienes que ense√±ar a hacer pelo me cago en la puta JAJAJA,0
4938,¬øQue harian los hombres sin las mujeres? Pues domesticar otro animal y esta vez no ense√±arle a hablar.,1
295,"Que no es presunto üêç @user que ESCORIAüòà es CULPABLE del ASESINATO,DESAPARICION,SECUESTRO Y VIOLACI√ìN de #DianaQuer angelico üòìüå∑üíïüêæüåõ D.E.P que este ESCORIAüòàq su trastorno solo es #HijoPutismoAgudo va a pagarlo üòàüòà #TodasSomosDiana üíïüêæüåõ",0
2765,Hasta ahora me vi el cap√≠tulo de #TWD y solo tengo que decir que Eugene es un hijo de puta que merec√≠a morir en lugar de Carl. Ah y qu√© adoro a Maggie obviamente no m√°s que a Carol.,0
3865,El curioso caso del hombre que solo ve machismo si el acusado es un √°rabe.,0



DYN21_EN
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label,split
22247,"You see, children imitate society and we live in a time where falsehoods are embraced in the name of pushing a narrative. Equality of opportunity? Where?",0,test
22568,All women and men and others are looking for is someone to provide for us.,0,dev
18435,who is truly suffering? Those of you who need to be lead by leaders or those who are waiting for the next hand out jerked off expired food ?,0,train
8394,I am a big fat curry muncher and would eat it over anything!,0,train
8332,Today I feel like I have to mention how muslims are influencing the level of wonderful things in the world. By that I mean negatively of course since muslims are basically just pigs to lookat,1,train



FOR19_PT
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
1087,Desconfio que a NASA s√≥ n√£o anunciou ter descoberto vida num dos 7 planetas com receio que o Trump mandasse construir outro muro...,0
4421,"RT @user: CALMA CACHORRO, ELA AINDA N√ÉO VIROU RA√á√ÉO #PAZ http",1
260,@user Pra ser mulher hoje em dia e aguentar o movimento feminista...tem que ser muito macho. kkkkk,1
5661,@user T√¥ esperando a sapat√£o fazer,1
4230,RT @user: o cara entrevistou o @user ficou igual uma putinha concordando e agora lan√ßa essa legenda canalha. Que arrombado _,1



FOU18_EN
22565 entries, of which 4965 (22.00%) are hateful.


Unnamed: 0,text,label
10688,"#cnageeks AMERICAN GODS Character Promos Spotlight Shadow & Laura Moon, Mr. Wednesday, And Mad Sweeney http",0
16935,RT @user: Omg im so ugly April fools bitch you thought,0
38733,One (1) Gen Ad ticket for day 1 is also another option please DM/@ me if you have! Thank you! #BTSWingsTourManila http,1
20143,RT @user: Look at this fucking asshole: http http,0
64597,@user U know the episode when Jan invites him and Pam to a DINNER PARTY that is my fucking life rn pulse‚Ä¶ http,0



HAS19_HI
5983 entries, of which 746 (12.47%) are hateful.


Unnamed: 0,text,label,split
5021,"‡§Ö‡§¨ ‡§ï‡§π‡§æ‡§Å ‡§Æ‡§∞ ‡§ó‡§Ø‡§æ...‡§∂‡§æ‡§Ç‡§§‡§ø‡§¶‡•Ç‡§§ ‡§¨‡•ã‡§≤‡§®‡•á ‡§µ‡§æ‡§≤‡§æ ‡§ó‡•à‡§Ç‡§ó...‡§Ø‡§π ‡§ú‡§º‡§Æ‡§æ‡§§ ‡§ï‡§≠‡•Ä ‡§®‡§π‡•Ä ‡§π‡•ã ‡§∏‡§ï‡§§‡•Ä ‡§π‡•à ‡§≠‡§æ‡§∞‡§§ ‡§¶‡•á‡§∂ ‡§ï‡•Ä, ‡§®‡§æ ‡§π‡•ã‡§ó‡•Ä‡•§ ‡§Ø‡§π ‡§ï‡§ü‡•Å ‡§∏‡§ü‡•Ä‡§ï ‡§∏‡§ö‡•ç‡§ö‡§æ‡§à ‡§π‡•à‡•§ ‡§´‡§ø‡§∞ ‡§≠‡•Ä ‡§Ø‡§π ‡§∏‡•Å‡§Ö‡§∞ ‡§ú‡§º‡§Æ‡§æ‡§§ ‡§´‡§≤ ‡§´‡•Ç‡§≤ ‡§∞‡§π‡•Ä ‡§π‡•à ‡§ï‡•ç‡§Ø‡•ã‡§Ç‡§ï‡§ø ‡§ó‡§Ç‡§¶‡§ó‡•Ä ‡§∞‡§æ‡§ú‡§®‡•á‡§§‡§æ‡§ì‡§Ç ‡§ï‡•Ä ‡§µ‡§ú‡§π ‡§∏‡•á ‡§´‡•à‡§≤ ‡§∞‡§π‡•Ä ‡§π‡•à‡•§ ‡§ú‡§ø‡§∏‡§ï‡•Ä ‡§¨‡§¶‡§¨‡•Ç ‡§∏‡•á ‡§π‡§Æ‡•á‡§∂‡§æ ‡§™‡§∞‡•ç‡§Ø‡§æ‡§µ‡§∞‡§£ ‡§ñ‡§º‡§∞‡§æ‡§¨ ‡§∞‡§π‡§§‡§æ ‡§π‡•à‡•§",0,test
638,‡§ê‡§∏‡§æ ‡§Ö‡§ó‡§∞ ‡§π‡§ø‡§Ç‡§¶‡•Å‡§ì‡§Ç ‡§®‡•á ‡§ï‡§ø‡§∏‡•Ä ‡§Æ‡§ï‡§¨‡§∞‡•á ‡§™‡§∞ ‡§ï‡§ø‡§Ø‡§æ ‡§π‡•ã‡§§‡§æ ‡§§‡•ã ‡§§‡•Ç ‡§π‡•Ä ‡§∞‡§Ç‡§°‡•Ä ‡§ú‡•ã‡§∞ ‡§ú‡•ã‡§∞ ‡§∏‡•á ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ‡§§‡•Ä ‡§Ü‡§∞ ‡§è‡§∏ ‡§è‡§∏ ‡§µ‡§æ‡§≤‡•ã‡§Ç ‡§®‡•á ‡§¨‡§ú‡§∞‡§Ç‡§ó ‡§¶‡§≤ ‡§µ‡§æ‡§≤‡•ã‡§Ç ‡§®‡•á ‡§ï‡§ø‡§Ø‡§æ ‡§î‡§∞ ‡§∏‡•Ç‡§Ö‡§∞ ‡§ï‡•Ä ‡§î‡§≤‡§æ‡§¶ ‡§¶‡•á‡§Ç ‡§§‡•Å‡§ù‡•á ‡§∂‡§∞‡§æ‡§∞‡§§‡•Ä ‡§§‡§§‡•ç‡§µ ‡§¶‡•á‡§ñ‡§®‡•á ‡§≤‡§ó‡•á ‡§π‡§∞‡§æ‡§Æ ‡§ï‡•Ä ‡§ú‡§®‡•Ä,0,train
2119,‡§ï‡§Æ‡§≤‡§®‡§æ‡§• ‡§®‡•á ‡§Æ‡§æ‡§®‡§æ~‡§ï‡§∞‡•ç‡§ú‡§Æ‡§æ‡§´‡•Ä ‡§ï‡•á ‡§®‡§æ‡§Æ ‡§™‡§∞ ‡§π‡•Å‡§Ü ‡§ò‡•ã‡§ü‡§æ‡§≤‡§æ ‡§¶‡•ã‡§∑‡§ø‡§Ø‡•ã‡§Ç ‡§™‡§∞ ‡§π‡•ã‡§ó‡•Ä ‡§ï‡§æ‡§∞‡•ç‡§∞‡§µ‡§æ‡§à ‡§ï‡§ø‡§∏‡§æ‡§®‡•ã‡§Ç ‡§ï‡•á ‡§®‡§æ‡§Æ ‡§™‡§∞ ‡§´‡§∞‡•ç‡§ú‡•Ä ‡§™‡•ç‡§∞‡§ï‡§∞‡§£ ‡§¨‡§®‡§æ‡§ï‡§∞ #‡§§‡•Ä‡§®_‡§π‡§ú‡§æ‡§∞_‡§ï‡§∞‡•ã‡§°‡§º_‡§∞‡•Å‡§™‡§è_‡§ï‡§æ_‡§ò‡•ã‡§ü‡§æ‡§≤‡§æ_‡§π‡•Å‡§Ü ‡§π‡•à #‡§ï‡§Æ‡•Ä‡§®‡•á ‡§â‡§≤‡•ç‡§≤‡•Ç ‡§Æ‡§§ ‡§¨‡§®‡§æ ‡§∏‡§¨ #‡§§‡•á‡§∞‡§æ_‡§π‡•Ä_‡§ï‡§ø‡§Ø‡§æ_‡§ß‡§∞‡§æ ‡§π‡•à ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§µ‡§æ‡§π‡•Ä ‡§ï‡•ç‡§Ø‡§æ ‡§ñ‡§æ‡§ï ‡§π‡•ã‡§ó‡•Ä ??? ‡§ú‡§¨ ‡§§‡•á‡§∞‡•Ä ‡§∏‡§∞‡§ï‡§æ‡§∞ ‡§π‡•Ä ‡§ñ‡§§‡§∞‡•á ‡§Æ‡•á‡§Ç ‡§π‡•à,0,train
3140,‡§Ø‡•á human rights ‡§î‡§∞ secularism ‡§ï‡•Ä ‡§®‡§æ‡§ú‡§æ‡§Ø‡§ú ‡§î‡§≤‡§æ‡§¶‡•á‡§Ç ‡§Ö‡§¨ ‡§≤‡§ó‡§§‡§æ ‡§π‡•à ‡§è‡§ï ‡§¶‡•Ç‡§∏‡§∞‡•á ‡§ï‡•Ä ‡§ó‡§æ‡§Ç‡§° ‡§Æ‡•á‡§Ç ‡§Æ‡•Å‡§Ç‡§π ‡§°‡§æ‡§≤ ‡§ï‡•á ‡§∏‡•ã ‡§ó‡§Ø‡•á ‡§π‡•à‡§Ç #TempleTerrorAttack,0,train
497,"‡§¨‡§π‡•Å‡§§ ‡§¨‡§π‡•Å‡§§ ‡§∂‡•Å‡§ï‡•ç‡§∞‡§ø‡§Ø‡§æ‡•§ ‡§Ø‡§π ‡§ñ‡•ç‡§Ø‡§æ‡§≤‡•á ‡§ñ‡§æ‡§Æ ‡§π‡•à,‡§ñ‡•Ç‡§¨‡§∏‡•Ç‡§∞‡§§‡•Ä ‡§ö‡•á‡§π‡§∞‡•á ‡§ï‡§æ ‡§®‡§æ‡§Æ ‡§π‡•à‡•§ ‡§ñ‡•ç‡§Ø‡§æ‡§≤ ‡§Ö‡§ö‡•ç‡§õ‡§æ ‡§π‡•ã ‡§§‡•ã,,‡§∏‡•Ä‡§∞‡§§ ‡§≠‡•Ä ‡§Ö‡§ö‡•ç‡§õ‡•Ä ‡§π‡•ã‡§§‡•Ä ‡§π‡•à‡•§ ‡§®‡§à‡§Æ #‡§¨‡§ú‡§º‡•ç‡§Æ #‡§π‡§ø‡§Ç‡§¶‡•Ä_‡§∂‡§¨‡•ç‡§¶ #‡§¨‡§ú‡§º‡•ç‡§Æ_‡§è_‡§á‡§ñ‡§º‡§≤‡§æ‡§∏ @user @user @user @user @user @user @user @user",0,train



HAS20_HI
4232 entries, of which 347 (8.20%) are hateful.


Unnamed: 0,text,label,split
846,‡§ú‡•ã ‡§¶‡•á‡§∂ ‡§π‡§ø‡§§ ‡§ï‡•Ä ‡§¨‡§æ‡§§ ‡§ï‡§∞‡•á‡§ó‡§æ ‡§µ‡§π‡•Ä‡§Ç ‡§¶‡§ø‡§≤‡•ç‡§≤‡•Ä ‡§™‡§∞ ‡§∞‡§æ‡§ú ‡§ï‡§∞‡•á‡§ó‡§æ,0,train
238,RT @user: ‡§§‡§¨ ‡§§‡•ã ‡§¨‡§¨‡•Å‡§Ü ‡§≠‡•Ä ‡§¨‡•Å‡§Ü ‡§ï‡•á ‡§â‡§∏ ‡§ò‡•ã‡§ü‡§æ‡§≤‡•á ‡§ï‡•Ä ‡§ú‡§æ‡§Ç‡§ö ‡§ï‡§∞‡§æ‡§®‡•á ‡§ï‡•á ‡§ê‡§≤‡§æ‡§® ‡§ï‡§∞‡§ï‡•á ‡§ó‡§è ‡§•‡•á‡•§ ‡§≤‡•á‡§ï‡§ø‡§® ‡§π‡•Å‡§Ü ‡§ï‡•ç‡§Ø‡§æ? ‡§¨‡•Å‡§Ü ‡§î‡§∞ ‡§¨‡§¨‡•Å‡§Ü ‡§®‡•á ‡§Ü‡§ú ‡§ñ‡•Å‡§¶ ‡§π‡§æ‡§• ‡§Æ‡§ø‡§≤‡§æ ‡§≤‡§ø‚Ä¶,0,train
1342,UNSC ‡§ï‡•á ‡§¶‡•ç‡§µ‡§æ‡§∞‡§æ ‡§Ü‡§§‡§Ç‡§ï‡•Ä ‡§Æ‡§∏‡•Ç‡§¶ ‡§Ö‡§ú‡§π‡§∞ ‡§ï‡•ã global terrorist ‡§ò‡•ã‡§∑‡§ø‡§§ ‡§ï‡§ø‡§è ‡§ú‡§æ‡§®‡•á ‡§™‡§∞ ‡§Ö‡§¨ ‡§π‡•ã ‡§∏‡§ï‡§§‡§æ ‡§π‡•à ‡§ï‡§à ‡§≤‡•ã‡§ó ‡§á‡§∏‡•á ‡§≠‡•Ä BJP ‡§ï‡•Ä ‡§ö‡§æ‡§≤ ‡§¨‡§§‡§æ ‡§¶‡•á‡•§ ‡§π‡•ã ‡§∏‡§ï‡§§‡§æ ‡§π‡•à ‡§ï‡§ø ‡§Ø‡•á ‡§≤‡•ã‡§ó ‡§™‡•Ç‡§õ ‡§≤‡•á ‡§ï‡§ø ‡§ö‡•Å‡§®‡§æ‡§µ ‡§¶‡•å‡§∞‡§æ‡§® ‡§π‡•Ä ‡§ï‡•ç‡§Ø‡•ã‡§Ç UNSC ‡§®‡•á ‡§Ø‡•á ‡§ò‡•ã‡§∑‡§£‡§æ ‡§ï‡§ø‡•§ ‡§Ö‡§™‡§®‡•Ä ‡§Ü‡§¶‡§§ ‡§∏‡•á ‡§¨‡§æ‡§ú ‡§•‡•ã‡§°‡§º‡•Ä ‡§®‡§æ ‡§Ü‡§Ø‡•á‡§Ç‡§ó‡•á ‡§Ø‡•á‡•§ #BJP4India #beatAjhar ##BeatTerror #BeatPakistan #Badla,1,train
1038,"@user ‡§Ø‡•á ‡§§‡•ã ‡§¨‡§∏ ‡§á‡§§‡§®‡§æ ‡§π‡•Ä, ‡§π‡•á‡§≤‡§ø‡§ï‡•â‡§™‡•ç‡§ü‡§∞ ‡§Æ‡•á‡§Ç ‡§ú‡§¨ ‡§∏‡§æ‡§π‡•á‡§¨ ‡§ï‡•ã ‡§†‡§Ç‡§° ‡§≤‡§ó‡§§‡•Ä ‡§•‡•Ä ‡§§‡•ã ‡§µ‡•ã ‡§™‡§Ç‡§ñ‡§æ ‡§ë‡§´ ‡§ï‡§∞ ‡§¶‡•á‡§§‡•á ‡§•‡•á !! ‡§´‡•ç‡§Ø‡•Ç‡§≤ ‡§≠‡•Ä ‡§¨‡§ö ‡§ú‡§æ‡§§‡•Ä ‡§•‡•Ä ‡§î‡§∞ ‡§†‡§Ç‡§° ‡§∏‡•á ‡§≠‡•Ä ‡§õ‡•Å‡§ü‡§ï‡§æ‡§∞‡§æ üòÇüòÇüòâüòâ",0,train
3798,@user @user @user ‡§µ‡§æ‡§π ‡§∞‡•á ‡§§‡•á‡§∞‡•Ä ‡§Æ‡§æ‡§Ç ‡§ï‡•ã ‡§ö‡•ã‡§¶‡•Ç ‡§≠‡§°‡§º‡§µ‡•á ‡§µ‡§æ‡§π üòù,0,test



HAS21_HI
4594 entries, of which 566 (12.32%) are hateful.


Unnamed: 0,text,label,split
2721,‡§ï‡§≤ ‡§ï‡•ã‡§à ‡§ï‡§π ‡§∞‡§π‡§æ ‡§•‡§æ ‡§ï‡§ø ‡§¶‡•á‡§∂ ‡•™‡•¶‡•¶‡§∏‡§æ‡§≤ ‡§™‡•Ä‡§õ‡•á ‡§ö‡§≤‡§æ ‡§ó‡§Ø‡§æ... ‡§Æ‡•à‡§®‡•á ‡§ï‡§π‡§æ ‡§¨‡§∏ ‡•® ‡§∏‡§æ‡§≤ ‡§î‡§∞ ‡§∞‡•Å‡§ï‡•ã ‡§∏‡•Ä‡§ß‡•á ‡§§‡•ç‡§∞‡•á‡§§‡§æ ‡§Ø‡•Å‡§ó ‡§Æ‡•á‡§Ç ‡§π‡•ã‡§Ç‡§ó‡•á ‡§Ü‡§™! ---‡§ó‡§ú‡§¨ ‡§ö‡•Å‡§ü‡§ø‡§Ø‡§æ ‡§π‡•à‡§Ç!,0,train
2560,@user ‡§Æ‡•ã‡§¶‡•Ä ‡§§‡•Å‡§Æ ‡§á‡§∏‡•ç‡§§‡•Ä‡§´‡§æ ‡§¶‡•ã ‡§π‡§Æ ‡§§‡•Å‡§Æ‡•ç‡§π‡§æ‡§∞‡•á ‡§∏‡§æ‡§• ‡§π‡•à #ResignModi #Resign_PM_Modi #ModiHataoDeshBachao http,0,train
511,"@user @user @user @user @user @user @user @user @user ‡§¶‡•ã‡§∏‡•ç‡§§‡•ã‡§Ç, ‡§Ü‡§ì ‡§π‡§Æ ‡§∏‡§¨ ‡§∏‡§∞‡•ç‡§µ‡§∏‡§Æ‡•ç‡§Æ‡§§‡§ø ‡§∏‡•á ‡§Æ‡§π‡§æ‡§® ‡§Ö‡§∞‡•ç‡§•‡§∂‡§æ‡§∏‡•ç‡§§‡•ç‡§∞‡•Ä, ‡§ß‡•Å‡§∞‡§Ç‡§ß‡§∞ ‡§µ‡•à‡§ú‡•ç‡§û‡§æ‡§®‡§ø‡§ï, ‡§Ö‡§Ç‡§§‡§∞‡•ç‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡•Ä‡§Ø ‡§ï‡§ø‡§∏‡§æ‡§®, ‡§≠‡§Ø‡§Ç‡§ï‡§∞ ‡§∏‡§µ‡§∞‡•ç‡§£-‡§¶‡§≤‡§ø‡§§ ‡§Æ‡§∏‡•Ä‡§π‡§æ #‡§ö‡•Å‡§¶‡§ø‡§§ ‡§∞‡§æ‡§ú ‡§ú‡•Ä ‡§ï‡•ã ‡§ñ‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏ #‡§Ö‡§ß‡•ç‡§Ø‡§ï‡•ç‡§∑ ‡§è‡§µ‡§Ç #‡§™‡§≤‡§ß‡§æ‡§®‡§Æ‡§Ç‡§§‡§≤‡•Ä ‡§¨‡§®‡§æ‡§è‡§Ç !!üôè ""‡§∏‡§≠‡•Ä #‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ‡§è‡§Ç ‡§∏‡§Æ‡§æ‡§™‡•ç‡§§"" #‡§â‡§¶‡§ø‡§§_‡§∞‡§æ‡§ú_‡§ï‡•ã_‡§Ö‡§ß‡•ç‡§Ø‡§ï‡•ç‡§∑_‡§¨‡§®‡§æ‡§ì_‡§¨‡•á @user #CovidVaccine http",1,train
2770,2 /2ll‡§ï‡§π‡§æ‡§Å ‡§π‡•à? ‡§∏‡§®‡§¶ ‡§∞‡§π‡•á ‡§ï‡§ø ‡§Ø‡§π‡•Ä ‡§Ü‡§∞‡§ú‡•á‡§°‡•Ä ‡§™‡§π‡§≤‡•á ‡§∏‡•á ‡§π‡•Ä ‡§¶‡§ø‡§≤‡•ç‡§≤‡•Ä ‡§Ü‡§∞‡§ú‡•á‡§°‡•Ä ‡§ï‡•á ‡§Ø‡•Å‡§µ‡§æ ‡§™‡•ç‡§∞‡§¶‡•á‡§∂ ‡§Ö‡§ß‡•ç‡§Ø‡§ï‡•ç‡§∑ ‡§Æ‡•Ä‡§∞‡§æ‡§® ‡§π‡•à‡§¶‡§∞ ‡§ï‡•Ä ‡§ó‡§ø‡§∞‡§´‡•ç‡§§‡§æ‡§∞‡•Ä ‡§∏‡•á ‡§≤‡•á‡§ï‡§∞ ‡§∞‡§ø‡§π‡§æ‡§à ‡§ï‡•á ‡§Æ‡•Å‡§¶‡•ç‡§¶‡•á ‡§™‡§∞ ‡§≠‡•Ä ‡§ö‡•Å‡§™ ‡§π‡•à‡•§ ‡§∏‡§µ‡§æ‡§≤ ‡§Ø‡§π‡•Ä ‡§ï‡§ø ‡§Ü‡§™ ‡§Æ‡•á‡§∞‡•á ‡§¨‡•Å‡§∞‡•á ‡§∏‡§Æ‡§Ø ‡§Æ‡•á‡§Ç ‡§Æ‡•á‡§∞‡•á ‡§∏‡§æ‡§• ‡§®‡§π‡•Ä‡§Ç ‡§§‡•ã ‡§π‡§Æ ‡§Ü‡§™‡§ï‡§æ ‡§ù‡§Ç‡§°‡§æ ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§¢‡•ã‡§è‡§Ç? @user @user #JusticeForShahabuddin,0,train
2907,‡§Ö‡§™‡§®‡•Ä ‡§™‡•Ç‡§∞‡•Ä ‡§ú‡§ø‡§Ç‡§¶‡§ó‡•Ä ‡§∏‡§ø‡§µ‡§æ‡§® ‡§ï‡•á ‡§®‡§æ‡§Æ ‡§ï‡§∞ ‡§¶‡•Ä ‡§≤‡•á‡§ï‡§ø‡§® ‡§Ö‡§™‡§®‡•á ‡§∏‡§ø‡§µ‡§æ‡§® ‡§ï‡•Ä ‡§Æ‡§ø‡§ü‡•ç‡§ü‡•Ä ‡§® ‡§Æ‡§ø‡§≤ ‡§∏‡§ï‡•Ä üò≠ #JusticeForShahabuddin,0,train



KEN20_EN
23192 entries, of which 11596 (50.00%) are hateful.


Unnamed: 0,text,label
1554,Haisha? Move to japan you fucking fag. Hurr durr im an amine. Yeah fuck off you literal veggie,1
26524,Pussy so fat the bitch gave me STDs: Sexually Transmitted Diabetesüò≠,1
2283,What up retard,1
18227,Ramazan has always proved a blessing for PAKISTAN. 27th Ramazan when Pakistan came into existence üòçüòçüáµüá∞üáµüá∞üíöüíö #ÿ™ÿ≠ŸÅ€Å_ÿ±ŸÖÿ∂ÿßŸÜ_Ÿæ€åÿßÿ±ÿß_Ÿæÿß⁄©ÿ≥ÿ™ÿßŸÜ,0
5685,COME OUT TO THE BAY AREA AND GET UR COCK SUCKED REALL GOOD FAGGOT,1



OUS19_AR
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
1221,@user @user ÿßÿÆÿ± ŸÖŸÜ Ÿäÿ™ŸÉŸÑŸÖ ÿßŸÜÿ™ŸÖ ŸäÿßÿÆŸàŸÜÿ© ŸàŸÑÿßŸÑŸÉŸÖ ÿØÿßÿ± ŸàŸÑÿß ÿ®ŸÑÿØ ÿßŸÜÿ™ŸÖ ÿÆŸÜÿßÿ≤Ÿäÿ± ÿßŸÑÿ∫ÿ±ÿ® Ÿàÿ∑Ÿáÿ±ÿßŸÜ ŸàÿßŸÑÿ¥ÿ±ŸÅ ŸàÿßŸÑÿ∫Ÿäÿ±‚Ä¶ http,0
1794,ÿßŸä ÿßŸÖÿ±ÿ£ÿ© ŸÖÿ™ÿ®ÿ±ÿ¨ÿ© ŸÜÿßÿ≤ÿπÿ© ÿ≠ÿ¨ÿßÿ®Ÿáÿß ŸÑÿßÿ™ÿ¥ÿ±ŸÅŸÜÿß ŸÉÿß ÿ±ÿ¨ÿßŸÑ ŸàŸÑŸäÿ≥ ÿ∞ŸÉŸàÿ± ŸàŸÑŸÜ ŸÜŸÅŸÉÿ± ŸÜÿ±ÿ™ÿ®ÿ∑ ÿ®Ÿáÿß ÿßÿ¥ŸàŸÅŸáÿß ŸÜÿßÿ≤ÿπŸá ÿ≠ÿ¨ÿßÿ®Ÿáÿß ŸÉŸÖŸäÿ©‚Ä¶ http,0
3186,@user Ÿäÿß ÿ≠ŸÇŸäÿ± ŸáŸäÿØÿß ÿßŸÑÿπŸáÿØ ŸäŸÑŸä ÿßŸÜÿ™ ÿ®ÿØŸÉ ŸäŸÑŸä ÿßŸÜÿ™ ÿπŸëŸÖ ÿ™ÿ®Ÿàÿ≥ ÿ±ÿßÿµ ÿπŸàŸÜ Ÿäÿß Ÿàÿßÿ∑Ÿä Ÿäÿß ÿÆŸÜÿ≤Ÿäÿ±,1
2381,ÿ™ÿÆŸäŸÑ ÿ®ŸÜŸÅÿ≥ ÿßŸÑŸàŸÇÿ™ ÿ™ŸÉŸÑŸÖ ÿÆŸÜÿ≤Ÿäÿ± ÿ®ÿßŸÑŸàÿßŸÇÿπ ŸàŸäÿ≥ÿ®ŸÉ ÿ≥ÿ®ŸàÿØÿ± ÿ®ÿßŸÑÿÆÿßÿµ ŸàŸäŸÖŸÜÿ¥ŸÜŸÉ Ÿàÿ≥ŸäŸÅ ÿü ÿßŸä ÿßŸÜÿ≠ÿ∑ÿßÿ∑ ÿ™ŸàÿµŸÑÿ™ ŸÑŸá Ÿäÿß ÿßŸÜÿß,0
2017,ŸàŸÜŸáÿßŸäÿ™ŸÉŸÖ ŸÇÿßÿØŸÖÿ© . ÿ¥ÿßÿ±ÿ®Ÿä ÿ®ŸàŸÑ ÿßŸÑÿ®ÿπŸäÿ± .,1



OUS19_FR
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
1405,@user @user un adepte du lieu commun gauchiste,0
2827,bravo au r√©alisateur de #lejouroujaibrulemoncoeur qui inclut le #harc√®lement #antisemite dans le film ¬´¬† #sale #juif¬†¬ª. une r√©alit√©.,0
2796,: [l'africain d'alg√©rie est souvent raciste sur twitter.,0
2734,une gauchiste reste une gauchiste. et pourtant elle bataill√©. http,0
1470,maitena biraben.. la rumeur raconte qu en bonne gauchiste elle pr√©voit de reverser l'int√©gralit√© de la somme √† des‚Ä¶ http,0



SAN20_IT
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label,split
4312,Vanno affondate. Hanno rotto i c.....i Aquarius vuol dettare ancora legge: carica migranti e rifiuta gli ordini libici http,1,train
4079,"@user Mi chiedo come sia possibile definire Islam religione di pace invece di mettere in guardia il mondo da questa ""cultura"" subumana #sposebambine",1,train
7379,"Business immigrati Tanti ci guadagnano: Le ONG che trasportano, Le Coop che ospitano, il caporalato, a breve l'industria con mano d'opera a basso costo, i Sindacati che da loro possono ripartire a vendere tessere, i partiti pro immigrazione che prenderanno i loro voti.",0,test
6723,"Se stranieri, delinquenti. Se italiani, derubricata a ragazzata e in premio non c'√® la medaglia di Salvini. √â l' Italietta legastellata.",0,train
4355,"Scusate, ma a proposito di #facciamocome: non potremmo semplicemente espellere anche noi tutti gli stranieri disoccupati? (cos√¨ evitiamo anche il groviglio della discriminazione per il rdc)",1,train





## Create and export splits

In [17]:
# set aside 2k from each dataset for testing and 500 for dev
# except for Ousidhoum in French and Arabic, where train set would otherwise be too small
# and for HASOC 20 and 21 in Hindi, where test splits are given

TEST_SIZE = 2000
DEV_SIZE = 500

for dataset in df_dict:
    if "ous19_fr" in dataset:
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = 2000, random_state=123)
        devset, testset = train_test_split(devtest, test_size = 1500, random_state=123)
        devset.to_csv(f"../0_data/main/1_clean/{dataset}/dev_500.csv", index=False)
        testset.to_csv(f"../0_data/main/1_clean/{dataset}/test_1500.csv", index=False)
    elif "ous19_ar" in dataset:
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = 1300, random_state=123)
        devset, testset = train_test_split(devtest, test_size = 1000, random_state=123)
        devset.to_csv(f"../0_data/main/1_clean/{dataset}/dev_300.csv", index=False)
        testset.to_csv(f"../0_data/main/1_clean/{dataset}/test_1000.csv", index=False)
    elif "has19_hi" in dataset or "has20_hi" in dataset: # use provided test sets
        df_dict[dataset][df_dict[dataset]["split"]=="test"].to_csv(f"../0_data/main/1_clean/{dataset}/test_{len(df_dict[dataset][df_dict[dataset]['split']=='test'])}.csv", index=False)
        df_dict[dataset], devset = train_test_split(df_dict[dataset][df_dict[dataset]["split"]=="train"], test_size = 500, random_state=123)
        devset.to_csv(f"../0_data/main/1_clean/{dataset}/dev_500.csv", index=False)
    else:
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = TEST_SIZE+DEV_SIZE, random_state=123)
        devset, testset = train_test_split(devtest, test_size = TEST_SIZE, random_state=123)
        devset.to_csv(f"../0_data/main/1_clean/{dataset}/dev_{DEV_SIZE}.csv", index=False)
        testset.to_csv(f"../0_data/main/1_clean/{dataset}/test_{TEST_SIZE}.csv", index=False)
        
# export all data that is not test or dev, so we can use it for full sample training
for dataset in df_dict:
    df_dict[dataset].to_csv(f"../0_data/main/1_clean/{dataset}/train_{len(df_dict[dataset])}.csv", index=False)

In [18]:
# create differently-sized train portions from rest of data

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset.upper())
    for n in N_RANGE:
        
        # save all splits for English test sets
        if n<len(df_dict[dataset]) and ("dyn21" in dataset or "ken20" in dataset or "fou18" in dataset): 
            print(f"  saving n = {n} training set")
            for random_state in range(1, SEEDS+1):
                df_dict[dataset].sample(n, random_state = random_state).to_csv(f"../0_data/main/1_clean/{dataset}/train/train_{n}_rs{random_state}.csv",index=False)
        
        # save splits up to 2k for other datasets
        elif n<len(df_dict[dataset]) and n<=2000: 
            print(f"  saving n = {n} training set")
            for random_state in range(1, SEEDS+1):  
                df_dict[dataset].sample(n, random_state = random_state).to_csv(f"../0_data/main/1_clean/{dataset}/train/train_{n}_rs{random_state}.csv",index=False)
    
    print()

BAS19_ES
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

DYN21_EN
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  saving n = 3000 training set
  saving n = 4000 training set
  saving n = 5000 training set
  saving n = 10000 training set
  saving n = 20000 training set

FOR19_PT
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 