In [1]:
# Import packages
import os
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Datasets From Hugging Face

In [12]:
# Hugging Face Hub identifiers of the hate-speech datasets to export
# (catalogue: https://huggingface.co/datasets).
# 'hatexplain' is deliberately absent from this list: its token column needs
# extra preprocessing, so it is exported by a dedicated cell further down.
dataset_names = [
    'Paul/hatecheck-french',
    'hate_speech18',
    'hate_speech_offensive',
    'tweets_hate_speech_detection',
    'limjiayi/hateful_memes_expanded',
    'classla/FRENK-hate-en',
    'ucberkeley-dlab/measuring-hate-speech',
]

# Extension given to every exported file (the exports are tab-separated).
extension = 'tsv'

# Folder that receives the raw exports. It is created up front (a no-op when
# it already exists) so the export cells can write into it directly.
output_dir = '../data/raw/'
os.makedirs(output_dir, exist_ok=True)

In [9]:
# Export every split of every listed dataset as a tab-separated file named
# "<dataset id with separators replaced>_<split>.<extension>" in output_dir.
for dataset in dataset_names:
    print(f'Loading {dataset}...\n')

    # NOTE(review): trust_remote_code=True presumably allows the dataset's own
    # loading script to execute locally - confirm every listed source is trusted.
    ds = load_dataset(dataset, trust_remote_code=True)
    # Ask each split to return pandas objects when it is indexed.
    ds.set_format(type='pandas')

    # Dataset ids can contain '/' (or '\\'), which must not end up in a file
    # name; the stem is the same for every split, so build it once here.
    name = dataset.replace('/', '_').replace('\\', '_')

    for key, split in ds.items():
        # Slicing the whole split materialises it as a DataFrame.
        df = split[:]
        file_path = os.path.join(output_dir, f'{name}_{key}.{extension}')
        df.to_csv(file_path, sep='\t', index=False, encoding='utf-8')

Loading Paul/hatecheck-french...

Loading hate_speech18...



Generating train split: 100%|██████████| 10944/10944 [01:29<00:00, 122.29 examples/s]


Loading hate_speech_offensive...



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 24783/24783 [00:00<00:00, 1180107.81 examples/s]


Loading tweets_hate_speech_detection...



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 31962/31962 [00:00<00:00, 2130920.58 examples/s]
Generating test split: 100%|██████████| 17197/17197 [00:00<00:00, 1563341.41 examples/s]


Loading limjiayi/hateful_memes_expanded...



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 12887/12887 [00:00<00:00, 1285850.12 examples/s]
Generating validation split: 100%|██████████| 1040/1040 [00:00<00:00, 90360.78 examples/s]
Generating test split: 100%|██████████| 3000/3000 [00:00<00:00, 249433.30 examples/s]


Loading classla/FRENK-hate-en...



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 8404 examples [00:00, 43830.34 examples/s]
Generating validation split: 933 examples [00:00, 26657.26 examples/s]
Generating test split: 2301 examples [00:00, 38022.78 examples/s]


Loading ucberkeley-dlab/measuring-hate-speech...



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 135556/135556 [00:00<00:00, 936609.33 examples/s]


### HateXplain needs dataset-specific preprocessing

In [11]:
# Export the splits of 'hatexplain' separately from the other datasets,
# because its 'post_tokens' column is flattened before being written.
dataset = 'hatexplain'
print(f'Loading {dataset}...\n')

# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to execute locally - confirm the source is trusted.
ds = load_dataset(dataset, trust_remote_code=True)
# Ask each split to return pandas objects when it is indexed.
ds.set_format(type='pandas')

# File-name stem shared by every split (path separators replaced by '_').
name = dataset.replace('/', '_').replace('\\', '_')

for key, split in ds.items():
    # Slicing the whole split materialises it as a DataFrame.
    df = split[:]
    # Join the entries of each 'post_tokens' row with single spaces so the
    # column holds one string per row (presumably each row is a sequence of
    # string tokens - confirm against the dataset card).
    df['post_tokens'] = df['post_tokens'].apply(' '.join)
    file_path = os.path.join(output_dir, f'{name}_{key}.{extension}')
    df.to_csv(file_path, sep='\t', index=False, encoding='utf-8')

Loading hatexplain...



Downloading data: 100%|██████████| 12.3M/12.3M [00:03<00:00, 3.52MB/s]
Downloading data: 100%|██████████| 592k/592k [00:00<00:00, 5.50MB/s]
Generating train split: 100%|██████████| 15383/15383 [00:02<00:00, 7258.29 examples/s]
Generating validation split: 100%|██████████| 1922/1922 [00:00<00:00, 3244.06 examples/s]
Generating test split: 100%|██████████| 1924/1924 [00:00<00:00, 3142.88 examples/s]
