In [84]:
import pandas as pd
import json
import numpy as np
import torch

In [104]:
from datasets import load_dataset

# Fetch the "hatexplain" dataset through datasets.load_dataset; the result is
# indexed by split name further down.
dataset = load_dataset(path="hatexplain")

Found cached dataset hatexplain (C:/Users/dange/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249)


  0%|          | 0/3 [00:00<?, ?it/s]

In [258]:
def create_dataframe(dataset, split, max_len=200):
    """Convert one split of ``dataset`` into a dict of columns plus tensors.

    Despite the name, the result is a plain ``dict`` (not a pandas DataFrame).

    Args:
        dataset: Mapping from split name to an object exposing ``to_dict()``
            that returns column lists with at least ``'post_tokens'``,
            ``'annotators'`` (each item holding a ``'label'`` list) and
            ``'rationales'`` (each item a list of per-annotator 0/1 lists).
        split: Key of the split to convert, e.g. ``'train'``.
        max_len: Width of every rationale mask. Shorter masks are zero-padded;
            longer masks are truncated instead of raising.

    Returns:
        The column dict without ``'id'`` and with these entries added/replaced:
          * ``'label'``: float tensor ``(num_examples, 3)``, one-hot of the class.
          * ``'class'``: int32 tensor ``(num_examples,)``, the most frequent
            annotator label of each example.
          * ``'rationales'``: float32 tensor ``(num_examples, max_len)`` holding
            the element-wise OR of all annotator masks of each example
            (all zeros when an example has no rationale).
    """
    # Work on a separate name so the ``dataset`` argument is not shadowed.
    data = dataset[split].to_dict()
    data.pop('id', None)
    num_examples = len(data['post_tokens'])
    print(f'{split} has {num_examples} examples')

    # Majority vote over the annotators' labels of every example.
    votes = torch.stack([
        torch.tensor(data['annotators'][i]['label'], dtype=torch.int32)
        for i in range(num_examples)
    ])
    label = votes.mode().values

    # One-hot encoding; the width is fixed to the three label ids used here.
    data['label'] = torch.zeros((num_examples, 3))
    data['label'][torch.arange(num_examples), label.type(torch.LongTensor)] = 1
    data['class'] = label

    # Union of the annotators' token masks, written into a pre-padded matrix.
    # Slicing to ``max_len`` keeps over-long posts from producing a negative
    # padding size (which made np.zeros raise in the previous version).
    masks = np.zeros((len(data['rationales']), max_len), dtype=bool)
    for row, rationale in enumerate(data['rationales']):
        for annotator_mask in rationale:
            clipped = np.asarray(annotator_mask[:max_len], dtype=bool)
            masks[row, :len(clipped)] |= clipped

    # Emit one dtype for every row. The previous version mixed float zeros
    # (empty rationales) with integer masks and relied on torch.stack to
    # promote them; float32 is what that promotion produces.
    data['rationales'] = torch.from_numpy(masks.astype(np.float32))
    return data

In [259]:
# Convert each split with the default mask width; every call prints the
# number of examples it processed.
train = create_dataframe(dataset=dataset, split='train')
validation = create_dataframe(dataset=dataset, split='validation')
test = create_dataframe(dataset=dataset, split='test')

train has 15383 examples
validation has 1922 examples
test has 1924 examples


In [260]:
# Report how many examples of each class id every split contains.
split_summaries = (
    ("train class split:- ", train),
    ("validation class split:- ", validation),
    ("test class split:- ", test),
)
pieces = []
for caption, frame in split_summaries:
    if pieces:
        # Separate consecutive reports with a newline argument, as before.
        pieces.append("\n")
    pieces += [caption, frame['class'].bincount()]
print(*pieces)

train class split:-  tensor([4748, 6251, 4384]) 
 validation class split:-  tensor([593, 781, 548]) 
 test class split:-  tensor([594, 782, 548])


In [261]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single stratified shuffle that sets aside 10% of the samples; the fixed
# seed makes the selection repeatable.
stratified_shuffle = StratifiedShuffleSplit(
    random_state=42,
    test_size=0.1,
    n_splits=1,
)

In [285]:
def _select_examples(dataset, indices):
    """Return the rows of ``dataset`` at ``indices`` as a new dict."""
    idx = torch.as_tensor(indices, dtype=torch.long)
    return {
        'index': indices,
        'label': torch.index_select(dataset['label'], 0, idx),
        'rationales': torch.index_select(dataset['rationales'], 0, idx),
        # One indexed selection instead of rebuilding the tensor element-wise.
        'class': torch.as_tensor(dataset['class'])[idx],
        'post_tokens': [dataset['post_tokens'][i] for i in indices],
    }


def generate_language_splits(dataset, shuffler):
    """Partition ``dataset`` into two subsets using ``shuffler``.

    Args:
        dataset: Dict with ``'post_tokens'`` (list), ``'class'``, ``'label'``
            and ``'rationales'`` (tensors indexed by example along dim 0).
        shuffler: Object whose ``split(X, y)`` yields ``(first_indices,
            second_indices)`` pairs; it is called with the post tokens as
            ``X`` and the classes as ``y``.

    Returns:
        ``(en_dataset, hi_dataset)``: dicts with keys ``'index'``, ``'label'``,
        ``'rationales'``, ``'class'`` and ``'post_tokens'`` for the first and
        second index set respectively. When the shuffler yields several
        splits, the last one is used (same as before).
        NOTE(review): the ``en``/``hi`` names suggest English/Hindi subsets;
        confirm against the intended use.

    Raises:
        ValueError: If ``shuffler`` yields no split at all.
    """
    last_split = None
    # Only the final pair is ever returned, so skip building intermediate dicts.
    for last_split in shuffler.split(dataset['post_tokens'], dataset['class']):
        pass
    if last_split is None:
        raise ValueError("shuffler did not yield any split")

    en_index, hi_index = last_split
    return _select_examples(dataset, en_index), _select_examples(dataset, hi_index)


In [286]:
# Apply the same shuffler to each prepared split; the second element of every
# pair is the held-out portion used in the following cells.
en_train, hi_train = generate_language_splits(dataset=train, shuffler=stratified_shuffle)
en_validation, hi_validation = generate_language_splits(dataset=validation, shuffler=stratified_shuffle)
en_test, hi_test = generate_language_splits(dataset=test, shuffler=stratified_shuffle)

In [278]:
# Class counts of the held-out portions, to check the stratification.
held_out_summaries = (
    ("train class split:- ", hi_train),
    ("validation class split:- ", hi_validation),
    ("test class split:- ", hi_test),
)
held_out_pieces = []
for caption, subset in held_out_summaries:
    if held_out_pieces:
        # Newline argument between consecutive reports, as in the original call.
        held_out_pieces.append("\n")
    held_out_pieces += [caption, subset['class'].bincount()]
print(*held_out_pieces)

train class split:-  tensor([475, 625, 439]) 
 validation class split:-  tensor([60, 78, 55]) 
 test class split:-  tensor([60, 78, 55])


In [313]:
def _highlighted_words(tokens, mask):
    """Join the tokens whose rationale-mask entry is non-zero.

    ``zip`` stops at the shorter input, so tokens beyond the width of ``mask``
    are ignored; this replaces the hard-coded ``i < 200`` bound with the
    mask's own length.
    """
    return " ".join(word for word, flag in zip(tokens, mask.tolist()) if flag)


# Column-oriented records for the export: the full sentence, its index within
# the source split, the rationale-highlighted words, and the split name.
transliterator = {'sentence': [], 'index': [], 'words': [], 'type': []}
for split_name, held_out in (
    ('train', hi_train),
    ('validation', hi_validation),
    ('test', hi_test),
):
    for row in range(held_out['class'].shape[0]):
        tokens = held_out['post_tokens'][row]
        transliterator['type'].append(split_name)
        transliterator['sentence'].append(" ".join(tokens))
        transliterator['index'].append(held_out['index'][row])
        transliterator['words'].append(
            _highlighted_words(tokens, held_out['rationales'][row, :])
        )


In [317]:
# Turn the collected columns into a table and write it to disk as CSV.
df = pd.DataFrame(data=transliterator)
df.to_csv(path_or_buf='to_translate.csv')