In [5]:
import pandas as pd
import json
import numpy as np
import torch
from tqdm.notebook import tqdm
from google.transliteration import transliterate_word

In [6]:
from datasets import load_dataset

dataset = load_dataset("hatexplain")

Using the latest cached version of the module from C:\Users\dange\.cache\huggingface\modules\datasets_modules\datasets\hatexplain\df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249 (last modified on Thu Nov 24 22:10:03 2022) since it couldn't be found locally at hatexplain., or remotely on the Hugging Face Hub.
Found cached dataset hatexplain (C:/Users/dange/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
def create_dataframe(dataset, split, max_len=200):
    dataset = dataset[split].to_dict()
    del dataset['id']
    num_examples = len(dataset['post_tokens'])
    print(f'{split} has {num_examples} examples')
    dataset['label'] = torch.zeros((num_examples, 3))
    label = []
    for i in range(num_examples):
        label.append(torch.Tensor(torch.Tensor(dataset['annotators'][i]['label']).type(torch.IntTensor)))
    label = torch.stack(label)
    label = label.mode().values
    dataset['label'][torch.arange(num_examples).type(torch.LongTensor), label.type(torch.LongTensor)] = 1
    dataset['class'] = label
    rationales = []
    for rationale in dataset['rationales']:
        if len(rationale) == 0:
            rationales.append(torch.zeros((max_len)))
            continue
        r = np.concatenate((
            np.array(rationale[0]), np.zeros((max_len - len(rationale[0])))
        )).astype(bool)
        for i in range(1, len(rationale)):
            r += np.concatenate((
                np.array(rationale[i]), np.zeros((max_len - len(rationale[i])))
            )).astype(bool)
        rationales.append(torch.tensor((r).astype(int)))
    dataset['rationales'] = torch.stack(rationales)
    return dataset

In [8]:
train = create_dataframe(dataset, 'train')
validation = create_dataframe(dataset, 'validation')
test = create_dataframe(dataset, 'test')

train has 15383 examples
validation has 1922 examples
test has 1924 examples


In [9]:
print(
    "train class split:- ", train['class'].bincount(), "\n",
    "validation class split:- ", validation['class'].bincount(), "\n",
    "test class split:- ", test['class'].bincount(),
)

train class split:-  tensor([4748, 6251, 4384]) 
 validation class split:-  tensor([593, 781, 548]) 
 test class split:-  tensor([594, 782, 548])


In [10]:
sent_len = []
for sent in train['post_tokens']:
    sent_len.append(len(sent))
print(f'MAX LENGTH:- {max(sent_len)}\nAVG LENGTH:- {sum(sent_len)/len(sent_len)}')

MAX LENGTH:- 165
AVG LENGTH:- 23.465253851654424


In [11]:
from sklearn.model_selection import StratifiedShuffleSplit
stratified_shuffle = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [12]:
def generate_language_splits(dataset, shuffler):
    shuffler.get_n_splits(dataset['post_tokens'], dataset['class'])
    for en_index, hi_index in shuffler.split(dataset['post_tokens'], dataset['class']):
        en_dataset = {}
        hi_dataset = {}

        en_dataset['index'] = en_index
        hi_dataset['index'] = hi_index

        en_dataset['label'] = torch.index_select(dataset['label'], 0, torch.tensor(en_index))
        hi_dataset['label'] = torch.index_select(dataset['label'], 0, torch.tensor(hi_index))

        en_dataset['rationales'] = torch.index_select(dataset['rationales'], 0, torch.tensor(en_index))
        hi_dataset['rationales'] = torch.index_select(dataset['rationales'], 0, torch.tensor(hi_index))

        en_dataset['class'] = torch.tensor([dataset['class'][i] for i in en_index])
        hi_dataset['class'] = torch.tensor([dataset['class'][i] for i in hi_index])

        en_dataset['post_tokens'] = [dataset['post_tokens'][i] for i in en_index]
        hi_dataset['post_tokens'] = [dataset['post_tokens'][i] for i in hi_index]

    return en_dataset, hi_dataset


In [13]:
en_train, hi_train = generate_language_splits(train, stratified_shuffle)
en_validation, hi_validation = generate_language_splits(validation, stratified_shuffle)
en_test, hi_test = generate_language_splits(test, stratified_shuffle)

In [14]:
print(
    "train class split:- ", hi_train['class'].bincount(), "\n",
    "validation class split:- ", hi_validation['class'].bincount(), "\n",
    "test class split:- ", hi_test['class'].bincount(),
)

train class split:-  tensor([ 950, 1250,  877]) 
 validation class split:-  tensor([119, 156, 110]) 
 test class split:-  tensor([119, 156, 110])


In [15]:
transliterator = {}
transliterator['sentence'] = []
transliterator['index'] = []
transliterator['words'] = []
transliterator['type'] = []
for i in range(hi_train['class'].shape[0]):
    transliterator['type'].append('train')
    transliterator['sentence'].append(" ".join(hi_train['post_tokens'][i]))
    transliterator['index'].append(hi_train['index'][i])
    mask = hi_train['rationales'][i, :]
    transliterator['words'].append(
        " ".join([word for i, word in enumerate(hi_train['post_tokens'][i]) if i < 200 and mask[i]])
    )
for i in range(hi_validation['class'].shape[0]):
    transliterator['type'].append('validation')
    transliterator['sentence'].append(" ".join(hi_validation['post_tokens'][i]))
    transliterator['index'].append(hi_validation['index'][i])
    mask = hi_validation['rationales'][i, :]
    transliterator['words'].append(
        " ".join([word for i, word in enumerate(hi_validation['post_tokens'][i]) if i < 200 and mask[i]])
    )
for i in range(hi_test['class'].shape[0]):
    transliterator['type'].append('test')
    transliterator['sentence'].append(" ".join(hi_test['post_tokens'][i]))
    transliterator['index'].append(hi_test['index'][i])
    mask = hi_test['rationales'][i, :]
    transliterator['words'].append(
        " ".join([word for i, word in enumerate(hi_test['post_tokens'][i]) if i < 200 and mask[i]])
    )


In [16]:
df = pd.DataFrame(transliterator)
df.to_csv('to_translate.csv')

In [21]:
hindi_df = pd.read_csv('../translation/translated.csv', usecols=['sentence', 'hindi_sentence', 'words', 'hindi_words', 'index', 'type'])

In [22]:
display(hindi_df)

Unnamed: 0,sentence,index,words,type,hindi_sentence,hindi_words
0,mass unmarked moslem graves would,2980,mass unmarked moslem graves,train,बड़े पैमाने पर अचिह्नित मोस्लेम कब्रें,बड़े पैमाने पर अचिह्नित मोस्लेम कब्रें
1,<user> i see she will get inside the elevator ...,4992,,train,<user> मैं देख रहा हूं,#VALUE!
2,luckily the iranian people did not bite to the...,3962,scum rulers of the western world,train,सौभाग्य से ईरानी लोगों ने अपनी सरकार के खिलाफ ...,पश्चिमी दुनिया के स्कम शासक
3,it an incident you hear that incident if it wa...,10205,it an incident you hear that incident if it wa...,train,यह एक घटना है कि आप उस घटना को सुनते हैं यदि य...,यह एक घटना है कि आप उस घटना को सुनते हैं यदि य...
4,i just grow tired of seeing queer folx get can...,12567,,train,मैं बस क्वीर फोलक्स को दैनिक रूप से डिब्बाबंद ...,#VALUE!
...,...,...,...,...,...,...
1920,please address the issue of white kids quotati...,1892,white kids quotations for obvious reasons bein...,test,कृपया मुस्लिम ग्रूमिंग गैंग्स रॉदरहैम द्वारा प...,सफेद बच्चों के उद्धरण स्पष्ट कारणों के लिए पूर...
1921,it ’ payday bitches,1252,bitches,test,यह 'payday bitches,वो साले
1922,getting paid biweekly is so ghetto,1332,,test,"भुगतान किया जा रहा है, यहूदी बस्ती है",#VALUE!
1923,<user> big head and being a faggot in the fron...,1020,faggot,test,<उपयोगकर्ता> बड़ा सिर और सामने की पंक्ति में ए...,होमोसेक्सुअल


In [23]:
hindi_alphabets = [
    "क","ख","ग","घ","ङ","च","छ","ज","झ","ञ","ट","ठ","ड","ढ","ण","त","थ","द","ध","न","प","फ","ब","भ","म","य","र","ल","व","श","ष","स","ह","क्ष","त्र","ज्ञ"    
    ]
def normalize(input):
    input_type = type(input)
    if input_type == str:
        input = input.split()
    output = []
    for word in input:
        norm = []
        for char in list(word):
            if char in hindi_alphabets:
                norm.append(char)
        output.append("".join(norm))
    if input_type == str:
        return " ".join(output)
    return output

In [24]:
translated_dict = hindi_df.to_dict()
translated_dict.keys()

dict_keys(['sentence', 'index', 'words', 'type', 'hindi_sentence', 'hindi_words'])

In [25]:
hindi_sentences = hindi_df.hindi_sentence.values
hindi_words = hindi_df.hindi_words.values
english_words = hindi_df.words.values

In [45]:
hindi_rationales = []
transliterations = []
total_words = 0
matches_found = 0
sentences_matched = 0
MATCHED = False
for i, sentence in enumerate(tqdm(hindi_sentences)):
    sentence = sentence.split()
    rationale = torch.zeros(1, len(sentence))
    total_words += len(hindi_words[i])
    if hindi_words[i] == '#VALUE!':
        hindi_rationales.append(rationale)
        sentences_matched += 1
        continue
    transliterated_words = []
    if type(hindi_words[i]) != list: 
        hindi_words[i] = hindi_words[i].split() 
    if type(english_words[i]) != list: 
        english_words[i] = english_words[i].split()
        for word in english_words[i]:
            transliterated_words.extend(transliterate_word(word, lang_code='hi'))
    transliterations.append(transliterated_words)
    normalized_hindi_words = normalize(hindi_words[i])
    normalized_trans_words = normalize(transliterated_words)
    for j, word in enumerate(sentence):
        if word in hindi_words[i] or word in english_words[i] or word in transliterated_words:
            matches_found += 1
            rationale[0,j] = 1
            if not MATCHED:
                sentences_matched += 1
                MATCHED = True
        elif normalize(word) in normalized_hindi_words or normalize(word) in normalized_trans_words:
            matches_found += 1
            rationale[0, j] = 1
            if not MATCHED:
                sentences_matched += 1
                MATCHED = True
    if not MATCHED:
        hindi_rationales.append(None)
    else:
        hindi_rationales.append(rationale)
    MATCHED = False
    if i == 500:
        break
print(f'{matches_found} words mapped out of {total_words}\n{sentences_matched} rationales found out of {hindi_df.shape[0]}')


  0%|          | 0/1925 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [46]:
print(transliterations)

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],

In [47]:
import pickle
pickle.dump(transliterations, open('transliterations.p', 'wb'))

In [48]:
pickle.load(open('transliterations.p', 'rb'))

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],


In [27]:
if 'hindi_rationales' in hindi_df.columns:
    print("cleaning . . . ")
    hindi_df.drop(columns=['hindi_rationales'])
    hindi_df.dropna()
hindi_df = pd.concat([hindi_df, pd.DataFrame(hindi_rationales, columns=['hindi_rationales'])], axis=1)
display(hindi_df)

Unnamed: 0,sentence,index,words,type,hindi_sentence,hindi_words,hindi_rationales
0,mass unmarked moslem graves would,2980,"[mass, unmarked, moslem, graves]",train,बड़े पैमाने पर अचिह्नित मोस्लेम कब्रें,"[बड़े, पैमाने, पर, अचिह्नित, मोस्लेम, कब्रें]",
1,<user> i see she will get inside the elevator ...,4992,,train,<user> मैं देख रहा हूं,#VALUE!,
2,luckily the iranian people did not bite to the...,3962,scum rulers of the western world,train,सौभाग्य से ईरानी लोगों ने अपनी सरकार के खिलाफ ...,पश्चिमी दुनिया के स्कम शासक,
3,it an incident you hear that incident if it wa...,10205,it an incident you hear that incident if it wa...,train,यह एक घटना है कि आप उस घटना को सुनते हैं यदि य...,यह एक घटना है कि आप उस घटना को सुनते हैं यदि य...,
4,i just grow tired of seeing queer folx get can...,12567,,train,मैं बस क्वीर फोलक्स को दैनिक रूप से डिब्बाबंद ...,#VALUE!,
...,...,...,...,...,...,...,...
1920,please address the issue of white kids quotati...,1892,white kids quotations for obvious reasons bein...,test,कृपया मुस्लिम ग्रूमिंग गैंग्स रॉदरहैम द्वारा प...,सफेद बच्चों के उद्धरण स्पष्ट कारणों के लिए पूर...,
1921,it ’ payday bitches,1252,bitches,test,यह 'payday bitches,वो साले,
1922,getting paid biweekly is so ghetto,1332,,test,"भुगतान किया जा रहा है, यहूदी बस्ती है",#VALUE!,
1923,<user> big head and being a faggot in the fron...,1020,faggot,test,<उपयोगकर्ता> बड़ा सिर और सामने की पंक्ति में ए...,होमोसेक्सुअल,


In [344]:
def append_to_english(en_dict, index, label, rationale, output, post_tokens):
    en_dict['index'] = np.concatenate((en_dict['index'], np.array([index])))
    en_dict['label'] = np.concatenate((en_dict['label'], label.reshape(1,3)))
    en_dict['rationales'] = np.concatenate((en_dict['rationales'], rationale.reshape(1, 200)))
    en_dict['class'] = np.concatenate((en_dict['class'], np.array([output])))
    en_dict['post_tokens'].append(post_tokens)

In [345]:
en_train.keys()

dict_keys(['index', 'label', 'rationales', 'class', 'post_tokens'])

In [346]:
def generate_final(translated_dict, en_dict, hi_dict, hindi_df):
    for i, row in enumerate(hindi_df.iterrows()):
        _, row = row
        if row[-1] == None:
            append_to_english(en_dict, row[1], hi_dict['label'][i], hi_dict['rationales'][i], hi_dict['class'][i], hi_dict['post_tokens'][i])
        else:
            translated_dict['index'].append(row[1])
            translated_dict['label'].append(hi_dict['label'][i])
            translated_dict['rationales'].append(row[-1])
            translated_dict['class'].append(hi_dict['class'][i])
            translated_dict['post_tokens'].append(row[4].split(" "))
    translated_dict['label'] = torch.stack(translated_dict['label'])
    translated_dict['class'] = torch.stack(translated_dict['class'])
    rationales = []
    for rationale in translated_dict['rationales']:
        r = np.concatenate(
            (rationale, 
                np.zeros((1, 200 - rationale.shape[1]))
            ), axis=1).astype(bool)
        rationales.append(torch.tensor((r).astype(int)))
    translated_dict['rationales'] = torch.stack(rationales)

In [347]:
translated_hi_train = {}
translated_hi_validation = {}
translated_hi_test = {}

for key in en_train.keys():
    translated_hi_train[key] = []
    translated_hi_validation[key] = []
    translated_hi_test[key] = []

groups = hindi_df.groupby('type')
generate_final(translated_hi_train, en_train, hi_train, groups.get_group('train'))
generate_final(translated_hi_validation, en_validation, hi_validation, groups.get_group('validation'))
generate_final(translated_hi_test, en_test, hi_test, groups.get_group('test'))

In [348]:
print(
    "train class split:- ", translated_hi_train['class'].bincount(), "\n",
    "validation class split:- ", translated_hi_validation['class'].bincount(), "\n",
    "test class split:- ", translated_hi_test['class'].bincount(),
)

train class split:-  tensor([ 879, 1250,  816]) 
 validation class split:-  tensor([114, 156, 104]) 
 test class split:-  tensor([110, 156,  96])


In [349]:
print(
    "train class split:- ", torch.tensor(en_train['class']).bincount(), "\n",
    "validation class split:- ", torch.tensor(en_validation['class']).bincount(), "\n",
    "test class split:- ", torch.tensor(en_test['class']).bincount(),
)

train class split:-  tensor([3869, 5001, 3568]) 
 validation class split:-  tensor([479, 625, 444]) 
 test class split:-  tensor([484, 626, 452])


In [350]:
print(
    torch.tensor(en_train['class']).bincount() + translated_hi_train['class'].bincount(), 
torch.tensor(en_validation['class']).bincount() + translated_hi_validation['class'].bincount(), 
torch.tensor(en_test['class']).bincount() + translated_hi_test['class'].bincount()
)

tensor([4748, 6251, 4384]) tensor([593, 781, 548]) tensor([594, 782, 548])


In [327]:
print(
    "train class split:- ", train['class'].bincount(), "\n",
    "validation class split:- ", validation['class'].bincount(), "\n",
    "test class split:- ", test['class'].bincount(),
)

train class split:-  tensor([4748, 6251, 4384]) 
 validation class split:-  tensor([593, 781, 548]) 
 test class split:-  tensor([594, 782, 548])


In [359]:
import pickle
pickle.dump(en_train, open('../data/en_train.p', 'wb'))
pickle.dump(en_validation, open('../data/en_validation.p', 'wb'))
pickle.dump(en_test, open('../data/en_test.p', 'wb'))

pickle.dump(translated_hi_train, open('../data/hi_train.p', 'wb'))
pickle.dump(translated_hi_validation, open('../data/hi_validation.p', 'wb'))
pickle.dump(translated_hi_test, open('../data/hi_test.p', 'wb'))

In [4]:
import pickle
hi_train = pickle.load(open('../data/hi_test.p', 'rb'))
hi_train['post_tokens']

['there is no logical or just reason for a mediocre few shielded from competition propped up by inherited riches and passports to feast on the world riches under the guise of meritocracy <user> ungrateful refugee <number>',
 'dumb shitskin prefers to ignore numbers they can not calculate typical',
 'can you guys say nigger without getting arrested lol',
 'teen choice blackpink choice international artist <user>',
 'really alex jones believes that the arabs in israel need to return to jordan where they actually came from because that my two state solution israel and jordan done and done',
 'there is a link on washington examiner that suggests the dws it paki it aide has terrorist ties but the link will not post why washingtonexaminer com ron desantis fired debbie wasserman schultz aide may have terrorist ties article <number>',
 'antifa is currently burning everything in sight and beating people up stand down someone is shooting this school up stand down there are a bunch of illegal imm

['सौभाग्य से यह Kike नाजी शासन के तहत कभी नहीं रहा और निश्चित रूप से कम्युनिस्ट नियम के तहत कभी नहीं रहता था',
 'ओह यार जो कि सैनिकों के लिए एक कमबख्त दुखद फोटो है, जो कि सैंड निगर देशों में उन्हें नुकसान पहुंचाने वाले गधे के लिए बहुत ज्यादा नहीं है।',
 'मिस लिंडसे जॉन मैककेन्स बॉय टॉय मिस लिंडसे एक डिमेंटेड फगोट है',
 'एशियाई लोग खुशी के लायक नहीं हैं',
 '<user> सिख समुदाय हमेशा दूसरों के लिए सहायक रूप से उपयोगी है, विशेष रूप से वे भारत में मुसलमानों की मदद करते हैं',
 'यह इस प्रमुख कारक को संबोधित करता है कि हमें यह क्यों कहना चाहिए और आप्रवासियों को हमारी वर्तमान संस्कृति में आत्मसात करने से इनकार करने की समस्या क्या है अगर वे इस बात को आत्मसात नहीं करना चाहते हैं कि बकवास क्यों भी यहां आते हैं तो वे इसे हमसे मुक्त करने के लिए करते हैं जो हम से मुक्त करने के लिए करते हैं भुगतान करना',
 'यह गड़बड़ है कि उसे इसके बजाय प्रवासी यहूदी बस्ती के निवास पर बमबारी करनी चाहिए थी',
 'देखिए कि उन्हें एक राष्ट्रीय स्तर पर अपमानित हंसी का स्टॉक मिलेगा, जो एक वरिष्ठ स्तर पर काम करता था, वह एक उदार 