# Merging Datasets

This notebook details the process of merging multiple datasets into a single, unified dataset.  

In [1]:
import pandas as pd
import numpy as np
import dtale
from master_thesis.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR, load_dataframe_from_pickle, save_dataframe_as_pickle

[32m2025-03-15 18:06:56.862[0m | [1mINFO    [0m | [36mmaster_thesis.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: /home/takosaga/Projects/master_thesis[0m


In [2]:
prep_hatexplain_df = load_dataframe_from_pickle(INTERIM_DATA_DIR.as_posix() + '/prep_hatexplain_df.pkl')
prep_measuring_hate_speech_df = load_dataframe_from_pickle(INTERIM_DATA_DIR.as_posix() + '/prep_measuring_hate_speech_df.pkl')
prep_mlma_df = load_dataframe_from_pickle(INTERIM_DATA_DIR.as_posix() + '/prep_mlma_df.pkl')

In [3]:
prep_hatexplain_df.dtypes

text                                              object
original_label                                    object
original_target                                   object
original_id                                       object
platform                                          object
original_dataset_title                            object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
dtype: object

In [4]:
prep_measuring_hate_speech_df.dtypes

original_id                                         int32
text                                               object
original_label                                    float64
platform                                           object
original_target                                    object
original_dataset_title                             object
label_hatespeech_binary_offensive_not_included     object
label_hatespeech_binary_offensive_included         object
label_normal_offensive_hatespeech                  object
dtype: object

In [5]:
prep_measuring_hate_speech_df['original_id'] = prep_measuring_hate_speech_df['original_id'].astype('object')
prep_measuring_hate_speech_df['original_label'] = prep_measuring_hate_speech_df['original_label'].astype('object')

In [6]:
prep_measuring_hate_speech_df.dtypes

original_id                                       object
text                                              object
original_label                                    object
platform                                          object
original_target                                   object
original_dataset_title                            object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
dtype: object

In [7]:
prep_mlma_df.dtypes

original_id                                        int64
text                                              object
original_label                                    object
original_dataset_title                            object
platform                                          object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
original_target                                   object
dtype: object

In [8]:
prep_mlma_df['original_id'] = prep_mlma_df['original_id'].astype('object')

In [9]:
prep_mlma_df.dtypes

original_id                                       object
text                                              object
original_label                                    object
original_dataset_title                            object
platform                                          object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
original_target                                   object
dtype: object

In [10]:
print('Hatexplain shape:',prep_hatexplain_df.shape)
print('MLMA shape: ', prep_mlma_df.shape)
print('Measuring Hate Speech shape: ', prep_measuring_hate_speech_df.shape)

Hatexplain shape: (20148, 9)
MLMA shape:  (5646, 9)
Measuring Hate Speech shape:  (39495, 9)


In [11]:
merged_dfs = pd.concat([prep_hatexplain_df, prep_measuring_hate_speech_df, prep_mlma_df], ignore_index=True)

In [12]:
merged_dfs.head()

Unnamed: 0,text,original_label,original_target,original_id,platform,original_dataset_title,label_hatespeech_binary_offensive_not_included,label_hatespeech_binary_offensive_included,label_normal_offensive_hatespeech
0,i dont think im getting my baby them white 9 h...,normal,[None],1179055004553900032,twitter,HateXplain,not_hatespeech,normal,normal
1,we cannot continue calling ourselves feminists...,normal,[None],1179063826874032128,twitter,HateXplain,not_hatespeech,normal,normal
2,nawt yall niggers ignoring me,normal,[African],1178793830532956161,twitter,HateXplain,not_hatespeech,normal,normal
3,<user> i am bit confused coz chinese ppl can n...,hatespeech,[Asian],1179088797964763136,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech
4,this bitch in whataburger eating a burger with...,hatespeech,"[Caucasian, Women]",1179085312976445440,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech


In [13]:
d = dtale.show(merged_dfs)
d.open_browser()

In [14]:
d.kill()

2025-03-15 18:06:57,401 - INFO     - Shutdown complete


In [15]:
merged_dfs['platform'].value_counts()

platform
twitter    30148
reddit     15842
gab        11093
youtube     8178
nf_6           9
nf_2           8
nf_4           7
nf_3           2
nf_1           2
Name: count, dtype: int64

In [16]:
values_to_remove = ['nf_6', 'nf_2', 'nf_4', 'nf_3', 'nf_1']


In [17]:
merged_dfs = merged_dfs[~merged_dfs['platform'].isin(values_to_remove)]
merged_dfs['platform'].value_counts()

platform
twitter    30148
reddit     15842
gab        11093
youtube     8178
Name: count, dtype: int64

In [18]:
merged_dfs['original_dataset_title'].value_counts()

original_dataset_title
Measuring Hate Speech    39495
HateXplain               20120
MLMA                      5646
Name: count, dtype: int64

In [19]:
merged_dfs['original_target'].value_counts()

original_target
[None]                                                                                                                                               6569
[women, gender]                                                                                                                                      6335
[black, race]                                                                                                                                        2580
[African]                                                                                                                                            2271
[specific_country, origin]                                                                                                                           2118
                                                                                                                                                     ... 
[latinx, other, race, immigrant, other, origin]             

I want to extract original targets to simiplify to calls out real or perceived “identity factors” of an individual or a group, including: “religion, ethnicity, nationality, race, colour, descent, gender, but also characteristics such as language, economic or social origin, disability, health status, or sexual orientation, among many others.

In [20]:
# to get all the unique values in orginal_target
flattened_list = [item for sublist in merged_dfs['original_target'] for item in sublist if item]
flattened_list = [item.lower() for item in flattened_list]
all_targets = list(set(flattened_list))

In [21]:
all_targets

['sexual_orientation',
 'buddhist',
 'race',
 'teenagers',
 'migrant_worker',
 'african_descent',
 'jewish',
 'immigrant',
 'transgender_men',
 'mormon',
 'muslims',
 'caucasian',
 'muslim',
 'indian',
 'homosexual',
 'christian',
 'men',
 'economic',
 'cognitive',
 'immigrants',
 'age',
 'arabs',
 'asian',
 'atheist',
 'none',
 'gay',
 'other',
 'gender',
 'indigenous',
 'neurological',
 'disability',
 'asians',
 'pacific_islander',
 'transgender_unspecified',
 'black',
 'undocumented',
 'white',
 'latinx',
 'visually_impaired',
 'unspecific',
 'hispanic',
 'bisexual',
 'african',
 'refugee',
 'special_needs',
 'hispanics',
 'arab',
 'middle_eastern',
 'young_adults',
 'women',
 'physical',
 'buddhism',
 'religion',
 'native_american',
 'hindu',
 'jews',
 'seniors',
 'non_binary',
 'sexuality',
 'individual',
 'transgender_women',
 'left_wing_people',
 'islam',
 'specific_country',
 'lesbian',
 'children',
 'hearing_impaired',
 'straight',
 'indian/hindu',
 'middle_aged',
 'origin',
 

In [22]:
"""
For all the targets from 
'target_race_asian',
'target_race_black',
'target_race_latinx',
'target_race_middle_eastern',
'target_race_native_american',
'target_race_pacific_islander',
'target_race_white',
'target_race_other',
'target_race',
'target_religion_atheist',
'target_religion_buddhist',
'target_religion_christian',
'target_religion_hindu',
'target_religion_jewish',
'target_religion_mormon',
'target_religion_muslim',
'target_religion_other',
'target_religion',
'target_origin_immigrant',
'target_origin_migrant_worker',
'target_origin_specific_country',
'target_origin_undocumented',
'target_origin_other',
'target_origin',
'target_gender_men',
'target_gender_non_binary',
'target_gender_transgender_men',
'target_gender_transgender_unspecified',
'target_gender_transgender_women',
'target_gender_women',
'target_gender_other',
'target_gender',
'target_sexuality_bisexual',
'target_sexuality_gay',
'target_sexuality_lesbian',
'target_sexuality_straight',
'target_sexuality_other',
'target_sexuality',
'target_age_children',
'target_age_teenagers',
'target_age_young_adults',
'target_age_middle_aged',
'target_age_seniors',
'target_age_other',
'target_age',
'target_disability_physical',
'target_disability_cognitive',
'target_disability_neurological',
'target_disability_visually_impaired',
'target_disability_hearing_impaired',
'target_disability_unspecific',
'target_disability_other',
'target_disability'
"""

"\nFor all the targets from \n'target_race_asian',\n'target_race_black',\n'target_race_latinx',\n'target_race_middle_eastern',\n'target_race_native_american',\n'target_race_pacific_islander',\n'target_race_white',\n'target_race_other',\n'target_race',\n'target_religion_atheist',\n'target_religion_buddhist',\n'target_religion_christian',\n'target_religion_hindu',\n'target_religion_jewish',\n'target_religion_mormon',\n'target_religion_muslim',\n'target_religion_other',\n'target_religion',\n'target_origin_immigrant',\n'target_origin_migrant_worker',\n'target_origin_specific_country',\n'target_origin_undocumented',\n'target_origin_other',\n'target_origin',\n'target_gender_men',\n'target_gender_non_binary',\n'target_gender_transgender_men',\n'target_gender_transgender_unspecified',\n'target_gender_transgender_women',\n'target_gender_women',\n'target_gender_other',\n'target_gender',\n'target_sexuality_bisexual',\n'target_sexuality_gay',\n'target_sexuality_lesbian',\n'target_sexuality_straigh

The following made from the limitations of hurtlex lexicon

In [23]:
race_labels = ['asian', 'black', 'latinx', 'middle_eastern', 
                'native_american', 'pacific_islander', 'white', 'race']
gender_sexuality_labels = ['gender','men','non_binary', 'transgender_men', 'transgender_unspecified',
                'transgender_women', 'women', 'sexuality', 'bisexual', 'gay', 'lesbian', 'straight']
disability_labels = ['disability','physical','cognitive','neurological','visually_impaired',
                'hearing_impaired','unspecific']
other_lables = ['religion', 'atheist', 'buddhist', 'christian', 'hindu', 'jewish', 'mormon',
                'muslim', 'immigrant', 'migrant_worker', 'specific_country', 'undocumented', 'origin',
                'age', 'children', 'teenagers', 'young_adults', 'middle_aged', 'seniors']

In [24]:
race = []
for label in race_labels:
    all_targets.remove(label)
    race.append(label)
    

In [25]:
gender_sexuality = []
for label in gender_sexuality_labels:
    all_targets.remove(label)
    gender_sexuality.append(label)
    


In [26]:
disability = []
for label in disability_labels:
    all_targets.remove(label)
    disability.append(label)


In [27]:

other = []
for label in other_lables:
    all_targets.remove(label)
    other.append(label)

In [28]:
all_targets

['sexual_orientation',
 'african_descent',
 'muslims',
 'caucasian',
 'indian',
 'homosexual',
 'economic',
 'immigrants',
 'arabs',
 'none',
 'other',
 'indigenous',
 'asians',
 'hispanic',
 'african',
 'refugee',
 'special_needs',
 'hispanics',
 'arab',
 'buddhism',
 'jews',
 'individual',
 'left_wing_people',
 'islam',
 'indian/hindu',
 'refugees']

In [29]:
more_race_labels = ['african_descent', 'indian/hindu', 'hispanics', 'african', 'arab', 'arabs', 'caucasian', 'hispanic', 'indian',
                    'asians']          

more_gender_sexuality_labels = ['homosexual', 'sexual_orientation']
more_disability_labels = ['special_needs']
more_other_labels = ['indigenous','economic', 'immigrants', 'refugee', 'left_wing_people','buddhism', 'other', 'jews', 'muslims', 
                    'individual', 'islam', 'refugees']

In [30]:
for label in more_race_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        race.append(label)
    else:
        print(f"{label} not found in all_targets")
        

removing african_descent from all_targets
removing indian/hindu from all_targets
removing hispanics from all_targets
removing african from all_targets
removing arab from all_targets
removing arabs from all_targets
removing caucasian from all_targets
removing hispanic from all_targets
removing indian from all_targets
removing asians from all_targets


In [31]:

for label in more_gender_sexuality_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        gender_sexuality.append(label)
    else:
        print(f"{label} not found in all_targets")


removing homosexual from all_targets
removing sexual_orientation from all_targets


In [32]:

for label in more_disability_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        disability.append(label)
    else:
        print(f"{label} not found in all_targets")


removing special_needs from all_targets


In [33]:

for label in more_other_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        other.append(label)
    else:
        print(f"{label} not found in all_targets")
        

removing indigenous from all_targets
removing economic from all_targets
removing immigrants from all_targets
removing refugee from all_targets
removing left_wing_people from all_targets
removing buddhism from all_targets
removing other from all_targets
removing jews from all_targets
removing muslims from all_targets
removing individual from all_targets
removing islam from all_targets
removing refugees from all_targets


In [34]:
all_targets

['none']

In [35]:
print(f"races: {race}")
print(f"gender_sexuality: {gender_sexuality}")
print(f"disability: {disability}")
print(f"other: {other}")

races: ['asian', 'black', 'latinx', 'middle_eastern', 'native_american', 'pacific_islander', 'white', 'race', 'african_descent', 'indian/hindu', 'hispanics', 'african', 'arab', 'arabs', 'caucasian', 'hispanic', 'indian', 'asians']
gender_sexuality: ['gender', 'men', 'non_binary', 'transgender_men', 'transgender_unspecified', 'transgender_women', 'women', 'sexuality', 'bisexual', 'gay', 'lesbian', 'straight', 'homosexual', 'sexual_orientation']
disability: ['disability', 'physical', 'cognitive', 'neurological', 'visually_impaired', 'hearing_impaired', 'unspecific', 'special_needs']
other: ['religion', 'atheist', 'buddhist', 'christian', 'hindu', 'jewish', 'mormon', 'muslim', 'immigrant', 'migrant_worker', 'specific_country', 'undocumented', 'origin', 'age', 'children', 'teenagers', 'young_adults', 'middle_aged', 'seniors', 'indigenous', 'economic', 'immigrants', 'refugee', 'left_wing_people', 'buddhism', 'other', 'jews', 'muslims', 'individual', 'islam', 'refugees']


In [36]:
def extract_target(target_list):

    
    result = set()
    for item in target_list:
        item_lower = item.lower()
        if item_lower in race:
            result.add('race')
        elif item_lower in gender_sexuality:
            result.add('gender_sexuality')
        elif item_lower in disability:
            result.add('disability')
        elif item_lower in other:
            result.add('other')
    
    return result if result else {'none'}

In [37]:
merged_dfs['extracted_target'] = merged_dfs['original_target'].apply(extract_target)


In [38]:
merged_dfs.head()

Unnamed: 0,text,original_label,original_target,original_id,platform,original_dataset_title,label_hatespeech_binary_offensive_not_included,label_hatespeech_binary_offensive_included,label_normal_offensive_hatespeech,extracted_target
0,i dont think im getting my baby them white 9 h...,normal,[None],1179055004553900032,twitter,HateXplain,not_hatespeech,normal,normal,{none}
1,we cannot continue calling ourselves feminists...,normal,[None],1179063826874032128,twitter,HateXplain,not_hatespeech,normal,normal,{none}
2,nawt yall niggers ignoring me,normal,[African],1178793830532956161,twitter,HateXplain,not_hatespeech,normal,normal,{race}
3,<user> i am bit confused coz chinese ppl can n...,hatespeech,[Asian],1179088797964763136,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech,{race}
4,this bitch in whataburger eating a burger with...,hatespeech,"[Caucasian, Women]",1179085312976445440,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech,"{gender_sexuality, race}"


In [39]:
merged_dfs['extracted_target'].value_counts()


extracted_target
{gender_sexuality}                             18319
{other}                                        15594
{race}                                          9555
{none}                                          7715
{other, race}                                   5171
{other, gender_sexuality}                       3017
{gender_sexuality, race}                        2172
{disability}                                    1486
{other, gender_sexuality, race}                  831
{disability, other}                              693
{disability, gender_sexuality}                   406
{disability, gender_sexuality, race, other}      108
{disability, race}                                61
{disability, gender_sexuality, other}             61
{disability, race, other}                         45
{disability, gender_sexuality, race}              27
Name: count, dtype: int64

In [40]:
merged_dfs['text'].value_counts()

text
<user> i hate you                                                                                                                                                                                                                                                                                                                                                                              14
<user> y are u focusing on that when nazis exist                                                                                                                                                                                                                                                                                                                                                4
<user> <number> bank islam                                                                                                                                                                                                     

In [41]:
merged_dfs = merged_dfs.drop_duplicates(subset=['text'])


In [42]:
merged_dfs['text'].value_counts()

text
i dont think im getting my baby them white 9 he has two white j and nikes not even touched                                                                                                                                                                          1
@soledadobrien Look at all the racist chanting. Photo snap them all and spread them around and see how they like that.                                                                                                                                              1
@AnotherBenHong Real cruelty! Real racism! URL                                                                                                                                                                                                                      1
@KatieDayXo You're brave! Sometimes I just can't stand the inevitable chatter that follows the truth. The best reaction I got was from a female lawyer who was describing business man culture and when I said I 

In [43]:
merged_dfs.shape

(65217, 10)

In [44]:
merged_dfs = merged_dfs.reset_index(drop=True)
merged_dfs['id_new'] = merged_dfs.index + 1

In [45]:
d = dtale.show(merged_dfs)
d.open_browser()

In [46]:
d.kill()

2025-03-15 18:06:59,162 - INFO     - Shutdown complete


In [47]:
merged_dfs.columns.to_list()

['text',
 'original_label',
 'original_target',
 'original_id',
 'platform',
 'original_dataset_title',
 'label_hatespeech_binary_offensive_not_included',
 'label_hatespeech_binary_offensive_included',
 'label_normal_offensive_hatespeech',
 'extracted_target',
 'id_new']

In [48]:
desired_order = ['id_new', 'text', 'extracted_target', 'label_hatespeech_binary_offensive_not_included',
                'label_hatespeech_binary_offensive_included','label_normal_offensive_hatespeech',
                'platform', 'original_dataset_title', 'original_id', 'original_label']
merged_dfs = merged_dfs[desired_order]

In [49]:
d = dtale.show(merged_dfs)
d.open_browser()

In [51]:
d.kill()

2025-03-15 18:07:14,386 - INFO     - Executing shutdown...
2025-03-15 18:07:14,387 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer


In [52]:
save_dataframe_as_pickle(merged_dfs, PROCESSED_DATA_DIR.as_posix() + '/annotated_and_targeted_hatespeech.pkl')



DataFrame saved to /home/takosaga/Projects/master_thesis/data/processed/annotated_and_targeted_hatespeech.pkl


In [54]:
merged_dfs.to_csv(PROCESSED_DATA_DIR.as_posix() + '/annotated_and_targeted_hatespeech.csv', index=False)