# Merging Datasets

This notebook details the process of merging multiple datasets into a single, unified dataset.  

In [1]:
import pandas as pd
import numpy as np
import dtale
from master_thesis.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR, load_dataframe_from_pickle, save_dataframe_as_pickle

[32m2025-04-01 14:55:44.446[0m | [1mINFO    [0m | [36mmaster_thesis.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: /home/takosaga/Projects/master_thesis[0m


In [2]:
prep_hatexplain_df = load_dataframe_from_pickle(INTERIM_DATA_DIR.as_posix() + '/prep_hatexplain_df.pkl')
prep_measuring_hate_speech_df = load_dataframe_from_pickle(INTERIM_DATA_DIR.as_posix() + '/prep_measuring_hate_speech_df.pkl')
prep_mlma_df = load_dataframe_from_pickle(INTERIM_DATA_DIR.as_posix() + '/prep_mlma_df.pkl')

In [3]:
prep_hatexplain_df.dtypes

text                                              object
original_label                                    object
original_target                                   object
original_id                                       object
platform                                          object
original_dataset_title                            object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
dtype: object

In [4]:
prep_measuring_hate_speech_df.dtypes

original_id                                         int32
text                                               object
original_label                                    float64
platform                                           object
original_target                                    object
original_dataset_title                             object
label_hatespeech_binary_offensive_not_included     object
label_hatespeech_binary_offensive_included         object
label_normal_offensive_hatespeech                  object
dtype: object

In [5]:
prep_measuring_hate_speech_df['original_id'] = prep_measuring_hate_speech_df['original_id'].astype('object')
prep_measuring_hate_speech_df['original_label'] = prep_measuring_hate_speech_df['original_label'].astype('object')

In [6]:
prep_measuring_hate_speech_df.dtypes

original_id                                       object
text                                              object
original_label                                    object
platform                                          object
original_target                                   object
original_dataset_title                            object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
dtype: object

In [7]:
prep_mlma_df.dtypes

original_id                                        int64
text                                              object
original_label                                    object
original_dataset_title                            object
platform                                          object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
original_target                                   object
dtype: object

In [8]:
prep_mlma_df['original_id'] = prep_mlma_df['original_id'].astype('object')

In [9]:
prep_mlma_df.dtypes

original_id                                       object
text                                              object
original_label                                    object
original_dataset_title                            object
platform                                          object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
original_target                                   object
dtype: object

In [10]:
print('Hatexplain shape:',prep_hatexplain_df.shape)
print('MLMA shape: ', prep_mlma_df.shape)
print('Measuring Hate Speech shape: ', prep_measuring_hate_speech_df.shape)

Hatexplain shape: (20148, 9)
MLMA shape:  (5646, 9)
Measuring Hate Speech shape:  (39495, 9)


In [11]:
merged_dfs = pd.concat([prep_hatexplain_df, prep_measuring_hate_speech_df, prep_mlma_df], ignore_index=True)

In [12]:
merged_dfs.head()

Unnamed: 0,text,original_label,original_target,original_id,platform,original_dataset_title,label_hatespeech_binary_offensive_not_included,label_hatespeech_binary_offensive_included,label_normal_offensive_hatespeech
0,i dont think im getting my baby them white 9 h...,normal,[None],1179055004553900032,twitter,HateXplain,not_hatespeech,normal,normal
1,we cannot continue calling ourselves feminists...,normal,[None],1179063826874032128,twitter,HateXplain,not_hatespeech,normal,normal
2,nawt yall niggers ignoring me,normal,[African],1178793830532956161,twitter,HateXplain,not_hatespeech,normal,normal
3,<user> i am bit confused coz chinese ppl can n...,hatespeech,[Asian],1179088797964763136,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech
4,this bitch in whataburger eating a burger with...,hatespeech,"[Caucasian, Women]",1179085312976445440,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech


In [13]:
d = dtale.show(merged_dfs)
d.open_browser()

In [14]:
d.kill()

2025-04-01 14:55:45,060 - INFO     - Shutdown complete


In [15]:
merged_dfs['platform'].value_counts()

platform
twitter    30148
reddit     15842
gab        11093
youtube     8178
nf_6           9
nf_2           8
nf_4           7
nf_3           2
nf_1           2
Name: count, dtype: int64

In [16]:
values_to_remove = ['nf_6', 'nf_2', 'nf_4', 'nf_3', 'nf_1']


In [17]:
merged_dfs = merged_dfs[~merged_dfs['platform'].isin(values_to_remove)]
merged_dfs['platform'].value_counts()

platform
twitter    30148
reddit     15842
gab        11093
youtube     8178
Name: count, dtype: int64

In [18]:
merged_dfs['original_dataset_title'].value_counts()

original_dataset_title
Measuring Hate Speech    39495
HateXplain               20120
MLMA                      5646
Name: count, dtype: int64

In [19]:
merged_dfs['original_target'].value_counts()

original_target
[None]                                                                                                                                                                                                                                                   6569
[gender_women, gender]                                                                                                                                                                                                                                   6335
[race_black, race]                                                                                                                                                                                                                                       2580
[African]                                                                                                                                                                                                                     

I want to extract original targets to simiplify to calls out real or perceived “identity factors” of an individual or a group, including: “religion, ethnicity, nationality, race, colour, descent, gender, but also characteristics such as language, economic or social origin, disability, health status, or sexual orientation, among many others.

In [20]:
# to get all the unique values in orginal_target
flattened_list = [item for sublist in merged_dfs['original_target'] for item in sublist if item]
flattened_list = [item.lower() for item in flattened_list]
all_targets = list(set(flattened_list))

In [21]:
all_targets

['hispanic',
 'origin_migrant_worker',
 'arabs',
 'religion_hindu',
 'origin_undocumented',
 'origin_other',
 'homosexual',
 'religion',
 'age_other',
 'race_latinx',
 'origin_immigrant',
 'religion_buddhist',
 'other',
 'refugee',
 'religion_christian',
 'disability_visually_impaired',
 'gender_transgender_women',
 'african_descent',
 'religion_mormon',
 'disability_unspecific',
 'disability_hearing_impaired',
 'disability',
 'indian',
 'sexuality_bisexual',
 'sexuality',
 'race_middle_eastern',
 'sexuality_straight',
 'religion_muslim',
 'special_needs',
 'caucasian',
 'age_young_adults',
 'sexuality_other',
 'indian/hindu',
 'race_pacific_islander',
 'sexual_orientation',
 'men',
 'none',
 'gay',
 'gender_non_binary',
 'race_white',
 'sexuality_lesbian',
 'hispanics',
 'disability_cognitive',
 'muslims',
 'disability_other',
 'age_teenagers',
 'age_seniors',
 'origin_specific_country',
 'left_wing_people',
 'hindu',
 'jewish',
 'sexuality_gay',
 'economic',
 'islam',
 'individual',


In [22]:
"""
For all the targets from 
'target_race_asian',
'target_race_black',
'target_race_latinx',
'target_race_middle_eastern',
'target_race_native_american',
'target_race_pacific_islander',
'target_race_white',
'target_race_other',
'target_race',
'target_religion_atheist',
'target_religion_buddhist',
'target_religion_christian',
'target_religion_hindu',
'target_religion_jewish',
'target_religion_mormon',
'target_religion_muslim',
'target_religion_other',
'target_religion',
'target_origin_immigrant',
'target_origin_migrant_worker',
'target_origin_specific_country',
'target_origin_undocumented',
'target_origin_other',
'target_origin',
'target_gender_men',
'target_gender_non_binary',
'target_gender_transgender_men',
'target_gender_transgender_unspecified',
'target_gender_transgender_women',
'target_gender_women',
'target_gender_other',
'target_gender',
'target_sexuality_bisexual',
'target_sexuality_gay',
'target_sexuality_lesbian',
'target_sexuality_straight',
'target_sexuality_other',
'target_sexuality',
'target_age_children',
'target_age_teenagers',
'target_age_young_adults',
'target_age_middle_aged',
'target_age_seniors',
'target_age_other',
'target_age',
'target_disability_physical',
'target_disability_cognitive',
'target_disability_neurological',
'target_disability_visually_impaired',
'target_disability_hearing_impaired',
'target_disability_unspecific',
'target_disability_other',
'target_disability'
"""

"\nFor all the targets from \n'target_race_asian',\n'target_race_black',\n'target_race_latinx',\n'target_race_middle_eastern',\n'target_race_native_american',\n'target_race_pacific_islander',\n'target_race_white',\n'target_race_other',\n'target_race',\n'target_religion_atheist',\n'target_religion_buddhist',\n'target_religion_christian',\n'target_religion_hindu',\n'target_religion_jewish',\n'target_religion_mormon',\n'target_religion_muslim',\n'target_religion_other',\n'target_religion',\n'target_origin_immigrant',\n'target_origin_migrant_worker',\n'target_origin_specific_country',\n'target_origin_undocumented',\n'target_origin_other',\n'target_origin',\n'target_gender_men',\n'target_gender_non_binary',\n'target_gender_transgender_men',\n'target_gender_transgender_unspecified',\n'target_gender_transgender_women',\n'target_gender_women',\n'target_gender_other',\n'target_gender',\n'target_sexuality_bisexual',\n'target_sexuality_gay',\n'target_sexuality_lesbian',\n'target_sexuality_straigh

Going based of United nations Hate speech calls out real or perceived “identity factors” of an individual or a group, including: “religion, ethnicity, nationality, race, colour, descent, gender,” but also characteristics such as language, economic or social origin, disability, health status, or sexual orientation, among many others.

In [23]:
race_labels =  []
religion_labels = []
nationality_labels = [] # start with origin
gender_labels = []
sexuality_labels = []
disability_labels = []
other_lables = [] # age

In [24]:
for target in all_targets:
    if target.startswith('race'):
        race_labels.append(target)
        if target.startswith('race_'):
            race_labels.append(target[5:])

    if target.startswith('religion'):
        religion_labels.append(target)
        if target.startswith('religion_'):
            religion_labels.append(target[9:])

    if target.startswith('origin'):
        nationality_labels.append(target)
        if target.startswith('origin_'):
            nationality_labels.append(target[7:])

    if target.startswith('gender'):
        gender_labels.append(target)
        if target.startswith('gender_'):
            gender_labels.append(target[7:])

    if target.startswith('sexuality'):
        sexuality_labels.append(target)
        if target.startswith('sexuality_'):
            sexuality_labels.append(target[10:])


    if target.startswith('disability'):
        disability_labels.append(target)
        if target.startswith('disability_'):
            disability_labels.append(target[11:])

    if target.startswith('age'):
        other_lables.append(target)
        if target.startswith('age_'):
            other_lables.append(target[4:])

Sanity check of seeing labels and removing singular other

In [25]:
race_labels.remove('other')
race_labels

['race_latinx',
 'latinx',
 'race_middle_eastern',
 'middle_eastern',
 'race_pacific_islander',
 'pacific_islander',
 'race_white',
 'white',
 'race',
 'race_other',
 'race_black',
 'black',
 'race_asian',
 'asian',
 'race_native_american',
 'native_american']

In [26]:
religion_labels.remove('other')
religion_labels

['religion_hindu',
 'hindu',
 'religion',
 'religion_buddhist',
 'buddhist',
 'religion_christian',
 'christian',
 'religion_mormon',
 'mormon',
 'religion_muslim',
 'muslim',
 'religion_jewish',
 'jewish',
 'religion_atheist',
 'atheist',
 'religion_other']

In [27]:
nationality_labels.remove('other')
nationality_labels

['origin_migrant_worker',
 'migrant_worker',
 'origin_undocumented',
 'undocumented',
 'origin_other',
 'origin_immigrant',
 'immigrant',
 'origin_specific_country',
 'specific_country',
 'origin']

In [28]:
sexuality_labels.remove('other')
sexuality_labels

['sexuality_bisexual',
 'bisexual',
 'sexuality',
 'sexuality_straight',
 'straight',
 'sexuality_other',
 'sexuality_lesbian',
 'lesbian',
 'sexuality_gay',
 'gay']

In [29]:
gender_labels.remove('other')
gender_labels

['gender_transgender_women',
 'transgender_women',
 'gender_non_binary',
 'non_binary',
 'gender_transgender_men',
 'transgender_men',
 'gender_transgender_unspecified',
 'transgender_unspecified',
 'gender_other',
 'gender',
 'gender_men',
 'men',
 'gender_women',
 'women']

In [30]:
disability_labels.remove('other')
disability_labels

['disability_visually_impaired',
 'visually_impaired',
 'disability_unspecific',
 'unspecific',
 'disability_hearing_impaired',
 'hearing_impaired',
 'disability',
 'disability_cognitive',
 'cognitive',
 'disability_other',
 'disability_physical',
 'physical',
 'disability_neurological',
 'neurological']

In [31]:
other_lables

['age_other',
 'other',
 'age_young_adults',
 'young_adults',
 'age_teenagers',
 'teenagers',
 'age_seniors',
 'seniors',
 'age_middle_aged',
 'middle_aged',
 'age_children',
 'children',
 'age']

In [32]:
race = []
for label in race_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        race.append(label)
    

removing race_latinx from all_targets
removing race_middle_eastern from all_targets
removing race_pacific_islander from all_targets
removing race_white from all_targets
removing race from all_targets
removing race_other from all_targets
removing race_black from all_targets
removing race_asian from all_targets
removing asian from all_targets
removing race_native_american from all_targets


In [33]:
religion = []
for label in religion_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        religion.append(label)

removing religion_hindu from all_targets
removing hindu from all_targets
removing religion from all_targets
removing religion_buddhist from all_targets
removing religion_christian from all_targets
removing christian from all_targets
removing religion_mormon from all_targets
removing religion_muslim from all_targets
removing religion_jewish from all_targets
removing jewish from all_targets
removing religion_atheist from all_targets
removing religion_other from all_targets


In [34]:
nationality = []
for label in nationality_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        nationality.append(label)

removing origin_migrant_worker from all_targets
removing origin_undocumented from all_targets
removing origin_other from all_targets
removing origin_immigrant from all_targets
removing origin_specific_country from all_targets
removing origin from all_targets


In [35]:
gender = []
for label in gender_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        gender.append(label)

removing gender_transgender_women from all_targets
removing gender_non_binary from all_targets
removing gender_transgender_men from all_targets
removing gender_transgender_unspecified from all_targets
removing gender_other from all_targets
removing gender from all_targets
removing gender_men from all_targets
removing men from all_targets
removing gender_women from all_targets
removing women from all_targets


In [36]:
sexuality = []
for label in sexuality_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        sexuality.append(label)
    


removing sexuality_bisexual from all_targets
removing sexuality from all_targets
removing sexuality_straight from all_targets
removing sexuality_other from all_targets
removing sexuality_lesbian from all_targets
removing sexuality_gay from all_targets
removing gay from all_targets


In [37]:
disability = []
for label in disability_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        disability.append(label)


removing disability_visually_impaired from all_targets
removing disability_unspecific from all_targets
removing disability_hearing_impaired from all_targets
removing disability from all_targets
removing disability_cognitive from all_targets
removing disability_other from all_targets
removing disability_physical from all_targets
removing disability_neurological from all_targets


In [38]:

other = []
for label in other_lables:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        other.append(label)

removing age_other from all_targets
removing other from all_targets
removing age_young_adults from all_targets
removing age_teenagers from all_targets
removing age_seniors from all_targets
removing age_middle_aged from all_targets
removing age_children from all_targets
removing age from all_targets


In [39]:
all_targets

['hispanic',
 'arabs',
 'homosexual',
 'refugee',
 'african_descent',
 'indian',
 'special_needs',
 'caucasian',
 'indian/hindu',
 'sexual_orientation',
 'none',
 'hispanics',
 'muslims',
 'left_wing_people',
 'economic',
 'islam',
 'individual',
 'arab',
 'refugees',
 'immigrants',
 'buddhism',
 'asians',
 'indigenous',
 'african',
 'jews']

In [40]:
more_race_labels = ['african', 'asians','caucasian','hispanic','indigenous','hispanics','african_descent','indian/hindu']    
more_religion_labels = ['muslims', 'jews', 'islam', 'buddhism']
more_nationality_labels = ['arab', 'indian','refugee','immigrants','arabs','refugees']
more_sexuality_labels = ['homosexual','sexual_orientation']
more_disability_labels = ['special_needs']
more_other_labels = ['economic','left_wing_people','individual']

In [41]:
for label in more_race_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        race.append(label)
    else:
        print(f"{label} not found in all_targets")
        

removing african from all_targets
removing asians from all_targets
removing caucasian from all_targets
removing hispanic from all_targets
removing indigenous from all_targets
removing hispanics from all_targets
removing african_descent from all_targets
removing indian/hindu from all_targets


In [42]:
for label in more_religion_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        religion.append(label)
    else:
        print(f"{label} not found in all_targets")

removing muslims from all_targets
removing jews from all_targets
removing islam from all_targets
removing buddhism from all_targets


In [43]:
for label in more_nationality_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        nationality.append(label)
    else:
        print(f"{label} not found in all_targets")

removing arab from all_targets
removing indian from all_targets
removing refugee from all_targets
removing immigrants from all_targets
removing arabs from all_targets
removing refugees from all_targets


In [44]:
for label in more_sexuality_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        sexuality.append(label)
    else:
        print(f"{label} not found in all_targets")

removing homosexual from all_targets
removing sexual_orientation from all_targets


In [45]:

for label in more_disability_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        disability.append(label)
    else:
        print(f"{label} not found in all_targets")


removing special_needs from all_targets


In [46]:

for label in more_other_labels:
    if label in all_targets:
        print(f"removing {label} from all_targets")
        all_targets.remove(label)
        other.append(label)
    else:
        print(f"{label} not found in all_targets")
        

removing economic from all_targets
removing left_wing_people from all_targets
removing individual from all_targets


In [47]:
all_targets

['none']

In [48]:
print(f"races: {race}")
print(f"religions: {religion}")
print(f"nationalities: {nationality}")
print(f"gender: {gender}")
print(f'sexuality: {sexuality}')
print(f"disability: {disability}")
print(f"other: {other}")

races: ['race_latinx', 'race_middle_eastern', 'race_pacific_islander', 'race_white', 'race', 'race_other', 'race_black', 'race_asian', 'asian', 'race_native_american', 'african', 'asians', 'caucasian', 'hispanic', 'indigenous', 'hispanics', 'african_descent', 'indian/hindu']
religions: ['religion_hindu', 'hindu', 'religion', 'religion_buddhist', 'religion_christian', 'christian', 'religion_mormon', 'religion_muslim', 'religion_jewish', 'jewish', 'religion_atheist', 'religion_other', 'muslims', 'jews', 'islam', 'buddhism']
nationalities: ['origin_migrant_worker', 'origin_undocumented', 'origin_other', 'origin_immigrant', 'origin_specific_country', 'origin', 'arab', 'indian', 'refugee', 'immigrants', 'arabs', 'refugees']
gender: ['gender_transgender_women', 'gender_non_binary', 'gender_transgender_men', 'gender_transgender_unspecified', 'gender_other', 'gender', 'gender_men', 'men', 'gender_women', 'women']
sexuality: ['sexuality_bisexual', 'sexuality', 'sexuality_straight', 'sexuality_o

In [49]:
def extract_target(target_list):

    
    result = set()
    for item in target_list:
        item_lower = item.lower()
        if item_lower in race:
            result.add('race')
        elif item_lower in religion:
            result.add('religion')
        elif item_lower in nationality:
            result.add('nationality')
        elif item_lower in gender:
            result.add('gender')
        elif item_lower in sexuality:
            result.add('sexuality')
        elif item_lower in disability:
            result.add('disability')
        elif item_lower in other:
            result.add('other')
    
    return result if result else {'none'}

In [50]:
merged_dfs['extracted_target'] = merged_dfs['original_target'].apply(extract_target)


In [51]:
merged_dfs.head()

Unnamed: 0,text,original_label,original_target,original_id,platform,original_dataset_title,label_hatespeech_binary_offensive_not_included,label_hatespeech_binary_offensive_included,label_normal_offensive_hatespeech,extracted_target
0,i dont think im getting my baby them white 9 h...,normal,[None],1179055004553900032,twitter,HateXplain,not_hatespeech,normal,normal,{none}
1,we cannot continue calling ourselves feminists...,normal,[None],1179063826874032128,twitter,HateXplain,not_hatespeech,normal,normal,{none}
2,nawt yall niggers ignoring me,normal,[African],1178793830532956161,twitter,HateXplain,not_hatespeech,normal,normal,{race}
3,<user> i am bit confused coz chinese ppl can n...,hatespeech,[Asian],1179088797964763136,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech,{race}
4,this bitch in whataburger eating a burger with...,hatespeech,"[Caucasian, Women]",1179085312976445440,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech,"{race, gender}"


In [52]:
merged_dfs['extracted_target'].value_counts()


extracted_target
{gender}                                         11087
{race}                                            9800
{none}                                            7715
{religion}                                        7223
{sexuality}                                       5855
                                                 ...  
{race, religion, gender, disability}                 1
{sexuality, race, gender, nationality, other}        1
{religion, disability, gender, sexuality}            1
{religion, nationality, gender, other}               1
{race, disability, nationality, other}               1
Name: count, Length: 101, dtype: int64

In [53]:
merged_dfs['text'].value_counts()

text
<user> i hate you                                                                                                                                                                                                                                                                                                                                                                              14
<user> y are u focusing on that when nazis exist                                                                                                                                                                                                                                                                                                                                                4
<user> <number> bank islam                                                                                                                                                                                                     

In [54]:
merged_dfs = merged_dfs.drop_duplicates(subset=['text'])


In [55]:
merged_dfs['text'].value_counts()

text
i dont think im getting my baby them white 9 he has two white j and nikes not even touched                                                                                                                                                                          1
@soledadobrien Look at all the racist chanting. Photo snap them all and spread them around and see how they like that.                                                                                                                                              1
@AnotherBenHong Real cruelty! Real racism! URL                                                                                                                                                                                                                      1
@KatieDayXo You're brave! Sometimes I just can't stand the inevitable chatter that follows the truth. The best reaction I got was from a female lawyer who was describing business man culture and when I said I 

In [56]:
merged_dfs.shape

(65217, 10)

In [57]:
merged_dfs = merged_dfs.reset_index(drop=True)
merged_dfs['id_new'] = merged_dfs.index + 1

In [58]:
d = dtale.show(merged_dfs)
d.open_browser()

In [59]:
d.kill()

2025-04-01 14:55:47,003 - INFO     - Shutdown complete


In [60]:
merged_dfs.columns.to_list()

['text',
 'original_label',
 'original_target',
 'original_id',
 'platform',
 'original_dataset_title',
 'label_hatespeech_binary_offensive_not_included',
 'label_hatespeech_binary_offensive_included',
 'label_normal_offensive_hatespeech',
 'extracted_target',
 'id_new']

In [61]:
desired_order = ['id_new', 'text', 'extracted_target', 'label_hatespeech_binary_offensive_not_included',
                'label_hatespeech_binary_offensive_included','label_normal_offensive_hatespeech',
                'platform', 'original_dataset_title', 'original_id', 'original_label']
merged_dfs = merged_dfs[desired_order]

In [62]:
d = dtale.show(merged_dfs)
d.open_browser()

In [63]:
d.kill()

2025-04-01 14:55:47,424 - INFO     - Shutdown complete


In [64]:
save_dataframe_as_pickle(merged_dfs, PROCESSED_DATA_DIR.as_posix() + '/annotated_and_targeted_hatespeech.pkl')



DataFrame saved to /home/takosaga/Projects/master_thesis/data/processed/annotated_and_targeted_hatespeech.pkl


In [66]:
merged_dfs.to_csv(PROCESSED_DATA_DIR.as_posix() + '/annotated_and_targeted_hatespeech.csv', index=False)