# Merging Datasets

This notebook merges the three preprocessed datasets (HateXplain, Measuring Hate Speech, and MLMA) into a single, unified dataset.

In [1]:
import pandas as pd
import numpy as np
import dtale
from master_thesis.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR, load_dataframe_from_pickle, save_dataframe_as_pickle

2025-03-15 10:17:22.378 | INFO     | master_thesis.config:<module>:12 - PROJ_ROOT path is: /home/takosaga/Projects/master_thesis


In [2]:
# Load the three preprocessed frames from the interim data directory.
# The directory is rendered to a POSIX string once and reused for each file.
interim_dir = INTERIM_DATA_DIR.as_posix()

prep_hatexplain_df = load_dataframe_from_pickle(
    interim_dir + '/prep_hatexplain_df.pkl'
)
prep_measuring_hate_speech_df = load_dataframe_from_pickle(
    interim_dir + '/prep_measuring_hate_speech_df.pkl'
)
prep_mlma_df = load_dataframe_from_pickle(
    interim_dir + '/prep_mlma_df.pkl'
)

In [3]:
# Show the per-column dtypes of the HateXplain frame so they can be compared
# with the other two frames before merging.
prep_hatexplain_df.dtypes

text                                              object
original_label                                    object
original_target                                   object
original_id                                       object
platform                                          object
original_dataset_title                            object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
dtype: object

In [4]:
# Show the per-column dtypes of the Measuring Hate Speech frame; any column
# whose dtype differs from the HateXplain frame needs attention before merging.
prep_measuring_hate_speech_df.dtypes

original_id                                         int32
text                                               object
original_label                                    float64
platform                                           object
original_target                                    object
original_dataset_title                             object
label_hatespeech_binary_offensive_not_included     object
label_hatespeech_binary_offensive_included         object
label_normal_offensive_hatespeech                  object
dtype: object

In [10]:
# original_id (int32) and original_label (float64) are the only columns of this
# frame that are not already object; cast both so every column has the same
# dtype as in the HateXplain frame before the frames are concatenated.
# NOTE(review): the cast only changes the dtype - the values stay numeric, so
# the merged columns will presumably mix numbers with HateXplain's values.
# Confirm that is intended (otherwise cast to str).
for column_name in ('original_id', 'original_label'):
    prep_measuring_hate_speech_df[column_name] = (
        prep_measuring_hate_speech_df[column_name].astype('object')
    )

In [11]:
# Re-check the dtypes after the cast: original_id and original_label should
# now be reported as object like every other column.
prep_measuring_hate_speech_df.dtypes

original_id                                       object
text                                              object
original_label                                    object
platform                                          object
original_target                                   object
original_dataset_title                            object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
dtype: object

In [5]:
# Show the per-column dtypes of the MLMA frame so they can be compared with
# the other two frames before merging.
prep_mlma_df.dtypes

original_id                                        int64
text                                              object
original_label                                    object
original_dataset_title                            object
platform                                          object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
original_target                                   object
dtype: object

In [13]:
# original_id is int64 in the MLMA frame but object in the HateXplain frame;
# cast it so the column has the same dtype in every frame before concatenation.
# NOTE(review): the cast only changes the dtype - the ids stay integers, so the
# merged original_id column will presumably hold mixed value types. Confirm
# that is intended (otherwise cast to str).
prep_mlma_df['original_id'] = prep_mlma_df['original_id'].astype('object')

In [14]:
# Re-check the dtypes after the cast: original_id should now be reported as
# object like every other column.
prep_mlma_df.dtypes

original_id                                       object
text                                              object
original_label                                    object
original_dataset_title                            object
platform                                          object
label_hatespeech_binary_offensive_not_included    object
label_hatespeech_binary_offensive_included        object
label_normal_offensive_hatespeech                 object
original_target                                   object
dtype: object

In [9]:
# Report (rows, columns) of each prepared frame before merging, so the row
# count of the merged frame can be checked against the sum of these.
# print() inserts its own separator between the label and the shape.
labelled_frames = (
    ('Hatexplain shape:', prep_hatexplain_df),
    ('MLMA shape: ', prep_mlma_df),
    ('Measuring Hate Speech shape: ', prep_measuring_hate_speech_df),
)
for label, frame in labelled_frames:
    print(label, frame.shape)

Hatexplain shape: (20148, 9)
MLMA shape:  (5646, 9)
Measuring Hate Speech shape:  (39495, 9)


In [15]:
# Stack the three prepared frames row-wise. ignore_index=True discards the
# per-frame indexes and gives the merged frame a fresh 0..n-1 index.
frames_to_merge = [
    prep_hatexplain_df,
    prep_measuring_hate_speech_df,
    prep_mlma_df,
]
merged_dfs = pd.concat(frames_to_merge, ignore_index=True)

In [17]:
# Preview the first rows of the merged frame to confirm the columns line up.
merged_dfs.head()

Unnamed: 0,text,original_label,original_target,original_id,platform,original_dataset_title,label_hatespeech_binary_offensive_not_included,label_hatespeech_binary_offensive_included,label_normal_offensive_hatespeech
0,i dont think im getting my baby them white 9 h...,normal,[None],1179055004553900032,twitter,HateXplain,not_hatespeech,normal,normal
1,we cannot continue calling ourselves feminists...,normal,[None],1179063826874032128,twitter,HateXplain,not_hatespeech,normal,normal
2,nawt yall niggers ignoring me,normal,[African],1178793830532956161,twitter,HateXplain,not_hatespeech,normal,normal
3,<user> i am bit confused coz chinese ppl can n...,hatespeech,[Asian],1179088797964763136,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech
4,this bitch in whataburger eating a burger with...,hatespeech,"[Caucasian, Women]",1179085312976445440,twitter,HateXplain,hatespeech,hatespeech/offensive,hatespeech


In [21]:
# Open the merged frame in D-Tale for interactive, manual inspection.
# NOTE(review): open_browser() presumably launches a browser tab, which is a
# side effect on every "Run All" - consider dropping this cell (and the
# matching d.kill() cell) from the final notebook.
d = dtale.show(merged_dfs)
d.open_browser()

In [20]:
# Shut down the D-Tale instance held in `d` once the manual inspection is done.
d.kill()

2025-03-15 12:28:12,789 - INFO     - Executing shutdown...
2025-03-15 12:28:12,790 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer


In [22]:
# Count rows per platform to spot unexpected platform values in the merged frame.
merged_dfs['platform'].value_counts()

platform
twitter    30148
reddit     15842
gab        11093
youtube     8178
nf_6           9
nf_2           8
nf_4           7
nf_3           2
nf_1           2
Name: count, dtype: int64

In [23]:
# Platform codes that show up only a handful of times in the merged frame and
# are not one of the real platforms (twitter, reddit, gab, youtube); rows
# carrying them are dropped in the next step.
values_to_remove = [
    'nf_6',
    'nf_2',
    'nf_4',
    'nf_3',
    'nf_1',
]


In [24]:
# Keep only rows whose platform is NOT one of the codes in values_to_remove,
# then recount the platforms to confirm those codes are gone.
# Note: the index is not reset, so it has gaps where rows were dropped.
has_valid_platform = ~merged_dfs['platform'].isin(values_to_remove)
merged_dfs = merged_dfs[has_valid_platform]
merged_dfs['platform'].value_counts()

platform
twitter    30148
reddit     15842
gab        11093
youtube     8178
Name: count, dtype: int64

In [25]:
# Count rows per source dataset after the platform filter; comparing these
# with the shapes printed before the merge shows which dataset lost rows.
merged_dfs['original_dataset_title'].value_counts()

original_dataset_title
Measuring Hate Speech    39495
HateXplain               20120
MLMA                      5646
Name: count, dtype: int64