### Datasets:
#### English
1. Aalto `train.tsv` (loaded below from `data/train.tsv`)
#### German
2. https://github.com/germeval2021toxic/SharedTask
#### Finnish
3. https://huggingface.co/datasets/TurkuNLP/Suomi24-toxicity-annotated

### Import Libs

In [2]:
import pandas as pd

### Load Datasets

In [73]:
# English corpus: tab-separated file whose first row is the header.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters are read as literal text.
eng_df = pd.read_csv(
    "data/train.tsv",
    sep='\t',
    header=0,
    quoting=3,
)
eng_df.head()

Unnamed: 0,id,text,label
0,eng_train0,I supported Barack Obama. I thought it was abs...,0
1,eng_train1,what to hell with that!,1
2,eng_train2,"and the stupidity of the haters continues, thi...",1
3,eng_train3,Alberta has been in debt under the Conservativ...,0
4,eng_train4,"The TV is in Channel Search mode, and I have p...",0


In [11]:
# German corpus: a regular comma-separated file, so the read_csv defaults apply.
ger_df = pd.read_csv(filepath_or_buffer="data/GermEval21_TrainData.csv")
ger_df.head()

Unnamed: 0,comment_id,comment_text,Sub1_Toxic,Sub2_Engaging,Sub3_FactClaiming
0,1,Ziemlich traurig diese Kommentare zu lesen. Ih...,0,0,0
1,2,"Sag ich doch, wir befeuern den Klimawandel. Ra...",0,1,1
2,3,"Schublade auf, Schublade zu. Zu mehr Denkleist...",1,0,0
3,4,Dummerweise haben wir in der EU und in der USA...,0,0,1
4,5,"""So lange Gewinnmaximierung Vorrang hat, wird ...",0,0,0


In [12]:
# Finnish corpus: tab-separated file whose first row is the header.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters are read as literal text.
fin_df = pd.read_csv(
    "data/finnish_toxicity.tsv",
    sep='\t',
    header=0,
    quoting=3,
)
fin_df.head()

Unnamed: 0,ID,label,text
0,s24:55342ca8ce0d352026c8c0e53c70dc8b,toxicity,Persujen mamu-puputus kyllästyttää mutta mamu...
1,s24:4ca966fd3197849f902c2ee7f74f2205,toxicity,Suvakit ja hyysärit haluaa saada lapset omaan ...
2,s24:64844416909031b96232f1d6249d63f6,not-toxicity,"VIELÄ YKSI ASIA! NÄMÄ EI OLE HENKILÖVAALIT,VAA..."
3,s24:fa3e78cbbd4d0460c2abc68f274c2596,toxicity,Halosen aikana on lapsiin ja naisiin kohdistun...
4,s24:b3ba1cf93f83f502c287c344f09f7a11,toxicity,"Byää! Byää! Jos tulee vammoja, byää! Saatanan ..."


### Analysis

In [74]:
# Column names, dtypes and non-null counts of the three raw frames.
# Each print("\n") leaves two blank lines between consecutive summaries.
eng_df.info()
print("\n")
ger_df.info()
print("\n")
fin_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99000 entries, 0 to 98999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      99000 non-null  object
 1   text    99000 non-null  object
 2   label   99000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3244 entries, 0 to 3243
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   comment_id         3244 non-null   int64 
 1   comment_text       3244 non-null   object
 2   Sub1_Toxic         3244 non-null   int64 
 3   Sub2_Engaging      3244 non-null   int64 
 4   Sub3_FactClaiming  3244 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 126.8+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  

### Data Cleaning and Reshaping
1. Clean the datasets
2. Reshape each dataset to match the Jigsaw training schema (`id`, `comment_text`, `toxic`)
3. Create 3 training datasets containing 100, 200, and 350 rows per language (300, 600, and 1050 rows in total)

In [88]:
def rename_ids(df, lang_prefix):
    """Give every row a sequential id of the form ``<lang_prefix>_train_<n>``.

    The row index is reset to 0..len-1 first, so ``<n>`` is the row's
    position in the frame. Any existing ``id`` column is overwritten, and
    ``id`` is always the first column of the result.

    Args:
        df: DataFrame to relabel. ``None`` and empty frames are returned
            unchanged.
        lang_prefix: String placed in front of ``_train_<n>``.

    Returns:
        A new DataFrame with the ``id`` column first; the input frame is not
        modified. For ``None`` or an empty frame, the input itself.
    """
    if df is None or df.empty:
        return df

    # reset_index returns a new frame, so the caller's frame stays untouched.
    df = df.reset_index(drop=True)
    df['id'] = [f"{lang_prefix}_train_{i}" for i in df.index]

    # A freshly created 'id' column is appended last; move it to the front.
    cols = ['id'] + [col for col in df.columns if col != 'id']
    return df[cols]

In [91]:
# Rename the English columns to comment_text / toxic, then regenerate the ids.
# DataFrame.rename returns a new frame, so eng_df itself is left as loaded.
eng_df_cleaned = rename_ids(
    eng_df.rename(columns={'text': 'comment_text', 'label': 'toxic'}),
    'eng',
)

eng_df_cleaned.info()
eng_df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99000 entries, 0 to 98999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            99000 non-null  object
 1   comment_text  99000 non-null  object
 2   toxic         99000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


Unnamed: 0,id,comment_text,toxic
0,eng_train_0,I supported Barack Obama. I thought it was abs...,0
1,eng_train_1,what to hell with that!,1
2,eng_train_2,"and the stupidity of the haters continues, thi...",1
3,eng_train_3,Alberta has been in debt under the Conservativ...,0
4,eng_train_4,"The TV is in Channel Search mode, and I have p...",0


In [95]:
# Keep only the id, text and Sub1_Toxic columns of the German frame, rename
# them to id / toxic, then regenerate the ids.
ger_df_cleaned = rename_ids(
    ger_df[['comment_id', 'comment_text', 'Sub1_Toxic']].rename(
        columns={'comment_id': 'id', 'Sub1_Toxic': 'toxic'}
    ),
    'ger',
)

ger_df_cleaned.info()
ger_df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3244 entries, 0 to 3243
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            3244 non-null   object
 1   comment_text  3244 non-null   object
 2   toxic         3244 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 76.2+ KB


Unnamed: 0,id,comment_text,toxic
0,ger_train_0,Ziemlich traurig diese Kommentare zu lesen. Ih...,0
1,ger_train_1,"Sag ich doch, wir befeuern den Klimawandel. Ra...",0
2,ger_train_2,"Schublade auf, Schublade zu. Zu mehr Denkleist...",1
3,ger_train_3,Dummerweise haben wir in der EU und in der USA...,0
4,ger_train_4,"""So lange Gewinnmaximierung Vorrang hat, wird ...",0


In [93]:
# Only rows carrying one of these three labels are kept.
clean_labels = ['toxicity', 'not-toxicity', 'severe_toxicity']
# Both toxicity labels collapse to 1; 'not-toxicity' becomes 0.
label_map = {'toxicity': 1, 'severe_toxicity': 1, 'not-toxicity': 0}

fin_df_cleaned = (
    fin_df[fin_df['label'].isin(clean_labels)]
    .assign(toxic=lambda frame: frame['label'].map(label_map))
    .rename(columns={'ID': 'id', 'text': 'comment_text'})
    .drop(columns=['label'])
)
fin_df_cleaned = rename_ids(fin_df_cleaned, 'fin')

fin_df_cleaned.info()
fin_df_cleaned.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            376 non-null    object
 1   comment_text  376 non-null    object
 2   toxic         376 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 8.9+ KB


Unnamed: 0,id,comment_text,toxic
0,fin_train_0,Persujen mamu-puputus kyllästyttää mutta mamu...,1
1,fin_train_1,Suvakit ja hyysärit haluaa saada lapset omaan ...,1
2,fin_train_2,"VIELÄ YKSI ASIA! NÄMÄ EI OLE HENKILÖVAALIT,VAA...",0
3,fin_train_3,Halosen aikana on lapsiin ja naisiin kohdistun...,1
4,fin_train_4,"Byää! Byää! Jos tulee vammoja, byää! Saatanan ...",1


In [82]:
# Count rows whose comment_text repeats an earlier row's text.
fin_duplicates_count = fin_df_cleaned.duplicated(subset=['comment_text']).sum()
print(f"Number of duplicate: {fin_duplicates_count}")

Number of duplicate: 0


In [101]:
# Number of rows taken from EACH language for every training set.
train_sizes = [100, 200, 350]
train_dfs = {}

for train_size in train_sizes:
    # The key names the combined size: three languages x train_size rows.
    df_key = f"train_df_{train_size*3}"

    # Stack the first train_size rows of each language: English, German, Finnish.
    train_dfs[df_key] = pd.concat(
        [
            eng_df_cleaned.head(train_size),
            ger_df_cleaned.head(train_size),
            fin_df_cleaned.head(train_size),
        ],
        ignore_index=True,
    )

    # head() silently returns fewer rows when a frame is too short, which would
    # make the size in the key (and in the exported file name) wrong.
    assert len(train_dfs[df_key]) == train_size * 3, (
        f"{df_key}: expected {train_size * 3} rows, got {len(train_dfs[df_key])}; "
        f"a source frame has fewer than {train_size} rows"
    )

train_dfs['train_df_300'].head(300)

Unnamed: 0,id,comment_text,toxic
0,eng_train_0,I supported Barack Obama. I thought it was abs...,0
1,eng_train_1,what to hell with that!,1
2,eng_train_2,"and the stupidity of the haters continues, thi...",1
3,eng_train_3,Alberta has been in debt under the Conservativ...,0
4,eng_train_4,"The TV is in Channel Search mode, and I have p...",0
...,...,...,...
295,fin_train_95,"""Siis jopas on typerää! Vain Ari Koivusen fani...",1
296,fin_train_96,http://karavaanari.info/index.php?option=com_s...,0
297,fin_train_97,otetaanko huomioon myös raiskatut ja ylihyväil...,1
298,fin_train_98,Ptörä irti ja kiristät yhtä mutteria siellä va...,0


### Export to TSV 

In [102]:
# Every combined training set is written as <key>.tsv inside this directory.
file_root = 'data'

for df_key, df_value in train_dfs.items():
    file_name = f"{df_key}.tsv"
    file_loc = '/'.join([file_root, file_name])

    # Tab-separated, without the positional row index.
    df_value.to_csv(file_loc, sep='\t', index=False)
    print(f"Successfully exported DataFrame '{df_key}' to file '{file_name}'")

Successfully exported DataFrame 'train_df_300' to file 'train_df_300.tsv'
Successfully exported DataFrame 'train_df_600' to file 'train_df_600.tsv'
Successfully exported DataFrame 'train_df_1050' to file 'train_df_1050.tsv'
