# Simplify the problem from multilabel to multiclass

Load original data

In [1]:
import pandas as pd

# Read both splits of the original multilabel data, using the "id" column as the index.
train, test = (
    pd.read_csv(path, index_col="id")
    for path in ("data/toxic_train.csv.zip", "data/toxic_test.csv.zip")
)

In [2]:
# Preview the first ten rows of the training split.
train.iloc[:10]

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
e0fdfd98c66fb643,"""\n\n Huggle not working \n\nHi Gurch. There i...",0,0,0,0,0,0
1864753b5fb6c9a3,Mossad actually. I know where you live.,0,0,0,0,0,0
ce1db53fb22d399c,REDIRECT Talk:UFC Fight Night: Belfort vs. Hen...,0,0,0,0,0,0
fed4f08d59399398,"""\n\nUPA IRC\nWhat about 19:00 UTC? e | ταλκ """,0,0,0,0,0,0
06e7f93938ad9e72,"""\nI've re-added your information, together wi...",0,0,0,0,0,0
4a5e851879fdd674,"""\nI'm not an elitist, I'm just spreading the ...",0,0,0,0,0,0
ff39db4975a78363,"""\n\nIt is not listed on this European list as...",0,0,0,0,0,0
73cc03c5e157ce86,You made a mistake you ass.,1,0,1,0,0,0
ca0891e20b7bbd66,Lol dynamic IP. Just you try to stop me! 82.13...,0,0,0,0,0,0
b890cc6153e51480,"""Thanks for trying to fix Neil Steinberg. I ju...",0,0,0,0,0,0


Compute per-label counts and rank the labels by rarity (rarest first)

In [3]:
# Every column except the text is a label column. Exclude the text column by
# name rather than by position, so the list does not depend on "comment_text"
# being the first column. The order of this list matters: `recode_labels`
# uses it to locate each label's flag by position within a row.
all_labels = train.columns.drop("comment_text").tolist()
all_labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
# Count the positives of each label and order the labels from rarest to most
# frequent (ascending sort). The label columns are selected by name through
# `all_labels`, so only those columns are summed even if the frame has gained
# other columns (for example a derived string column) by the time this runs.
ranked_labels = train[all_labels].sum(axis=0).sort_values().index.tolist()
ranked_labels

['threat', 'identity_hate', 'severe_toxic', 'insult', 'obscene', 'toxic']

Recode labels: for texts with one or more labels, keep only the rarest one; texts with no labels are recoded as "normal".

In [5]:
def recode_labels(labels, priority=None, label_names=None):
    """Collapse one row of multilabel flags into a single class name.

    Parameters
    ----------
    labels : sequence or pandas.Series
        Truthy/falsy flags, one per label, positioned in the same order as
        ``label_names``.
    priority : list of str, optional
        Label names ordered from highest to lowest priority. Defaults to the
        notebook-level ``ranked_labels``.
    label_names : list of str, optional
        Label names in the positional order of ``labels``. Defaults to the
        notebook-level ``all_labels``.

    Returns
    -------
    str or None
        ``"normal"`` when no flag is set; otherwise the first name in
        ``priority`` whose flag is set. ``None`` when flags are set only for
        labels that do not appear in ``priority``.

    Raises
    ------
    ValueError
        If a name in ``priority`` is missing from ``label_names``.
    """
    if priority is None:
        priority = ranked_labels
    if label_names is None:
        label_names = all_labels

    # Materialize the values so the lookups below are purely positional.
    # Integer indexing on a label-indexed pandas Series (``labels[3]``) relies
    # on a positional fallback that is deprecated since pandas 2.1.
    flags = list(labels)

    if not any(flags):
        return "normal"

    for label in priority:
        if flags[label_names.index(label)]:
            return label

    # Reached only when the set flags belong to labels absent from `priority`.
    return None

In [6]:
# Select the label columns by name, in the order given by `all_labels`, because
# `recode_labels` finds each flag by its position in `all_labels`. Selecting by
# name (instead of dropping "comment_text") also keeps this cell idempotent:
# on a re-run the existing "toxicity" string column is not fed back into the
# recoding, where its truthy value would defeat the "no labels set" check.
train["toxicity"] = train[all_labels].apply(recode_labels, axis=1)
test["toxicity"] = test[all_labels].apply(recode_labels, axis=1)
train.head(10)

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxicity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
e0fdfd98c66fb643,"""\n\n Huggle not working \n\nHi Gurch. There i...",0,0,0,0,0,0,normal
1864753b5fb6c9a3,Mossad actually. I know where you live.,0,0,0,0,0,0,normal
ce1db53fb22d399c,REDIRECT Talk:UFC Fight Night: Belfort vs. Hen...,0,0,0,0,0,0,normal
fed4f08d59399398,"""\n\nUPA IRC\nWhat about 19:00 UTC? e | ταλκ """,0,0,0,0,0,0,normal
06e7f93938ad9e72,"""\nI've re-added your information, together wi...",0,0,0,0,0,0,normal
4a5e851879fdd674,"""\nI'm not an elitist, I'm just spreading the ...",0,0,0,0,0,0,normal
ff39db4975a78363,"""\n\nIt is not listed on this European list as...",0,0,0,0,0,0,normal
73cc03c5e157ce86,You made a mistake you ass.,1,0,1,0,0,0,obscene
ca0891e20b7bbd66,Lol dynamic IP. Just you try to stop me! 82.13...,0,0,0,0,0,0,normal
b890cc6153e51480,"""Thanks for trying to fix Neil Steinberg. I ju...",0,0,0,0,0,0,normal


In [7]:
# Keep only the text and the single recoded class; the per-label flag columns are dropped.
train, test = (frame[["comment_text", "toxicity"]] for frame in (train, test))
train.iloc[:10]

Unnamed: 0_level_0,comment_text,toxicity
id,Unnamed: 1_level_1,Unnamed: 2_level_1
e0fdfd98c66fb643,"""\n\n Huggle not working \n\nHi Gurch. There i...",normal
1864753b5fb6c9a3,Mossad actually. I know where you live.,normal
ce1db53fb22d399c,REDIRECT Talk:UFC Fight Night: Belfort vs. Hen...,normal
fed4f08d59399398,"""\n\nUPA IRC\nWhat about 19:00 UTC? e | ταλκ """,normal
06e7f93938ad9e72,"""\nI've re-added your information, together wi...",normal
4a5e851879fdd674,"""\nI'm not an elitist, I'm just spreading the ...",normal
ff39db4975a78363,"""\n\nIt is not listed on this European list as...",normal
73cc03c5e157ce86,You made a mistake you ass.,obscene
ca0891e20b7bbd66,Lol dynamic IP. Just you try to stop me! 82.13...,normal
b890cc6153e51480,"""Thanks for trying to fix Neil Steinberg. I ju...",normal


In [8]:
from collections import Counter

# Number of training rows per recoded class.
Counter(train["toxicity"].tolist())

Counter({'normal': 107438,
         'obscene': 1544,
         'insult': 4197,
         'identity_hate': 984,
         'toxic': 4231,
         'threat': 370,
         'severe_toxic': 914})

Save new dataset

In [12]:
# Write both recoded splits as zip-compressed CSV files; the index is written as well (to_csv default).
train.to_csv(path_or_buf="data/toxic_multiclass_train.csv.zip", compression="zip")
test.to_csv(path_or_buf="data/toxic_multiclass_test.csv.zip", compression="zip")

Test saved dataset

In [11]:
# Round-trip check: reload the saved training split, indexed by "id", and show its first ten rows.
df = pd.read_csv(filepath_or_buffer="data/toxic_multiclass_train.csv.zip", index_col="id")
df.iloc[:10]

Unnamed: 0_level_0,comment_text,toxicity
id,Unnamed: 1_level_1,Unnamed: 2_level_1
e0fdfd98c66fb643,"""\n\n Huggle not working \n\nHi Gurch. There i...",normal
1864753b5fb6c9a3,Mossad actually. I know where you live.,normal
ce1db53fb22d399c,REDIRECT Talk:UFC Fight Night: Belfort vs. Hen...,normal
fed4f08d59399398,"""\n\nUPA IRC\nWhat about 19:00 UTC? e | ταλκ """,normal
06e7f93938ad9e72,"""\nI've re-added your information, together wi...",normal
4a5e851879fdd674,"""\nI'm not an elitist, I'm just spreading the ...",normal
ff39db4975a78363,"""\n\nIt is not listed on this European list as...",normal
73cc03c5e157ce86,You made a mistake you ass.,obscene
ca0891e20b7bbd66,Lol dynamic IP. Just you try to stop me! 82.13...,normal
b890cc6153e51480,"""Thanks for trying to fix Neil Steinberg. I ju...",normal
