In [None]:
import pandas as pd

# Load the raw export. `on_bad_lines='skip'` tells pandas to skip rows it cannot
# parse instead of raising.
raw_csv_path = 'analysis_sheets/all_data.csv'
df = pd.read_csv(raw_csv_path, on_bad_lines='skip')

# Preview the first rows.
df.head()

In [56]:
# Keep the identifier, the comment text and the seven score columns.
selected_columns = ['id', 'comment_text', 'toxicity', 'severe_toxicity', 'obscene',
                    'sexual_explicit', 'identity_attack', 'insult', 'threat']

# Select and drop incomplete rows in one expression: calling
# dropna(inplace=True) on a column selection can trigger pandas'
# SettingWithCopyWarning.
df = df[selected_columns].dropna()

# info() prints its report and returns None, so it gets its own statement and
# the cell displays only the shape.
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 1999512 entries, 0 to 1999515
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int64  
 1   comment_text     object 
 2   toxicity         float64
 3   severe_toxicity  float64
 4   obscene          float64
 5   sexual_explicit  float64
 6   identity_attack  float64
 7   insult           float64
 8   threat           float64
dtypes: float64(7), int64(1), object(1)
memory usage: 152.6+ MB


((1999512, 9), None)

In [57]:
# Confirm which columns remain after the column selection above.
df.columns

Index(['id', 'comment_text', 'toxicity', 'severe_toxicity', 'obscene',
       'sexual_explicit', 'identity_attack', 'insult', 'threat'],
      dtype='object')

### All further data analysis is done here.

In [59]:
# Sub-score columns that are checked against the overall toxicity score.
score_cols = ['severe_toxicity', 'obscene', 'sexual_explicit',
              'identity_attack', 'insult', 'threat']

# Drop rows with toxicity <= 0.5 that nevertheless have at least one sub-score > 0.
low_toxicity_but_flagged = df['toxicity'].le(0.5) & df[score_cols].gt(0).any(axis=1)
df = df[~low_toxicity_but_flagged]

# Drop rows with toxicity > 0.5 whose sub-scores are all exactly 0.
high_toxicity_but_unflagged = df['toxicity'].gt(0.5) & df[score_cols].eq(0).all(axis=1)
df = df[~high_toxicity_but_unflagged]
df.shape

(1518502, 9)

In [60]:
sub_attrs = ['obscene', 'sexual_explicit','threat', 'insult', 'identity_attack']

# Binarise the labels with vectorised comparisons instead of a per-row lambda:
#   * each sub-attribute becomes 1 when its score is > 0, else 0
#   * toxicity becomes 1 when its score is > 0.5, else 0
# `assign` returns a new frame, so nothing is written into a filtered selection.
# `severe_toxicity` is not in `sub_attrs` and therefore keeps its raw score.
df = df.assign(
    **{attr: (df[attr] > 0.0).astype('int64') for attr in sub_attrs},
    toxicity=(df['toxicity'] > 0.5).astype('int64'),
)

# Draw 25,000 rows from each toxicity class. The fixed random_state makes the
# sample (and everything saved from it) reproducible across runs.
df = df.groupby('toxicity').sample(n=25000, random_state=42).reset_index(drop=True)
print(df.shape)

def label_distribution(df):
    """Print the value counts of each column in `sub_attrs`, then of `toxicity`.

    `sub_attrs` is read from the notebook's global namespace. Nothing is
    returned; the counts are only printed.
    """
    for column in [*sub_attrs, 'toxicity']:
        print(df[column].value_counts())
    

(50000, 9)


In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for test, then 10% of the remaining rows for dev.
# Both splits are stratified on the binary toxicity label. The fixed
# random_state makes the splits reproducible across runs.
train, test = train_test_split(df, test_size=0.2, stratify=df['toxicity'], random_state=42)
train, dev = train_test_split(train, test_size=0.1, stratify=train['toxicity'], random_state=42)

train.shape, test.shape, dev.shape

((36000, 9), (10000, 9), (4000, 9))

In [62]:

# Number of positive rows per label in each split.
# The result is bound to `split_label_counts` rather than
# `label_distribution_table`, because other cells define and call a function
# with that name; rebinding it to a DataFrame would break those calls.
label_columns = [*sub_attrs, 'toxicity']
split_label_counts = pd.DataFrame({
    'train': train[label_columns].sum(),
    'dev': dev[label_columns].sum(),
    'test': test[label_columns].sum(),
})
split_label_counts

Unnamed: 0,train,dev,test
obscene,11997,1348,3333
sexual_explicit,4098,479,1147
threat,5499,588,1487
insult,17741,1971,4929
identity_attack,10441,1150,2898
toxicity,18000,2000,5000


In [9]:

def label_distribution_table(df):
    """Count positive and negative rows for every label column of `df`.

    The rows of the result are the names in the global `sub_attrs` followed by
    'toxicity'. 'positive' is the column sum and 'negative' is the number of
    rows minus that sum, so the columns are expected to hold 0/1 values.
    """
    n_rows = len(df)
    counts = pd.DataFrame(columns=['positive', 'negative'])
    for column in [*sub_attrs, 'toxicity']:
        n_positive = df[column].sum()
        counts.loc[column] = [n_positive, n_rows - n_positive]
    return counts

# Positive/negative counts per label for the training split.
# NOTE(review): the output saved with this cell (toxicity: 4937 positive) does
# not match the per-split summary from In [62] (toxicity: 18000 in train); it
# looks like it came from an earlier run. Re-run the cell to refresh it.
label_distribution_table(train)

Unnamed: 0,positive,negative
obscene,6592,29408
sexual_explicit,4146,31854
threat,6327,29673
insult,16136,19864
identity_attack,13311,22689
toxicity,4937,31063


In [64]:
# Positive/negative counts per label for the dev split.
label_distribution_table(dev)

Unnamed: 0,positive,negative
obscene,1348,2652
sexual_explicit,479,3521
threat,588,3412
insult,1971,2029
identity_attack,1150,2850
toxicity,2000,2000


In [65]:
# Positive/negative counts per label for the test split.
label_distribution_table(test)

Unnamed: 0,positive,negative
obscene,3333,6667
sexual_explicit,1147,8853
threat,1487,8513
insult,4929,5071
identity_attack,2898,7102
toxicity,5000,5000


In [37]:
# Eyeball ten random training rows. No random_state is passed, so a different
# set of rows is shown on every run.
train.sample(10)

Unnamed: 0,id,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat
9050,5185643,remember .. there's never just one cockroach,0,0.0,0,0,0,0,0
66,6113355,Just being dialogical and clarifying the teach...,0,0.0,0,0,0,0,0
20445,901465,Isn't it nice to have a President who campaign...,0,0.0,0,0,0,0,0
35890,1006573,"'\nI'm glad you brought that up.\n.\n…yep, I h...",1,0.026316,1,1,1,1,1
46436,469308,What a sick and twisted comment. You are demen...,1,0.075758,1,0,1,1,1
22922,5698084,"As Mr. Spock would say, ""Fascinating!""",0,0.0,0,0,0,0,0
17033,6036849,There is no longer any doubt that Mueller's of...,0,0.0,0,0,0,0,0
37901,372027,It means treating women like mangy dogs and th...,1,0.15,1,1,1,1,1
48522,5332648,Bingo.\n\nThis is actually a bad news story ou...,1,0.0,0,0,0,1,0
27559,5640442,The power of the Potomac can only be realized ...,1,0.015625,1,0,1,1,1


In [66]:
# Write each split to its own CSV file, without the DataFrame index.
split_outputs = {
    'dataset/train.csv': train,
    'dataset/dev.csv': dev,
    'dataset/test.csv': test,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [2]:
import pandas as pd

# Reload the saved training split from disk (this rebinds `train`) and list its
# columns.
train = pd.read_csv('dataset/train.csv')
train.columns

Index(['id', 'comment_text', 'toxicity', 'severe_toxicity', 'obscene',
       'sexual_explicit', 'identity_attack', 'insult', 'threat'],
      dtype='object')

In [12]:
import pandas as pd
from datasets import load_dataset
from sklearn.utils import resample

dataset = load_dataset('civil_comments')

df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()
df_valid = dataset['validation'].to_pandas()

# Pool the three published splits into one frame; new splits are drawn from
# the combined data later in this cell.
df = pd.concat([df_train, df_test, df_valid], ignore_index=True)

# Sub-score columns that are checked against the overall toxicity score.
score_cols = ['severe_toxicity', 'obscene', 'sexual_explicit',
              'identity_attack', 'insult', 'threat']
any_sub_positive = (df[score_cols] > 0).any(axis=1)
all_sub_zero = (df[score_cols] == 0).all(axis=1)

# Remove rows whose overall score contradicts the sub-scores:
#   * toxicity <= 0.5 but at least one sub-score > 0
#   * toxicity > 0.5 but every sub-score == 0
inconsistent = (
    ((df['toxicity'] <= 0.5) & any_sub_positive)
    | ((df['toxicity'] > 0.5) & all_sub_zero)
)
df = df[~inconsistent]

# Chain the remaining clean-up so a fresh frame is produced at each step,
# rather than calling inplace methods on a filtered selection (which can
# trigger SettingWithCopyWarning). reset_index() keeps the pre-filter row
# number as a column, which is then renamed to `id`.
df = (
    df[['text', 'toxicity', *score_cols]]
    .dropna()
    .reset_index()
    .rename(columns={'index': 'id', 'text': 'comment_text'})
)

sub_attrs = ['obscene', 'sexual_explicit', 'threat', 'insult', 'identity_attack']

# Binarise with vectorised comparisons instead of a per-row lambda:
# sub-attributes are 1 when the score is > 0, toxicity is 1 when the score is > 0.5.
# `severe_toxicity` is not in `sub_attrs` and keeps its raw score.
df[sub_attrs] = (df[sub_attrs] > 0.0).astype('int64')
df['toxicity'] = (df['toxicity'] > 0.5).astype('int64')

# For every sub-attribute, draw an equal number of positive and negative rows
# (as many as the smaller class has), with a fixed seed.
balanced_datasets = []

for attr in sub_attrs:
    df_pos = df[df[attr] == 1]
    df_neg = df[df[attr] == 0]

    min_count = min(len(df_pos), len(df_neg))

    df_pos_sampled = df_pos.sample(n=min_count, random_state=42)
    df_neg_sampled = df_neg.sample(n=min_count, random_state=42)

    df_balanced = pd.concat([df_pos_sampled, df_neg_sampled], ignore_index=True)
    balanced_datasets.append(df_balanced)

# Union of the per-attribute samples, keeping each comment (by `id`) once.
# NOTE(review): each per-attribute sample is balanced, but their de-duplicated
# union is not guaranteed to be. Check the label distribution printed further
# down before relying on it as a balanced dataset.
df_combined = (
    pd.concat(balanced_datasets, ignore_index=True)
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)

def label_distribution(df):
    """Return `{label: {value: count}}` for each sub-attribute and `toxicity`.

    The label names are the global `sub_attrs` followed by 'toxicity'. Each
    inner dict is that column's `value_counts()` converted to a plain dict.
    """
    return {
        column: df[column].value_counts().to_dict()
        for column in sub_attrs + ['toxicity']
    }

# Print one "<label>: <value counts>" line per label for the combined frame.
label_distr = label_distribution(df_combined)
print("Label distributions in the combined dataset:")
print("\n".join(f"{label}: {counts}" for label, counts in label_distr.items()))

from sklearn.model_selection import train_test_split

# Seeded 80/20 train/test split, then a 90/10 train/dev split of the training part.
train, test = train_test_split(df_combined, random_state=42, test_size=0.2)
train, dev = train_test_split(train, random_state=42, test_size=0.1)

def label_distribution_table(df_list, names):
    """Tabulate the number of positive rows per label for several frames.

    Note that other cells in this notebook define a one-argument function with
    the same name; whichever cell ran last determines which one is in scope.

    Parameters
    ----------
    df_list : iterable of DataFrame
        Frames holding the columns named in the global `sub_attrs` plus
        'toxicity'.
    names : iterable of str
        Column name to use for each frame, paired with `df_list` by position.

    Returns
    -------
    DataFrame
        One row per label (`sub_attrs` order, then 'toxicity') and one column
        per name, each cell being the sum of that label column. When `df_list`
        is empty the result has the label index and no columns, instead of
        raising on an undefined variable.
    """
    labels = [*sub_attrs, 'toxicity']
    sums_by_name = {
        name: [df[label].sum() for label in labels]
        for df, name in zip(df_list, names)
    }
    return pd.DataFrame(sums_by_name, index=labels)

# Positive counts per label for each split, printed under a heading.
label_table = label_distribution_table(
    [train, dev, test],
    ['train', 'dev', 'test'],
)
print("\nLabel distribution across splits:", label_table, sep="\n")


Label distributions in the combined dataset:
obscene: {0: 1177758, 1: 163043}
sexual_explicit: {0: 1264679, 1: 76122}
threat: {0: 1222620, 1: 118181}
insult: {0: 837553, 1: 503248}
identity_attack: {0: 1099283, 1: 241518}
toxicity: {0: 1222926, 1: 117875}

Label distribution across splits:
                  train    dev    test
obscene          117677  12971   32395
sexual_explicit   55183   6029   14910
threat            85340   9471   23370
insult           362424  40337  100487
identity_attack  173962  19375   48181
toxicity          85052   9442   23381


In [13]:
# Write the current `train` frame to a separate "big" training file, without
# the index. Presumably this is the train split built from the civil_comments
# data; verify which cell ran last.
# NOTE(review): only `train` is saved here; the matching `dev` and `test`
# frames are not written. Confirm that is intended.
train.to_csv('dataset/train_big.csv', index=False)

In [8]:
import pandas as pd

# Reload the three saved splits from disk; the dev file is bound to `val` here.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/dev.csv', 'dataset/test.csv')
)
# Binary sub-attribute label columns summarised below.
sub_attrs = ['obscene', 'sexual_explicit', 'threat', 'insult', 'identity_attack']

def label_distribution_table(df):
    """Build a positive/negative count table for the label columns of `df`.

    There is one row per name in the global `sub_attrs`, followed by a
    'toxicity' row. 'positive' is the column sum and 'negative' is
    `len(df)` minus that sum, so 0/1 labels are expected.
    """
    row_total = len(df)
    summary = pd.DataFrame(columns=['positive', 'negative'])
    for label in [*sub_attrs, 'toxicity']:
        positives = df[label].sum()
        summary.loc[label] = [positives, row_total - positives]
    return summary

# Positive/negative counts per label for the reloaded training split.
label_distribution_table(train)

Unnamed: 0,positive,negative
obscene,11997,24003
sexual_explicit,4098,31902
threat,5499,30501
insult,17741,18259
identity_attack,10441,25559
toxicity,18000,18000


In [10]:
# (rows, columns) of the reloaded train, dev (`val`) and test splits.
train.shape, val.shape, test.shape

((36000, 9), (4000, 9), (10000, 9))