In [3]:
import pandas as pd

In [4]:
jigsaw_df = pd.read_csv('../data/raw/jigsaw_dataset/train.csv')
jigsaw_df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [5]:
jigsaw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [6]:
jigsaw_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
jigsaw_df.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [8]:
jigsaw_df.duplicated().sum()

np.int64(0)

In [9]:
import re

def clean_text(text):
    text = re.sub(r"http\S+", "", text)                  # удаление ссылок
    text = re.sub(r"@\w+", "", text)                     # упоминания @username
    text = re.sub(r"[^A-Za-z0-9\s.,!?\'\"-]", "", text)  # удаление спецсимволов
    text = re.sub(r"\s+", " ", text).strip()             # пробелы
    return text

jigsaw_df['clean_text'] = jigsaw_df['comment_text'].apply(clean_text)


In [10]:
jigsaw_df['clean_text'] = jigsaw_df['clean_text'].str.lower()


In [11]:
jigsaw_df['text_len'] = jigsaw_df['clean_text'].apply(len)
jigsaw_df = jigsaw_df[jigsaw_df['text_len'] > 10]  # фильтруем слишком короткие


In [12]:
jigsaw_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_len
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,264
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...,106
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it...",233
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" more i can't make any real suggestions on im...",615
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...",67


In [13]:
jigsaw_df['target_sum'] = jigsaw_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) 
jigsaw_df['has_toxicity'] = jigsaw_df['target_sum'] > 0
jigsaw_df['has_toxicity'] = jigsaw_df['has_toxicity'].astype(int)

In [21]:
jigsaw_df.to_csv('../data/processed/jigsaw_cleaned_train.csv')

In [14]:
toxic_count = jigsaw_df[jigsaw_df['has_toxicity'] == 1].count()[0]

  toxic_count = jigsaw_df[jigsaw_df['has_toxicity'] == 1].count()[0]


In [15]:
non_toxic_count = jigsaw_df[jigsaw_df['has_toxicity'] != 1].count()[0]

  non_toxic_count = jigsaw_df[jigsaw_df['has_toxicity'] != 1].count()[0]


In [16]:
non_toxic_count/(non_toxic_count+toxic_count) * 100 # большое кол-во не токсичных комментариев

np.float64(89.83212346886323)

In [17]:
toxic_df = jigsaw_df[jigsaw_df['has_toxicity'] == 1]
non_toxic_df = jigsaw_df[jigsaw_df['has_toxicity'] == 0]

non_toxic_df = non_toxic_df.sample(n=len(toxic_df), random_state=42)

balanced_df = pd.concat([toxic_df, non_toxic_df])

balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_df['has_toxicity'].value_counts())


has_toxicity
1    16220
0    16220
Name: count, dtype: int64


In [18]:
balanced_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_len,target_sum,has_toxicity
0,f0b8905bf8f17e9c,I Hate You \nI hate you SwisterTwister.,1,0,0,0,0,0,i hate you i hate you swistertwister.,37,1,1
1,8185a41faeb943a0,The article clearly sucks.,1,0,1,0,0,0,the article clearly sucks.,26,2,1
2,f108c7a7467d7719,"The problem has been solved, thank you for res...",0,0,0,0,0,0,"the problem has been solved, thank you for res...",54,0,0
3,bf82c8b8473745d4,Message\n\nHey Dx927 you're a fat ugly cunt wi...,0,0,1,0,1,0,message hey dx927 you're a fat ugly cunt with ...,158,2,1
4,64ef8ec1e1d371b7,"""\n\nImage Tagging Image:Utc logo.gif\n\n This...",0,0,0,0,0,0,""" image tagging imageutc logo.gif this media m...",1490,0,0


In [20]:
balanced_df.to_csv('../data/processed/jigsaw_balanced_train.csv', index=False)