In [1]:
# Load the autoreload extension (reloading it if it is already loaded)
%reload_ext autoreload
# Mode 2: reload all modules (except those excluded by %aimport) before executing user code
%autoreload 2
# Show plots within this notebook
%matplotlib inline

# 1. Load our training and test data into pandas dataframes

In [2]:
# Directory holding the input CSV files; every path below is built from it.
PATH = 'download/'
train_csv = PATH + 'train.csv'
test_csv = PATH + 'test.csv'
sample_submission_csv = PATH + 'sample_submission.csv'

In [3]:
import pandas as pd

# na_filter=False turns off pandas' missing-value detection, so every field
# (including empty or "NA"-like text) is kept as the literal string read from the file
# instead of being converted to NaN.
train_dataframe = pd.read_csv(train_csv, na_filter=False)
test_dataframe = pd.read_csv(test_csv, na_filter=False)

# 2. View data

The labels are all binary (0 or 1), so they are already on the same scale and won't need to be standardized.

In [4]:
# Peek at the first training row that is flagged as a threat.
train_dataframe[train_dataframe['threat'].eq(1)].head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
79,003217c3eb469ba9,Hi! I am back again!\nLast warning!\nStop undo...,1,0,0,1,0,0


In [5]:
# (rows, columns) of the training frame
train_dataframe.shape

(159571, 8)

In [6]:
# Summary statistics; with default arguments describe() reports only the numeric columns
train_dataframe.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Column dtypes and non-null counts.
# NOTE: the CSV was read with na_filter=False, so pandas never converts empty fields
# to NaN; the non-null counts below therefore cannot reveal blank comments.
train_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


# 4. Separate target features (y) from input features (X) 


Use `sklearn.model_selection.train_test_split` to split the training data into training and validation sets.

In [8]:
from sklearn.model_selection import train_test_split

# Input feature: the raw comment text. Targets: the six binary label columns.
X = train_dataframe.loc[:, 'comment_text']
ys = train_dataframe.loc[:, ['obscene','insult','toxic','severe_toxic','identity_hate','threat']]

# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    ys,
    test_size=0.2,
    random_state=1,
)

# 5. Tokenize words from comments

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training comments only,
# and return their document-term matrix
tfidf_matrix = vectorizer.fit_transform(X_train)

In [10]:
# Examine the vocabulary and document-term matrix together.
# Only the first five rows are densified: calling .toarray() on the whole sparse
# matrix would allocate a dense (n_documents x n_terms) array just to display five rows.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
pd.DataFrame(tfidf_matrix[:5].toarray(), columns=feature_names)

Unnamed: 0,00,000,0000,00000,000000,0000000,0000000027,00000001,00000003,00000050,...,천리마군,칠지도,ﬂute,ａｎｏｎｔａｌｋ,ｃｏｍ,ｗｗｗ,ｳｨｷﾍﾟﾃﾞｨｱ,𐌰𐌹,𐌰𐌿,𐌴𐌹
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
import os
import pickle

# Persist the fitted vectorizer to disk once.
# The existence check must not open the file: open(..., 'wb') creates/truncates it
# and returns an always-truthy file object, so a guard written as `not open(...)`
# never runs the dump and leaves an empty vocab.pkl behind.
vocab_path = 'vocab.pkl'
# A zero-byte file is treated as missing so an empty leftover file gets rewritten.
if not os.path.exists(vocab_path) or os.path.getsize(vocab_path) == 0:
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(vocab_path, 'wb') as vocab_file:
        pickle.dump(vectorizer, vocab_file)

# 6. Problem transformation

Train one binary classifier for each label. This is called binary relevance. 