In [76]:
import numpy as np
import pandas as pd
from functools import lru_cache
from tqdm import tqdm as tqdm
import os,sys,time
from sklearn.model_selection import StratifiedKFold

## Bin based Stratified k-fold
# Lookup table with 64 rows and 6 columns: row i holds the 6-bit binary
# representation of i, most significant bit first (row 10 -> 0 0 1 0 1 0).
label2binary = np.array([
    [(code >> shift) & 1 for shift in range(5, -1, -1)]
    for code in range(64)
])

def convert_y(y):
    """Encode one multi-label indicator vector as a single integer class id.

    The entries are read as binary digits, most significant first, so
    ``[0, 0, 1, 0, 1, 0]`` maps to ``10``. Any non-zero entry counts as a
    set bit. For 0/1 input this yields the same id as searching the
    ``label2binary`` table row by row, but without one ``np.dot`` per table
    row and without being tied to exactly six labels.

    Args:
        y: A 1-D sequence of label indicators, or a 2-D array-like with
            exactly one row (e.g. shape ``(1, 6)``).

    Returns:
        int: The class id, in ``range(2 ** number_of_labels)``.

    Raises:
        ValueError: If ``y`` is not a single row of labels.
    """
    row = np.asarray(y)
    if row.ndim == 2 and row.shape[0] == 1:
        row = row[0]
    if row.ndim != 1:
        # A batch must not be silently flattened into one long bit string.
        raise ValueError(
            'convert_y expects a single row of labels, got shape %s' % (row.shape,)
        )
    new_y = 0
    # Fold the bits into a Python int, so no fixed-width overflow can occur.
    for flag in row != 0:
        new_y = (new_y << 1) | int(flag)
    return new_y

#y = [0,0,1,0,1,0]
## reshape
#y = np.array(y).reshape((1, 6))
#new_y = convert_y(y)

# load data
# Read the raw train and test CSV files into DataSet, keyed by split name.
iformat = 'csv'
DataBase = '../../data'
DataSet = {}
for mod in ('train', 'test'):
    f = '{}/raw/{}.{}'.format(DataBase, mod, iformat)
    # Missing comment texts are replaced by the literal string 'nan' so the
    # column contains no NaN values afterwards.
    DataSet[mod] = pd.read_csv(f).assign(
        comment_text=lambda frame: frame['comment_text'].fillna('nan')
    )
print('load data done. train {}, test {}'.format(len(DataSet['train']), len(DataSet['test'])))

# Label columns selected from the training frame further down.
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def convert_Y(Y):
    """Encode every row of a multi-label indicator matrix as one class id.

    Each row is read as a binary number, most significant bit first, so
    ``[0, 0, 1, 0, 1, 0]`` becomes ``10.0``. Any non-zero entry counts as a
    set bit. All rows are encoded with a single dot product instead of a
    Python-level loop over the samples.

    Args:
        Y: Array-like of shape ``(n, n_labels)`` holding 0/1 indicators.

    Returns:
        numpy.ndarray: float64 array of shape ``(n, 1)`` with the class id
        of each row. Ids are exact as long as ``n_labels`` is at most 53.
    """
    rows = np.asarray(Y)
    n = len(rows)
    if n == 0:
        # No samples: keep the (0, 1) float result shape.
        return np.zeros((0, 1))
    rows = rows.reshape(n, -1)
    # Column j carries weight 2 ** (n_labels - 1 - j).
    weights = 2.0 ** np.arange(rows.shape[1] - 1, -1, -1)
    new_Y = np.dot(rows != 0, weights)
    return new_Y.reshape(n, 1)

OutputDir = '../../data/version1/l0'

## Carve a stratified holdout set (the first of 10 folds) out of the training data.
print('leave one holdout ...')
X = DataSet['train'][['id', 'comment_text']].values
Y = DataSet['train'][targets].values
n = len(X)
kfold = 10
print('shape: ')
print(X.shape, Y.shape)
# Only the first split is used; stratification is on the combined label id.
train_index, valid_index = next(
    StratifiedKFold(n_splits=kfold).split(X, convert_Y(Y).reshape(n,))
)
# Both index arrays are positions in the FULL training frame, so the holdout
# rows must be taken before DataSet['train'] is shrunk. Selecting them from
# the already-reduced frame would return training rows and leak them into
# the holdout set.
ValidData = DataSet['train'].iloc[valid_index]
DataSet['train'] = DataSet['train'].iloc[train_index]
print('train %s, holdout %s' % (len(DataSet['train']), len(ValidData)))

##
## Split the remaining training data into stratified folds and write, per
## fold directory, the validation part plus copies of the holdout and test sets.
print('CV ...')
X = DataSet['train'][['id', 'comment_text']].values
Y = DataSet['train'][targets].values
n = len(X)
kfold = 3
# Manual counter, so that `fold` equals the number of folds written once the
# loop has finished.
fold = 0
print('shape: ')
print(X.shape, Y.shape)
for train_index, valid_index in StratifiedKFold(n_splits=kfold).split(X, convert_Y(Y).reshape(n,)):
    print('fold %s, train %s, valid %s' % (fold, len(train_index), len(valid_index)))
    FoldOutput = '%s/kfold/%s' % (OutputDir, fold)
    # exist_ok avoids the separate existence check and its check-then-create race.
    os.makedirs(FoldOutput, exist_ok=True)
    TrainFold = DataSet['train'].iloc[train_index]
    ValidFold = DataSet['train'].iloc[valid_index]
    # NOTE(review): no train.csv is written per fold; presumably the training
    # part is rebuilt from the other folds' valid.csv -- confirm with consumers.
    ValidFold.to_csv('%s/valid.csv' % FoldOutput, index=False)
    ValidData.to_csv('%s/holdout.csv' % FoldOutput, index=False)
    DataSet['test'].to_csv('%s/test.csv' % FoldOutput, index=False)
    ## check: per-label positive rate of the training part, then the validation part
    print('-----------------------------------------')
    print(TrainFold[targets].sum(axis=0) / len(train_index))
    print(ValidFold[targets].sum(axis=0) / len(valid_index))
    print('-----------------------------------------\n')
    fold += 1

load data done. train 159571, test 153164
leave one holdout ...
shape: 
(159571, 2) (159571, 6)




train 143591, holdout 15980
CV ...
shape: 
(143591, 2) (143591, 6)




fold 0, train 95716, valid 47875
-----------------------------------------
toxic            0.095689
severe_toxic     0.009915
obscene          0.052823
threat           0.002873
insult           0.049239
identity_hate    0.008692
dtype: float64
toxic            0.095875
severe_toxic     0.009984
obscene          0.053013
threat           0.002987
insult           0.049399
identity_hate    0.008856
dtype: float64
-----------------------------------------

fold 1, train 95727, valid 47864
-----------------------------------------
toxic            0.095741
severe_toxic     0.009935
obscene          0.052900
threat           0.002904
insult           0.049296
identity_hate    0.008744
dtype: float64
toxic            0.095771
severe_toxic     0.009945
obscene          0.052858
threat           0.002925
insult           0.049285
identity_hate    0.008754
dtype: float64
-----------------------------------------

fold 2, train 95739, valid 47852
-----------------------------------------
toxic