In [2]:
import json
import pandas as pd
import numpy as np
import dill as pickle
from functools import lru_cache
from tqdm import tqdm as tqdm
import os,sys,time
from sklearn.model_selection import StratifiedKFold

class DataUtil2:
    """Small helpers to read and write datasets in several on-disk formats.

    Supported ``format`` values are ``'csv'``, ``'json'``, ``'pkl'``,
    ``'hdf'`` and ``'npz'`` (scipy sparse matrices).
    """

    @classmethod
    def load(cls, file, format, date_cols= None):
        """Read ``file`` using the reader selected by ``format``.

        Parameters
        ----------
        file : str
            Path of the file to read.
        format : str
            One of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``, ``'npz'``.
        date_cols : list or None
            Forwarded to ``pandas.read_csv(parse_dates=...)``; only used for
            the ``'csv'`` format.

        Returns
        -------
        The loaded object. For an unrecognised ``format`` the empty string
        ``''`` is returned (no exception is raised).
        """
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                # Parse from the open handle, not from the path string.
                data = json.load(i_file)
        elif(format== 'pkl'):
            # NOTE: unpickling executes arbitrary code; only load trusted files.
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')
        elif(format == 'npz'):
            # Imported here because the notebook's import cell does not
            # bring scipy into scope.
            import scipy.sparse
            data = scipy.sparse.load_npz(file)

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """Write ``data`` to ``file`` using the writer selected by ``format``.

        Parameters
        ----------
        data : object
            A DataFrame for ``'csv'``/``'hdf'``, a JSON-serialisable object
            for ``'json'``, any picklable object for ``'pkl'``, or a scipy
            sparse matrix for ``'npz'``.
        file : str
            Destination path.
        format : str
            One of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``, ``'npz'``.
        precision : int
            Number of decimals written for floats; only used for ``'csv'``.

        Returns
        -------
        None. An unrecognised ``format`` writes nothing.
        """
        if(format == 'csv'):
            # '%%.%df' % 4 -> '%.4f': fixed number of decimals for floats.
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                # -1 selects the highest pickle protocol available.
                pickle.dump(data, o_file, -1)
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')
        elif(format == 'npz'):
            import scipy.sparse
            scipy.sparse.save_npz(file, data)
        return

In [3]:
# load data
# Reads the raw train/test CSVs and derives simple surface-level text
# features (lengths, capitalisation, punctuation and word statistics) from
# the 'comment_text' column of each split.
iformat = 'csv'
oformat = 'csv'
DataBase = '../data'
DataSet = {}
start = time.time()
for mod in ['train', 'test']:
    # The reader must be selected by the *input* format; the original passed
    # `oformat`, which only worked because both happen to be 'csv'.
    frame = DataUtil2.load('%s/raw/%s.%s' % (DataBase, mod, iformat), iformat)
    frame['comment_text'] = frame['comment_text'].fillna('nan')
    text = frame['comment_text']

    # Character-level counts.
    frame['total_length'] = text.apply(len)
    frame['capitals'] = text.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Vectorised like the other ratios: a zero-length comment yields NaN
    # (replaced by 0 below) instead of raising ZeroDivisionError.
    frame['caps_vs_length'] = frame['capitals'] / frame['total_length']
    frame['num_exclamation_marks'] = text.apply(lambda comment: comment.count('!'))
    frame['num_question_marks'] = text.apply(lambda comment: comment.count('?'))
    frame['num_punctuation'] = text.apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    # Iterating over '*,#,$' visited ',' twice, so every comma was counted
    # double; commas are already covered by 'num_punctuation'.
    frame['imcomplete_punctuation'] = text.apply(lambda comment: sum(comment.count(w) for w in '*#$'))
    frame['question_mask_ratio'] = frame['num_question_marks'] / frame['total_length']
    frame['exclamation_mark_ratio'] = frame['num_exclamation_marks'] / frame['total_length']
    frame['imcomplete_punctuation_ratio'] = frame['imcomplete_punctuation'] / frame['total_length']

    # Word-level counts (whitespace tokenisation).
    frame['num_words'] = text.apply(lambda comment: len(comment.split()))
    frame['num_unique_words'] = text.apply(lambda comment: len(set(comment.split())))
    frame['words_vs_unique'] = frame['num_unique_words'] / frame['num_words']
    frame['num_smilies'] = text.apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    frame['similes_ratio'] = frame['num_smilies'] / frame['num_words']

    # Remaining NaNs (e.g. 0/0 ratios) become 0.
    DataSet[mod] = frame.fillna(.0)
end = time.time()
print('time elapsed %s' % (end - start))

time elapsed 15.90625


In [4]:
# Lookup table of every 6-label combination: row i holds the 6-bit
# big-endian binary expansion of i, for i = 0 .. 63.
label2binary = np.array([
    [(code >> bit) & 1 for bit in range(5, -1, -1)]
    for code in range(2 ** 6)
])

# Engineered feature columns that are exported alongside the labels.
colnames = [
    'total_length',
    'capitals',
    'caps_vs_length',
    'num_exclamation_marks',
    'num_question_marks',
    'num_punctuation',
    'imcomplete_punctuation',
    'question_mask_ratio',
    'exclamation_mark_ratio',
    'imcomplete_punctuation_ratio',
    'num_words',
    'num_unique_words',
    'words_vs_unique',
    'num_smilies',
    'similes_ratio',
]
# Label columns, in the same order as the columns of label2binary.
targets = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
OutputDir = '../data/meta/kfold/'  # root directory for the per-fold outputs
strategy = 'artificial'            # tag embedded in the output file names
kfold = 3                          # number of cross-validation folds

def cv(X, y, l2b, n_splits=3):
    """Write one validation CSV per stratified fold.

    Each row of ``y`` is mapped to the index of the identical row in ``l2b``
    and that index is used as the stratification class, so folds are
    balanced over whole label combinations rather than a single label.

    Relies on the module-level names ``OutputDir``, ``strategy``,
    ``colnames``, ``targets`` and ``DataUtil2``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; written under the names in ``colnames``.
    y : pandas.DataFrame
        Label columns; written under the names in ``targets``.
    l2b : array-like
        Table of label combinations, one combination per row.
    n_splits : int
        Number of folds.

    Returns
    -------
    None. Files are written to ``<OutputDir>/<fold>/valid_<strategy>.csv``.
    """
    def split(X, y):
        return StratifiedKFold(n_splits=n_splits).split(X, y)

    def convert_y(y):
        # Rows of `y` that match no row of `l2b` keep class 0.
        new_y = np.zeros([len(y)])
        for i, val in enumerate(l2b):
            # A row belongs to combination i only if *every* label matches.
            # The original used .max(axis=1) ("any label matches"), which
            # let later combinations overwrite earlier ones and left only
            # two effective classes. Rare combinations may now trigger a
            # scikit-learn warning about classes smaller than n_splits.
            idx = (y == val).all(axis=1)
            new_y[idx] = i
        return new_y

    strata = convert_y(y.values)
    for fold, (train, test) in enumerate(tqdm(split(X, strata), total=n_splits)):
        FoldOutput = '%s/%s' % (OutputDir, fold)
        os.makedirs(FoldOutput, exist_ok=True)
        FoldOutputFile = '%s/valid_%s.csv' % (FoldOutput, strategy)
        # split() yields positional indices, so select with .iloc, and drop
        # the original index: assigning the slices into a frame with a fresh
        # range index (as the original did) aligns on labels and fills every
        # non-matching row with NaN.
        X_valid = X.iloc[test].reset_index(drop=True)
        y_valid = y.iloc[test].reset_index(drop=True)
        X_valid.columns = colnames
        y_valid.columns = targets
        df = pd.concat([X_valid, y_valid], axis=1)
        DataUtil2.save(df, FoldOutputFile, 'csv', 4)
        print('save fold %s done.' % fold)

# Write the per-fold validation files for the training split.
cv(DataSet['train'][colnames], DataSet['train'][targets], label2binary, kfold)

# Export the test split: id, engineered features, and one zero-filled
# column per target label.
FoldOutput = '../data/meta/submit'
if not os.path.exists(FoldOutput):
    os.makedirs(FoldOutput)
FoldOutputFile = '%s/test_%s.csv' % (FoldOutput, strategy)
OutputCols = ['id', *colnames, *targets]
for t in targets:
    # Adds the label column to the test frame in place.
    DataSet['test'][t] = .0
DataUtil2.save(DataSet['test'][OutputCols], FoldOutputFile, 'csv', 4)
print('save test done.')

100%|██████████| 3/3 [00:00<00:00,  5.11it/s]

save fold 0 done.
save fold 1 done.
save fold 2 done.





save test done.
