In [28]:
import json
import pandas as pd
import dill as pickle

class DataUtil2:
    """Small load/save helper for the file formats used in this notebook.

    Both methods dispatch on a ``format`` string: ``'csv'``, ``'json'``,
    ``'pkl'`` or ``'hdf'``. Any other value is not an error: ``load`` returns
    an empty string and ``save`` does nothing.
    """
    @classmethod
    def load(cls, file, format, date_cols= None, key= 'undefined'):
        """Read ``file`` and return its content.

        Parameters
        ----------
        file : path of the file to read.
        format : one of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``.
        date_cols : forwarded to ``pd.read_csv`` as ``parse_dates`` (csv only).
        key : HDF store key to read (hdf only); defaults to ``'undefined'``,
            the key ``save`` writes by default.

        Returns
        -------
        The object produced by the matching reader, or ``''`` when ``format``
        is not one of the supported values.
        """
        data = ''
        if(format == 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format == 'json'):
            with open(file, 'r') as i_file:
                # json.load() needs the open handle, not the path string.
                data = json.load(i_file)
        elif(format == 'pkl'):
            # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key= key)

        return data

    @classmethod
    def save(cls, data, file, format, precision= 8, key= 'undefined'):
        """Write ``data`` to ``file`` in the requested ``format``.

        Parameters
        ----------
        data : object to persist; the csv and hdf branches call its
            ``to_csv`` / ``to_hdf`` methods, the json and pkl branches pass it
            to ``json.dump`` / ``pickle.dump``.
        file : destination path.
        format : one of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``; any other
            value leaves the file system untouched.
        precision : number of decimals used for floats in csv output.
        key : HDF store key to write (hdf only); defaults to ``'undefined'``.

        Returns
        -------
        None
        """
        if(format == 'csv'):
            # Builds e.g. '%.8f' for precision=8; the index is not written.
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                # -1 selects the highest pickle protocol available.
                pickle.dump(data, o_file, -1)
        elif(format == 'hdf'):
            data.to_hdf(path_or_buf= file, key= key, mode='w', complib='blosc')

        return

In [29]:
# Read the train and test frames stored under <DataBase>/bow/ into DataSet,
# keyed by split name.
iformat = 'hdf'
DataBase = '../data'
DataSet = dict()
for mod in ('train', 'test'):
    bow_path = '%s/bow/%s.%s' % (DataBase, mod, iformat)
    DataSet[mod] = DataUtil2.load(bow_path, iformat)
print('load data done.')

load data done.


In [30]:
from sklearn import feature_extraction
from sklearn import model_selection
import numpy as np
import sys,os,time

# Label columns of the data set.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Seed NumPy's global RNG, then keep a random 10% of each split (train first,
# then test).
# NOTE: DataSet is overwritten in place, so re-running this cell subsamples
# the already-subsampled frames again.
np.random.seed(2017)
for split_name in ('train', 'test'):
    DataSet[split_name] = DataSet[split_name].sample(frac= 0.1)

## resampling bootstrap
R = 3          # number of bootstrap rounds
RRate = 0.9    # fraction of the training data drawn in each round
HRate = 0.1    # fraction of each resample kept aside as holdout
K = 5          # number of CV folds
COMMENT = 'comment_text'   # name of the raw text column
TrainData, TestData = DataSet['train'], DataSet['test']
# For each bootstrap round: resample the training data, split off a random
# holdout, then for every CV fold fit a TF-IDF vocabulary on the fold's
# training part only and export the TF-IDF features of the fold's validation
# part, the holdout and the test set.
for i in range(R):
    BoostrapOutputDir = '%s/bootstrap/%s' % (DataBase, i)
    ## sampling
    ReSampledTrainData = TrainData.sample(frac= RRate, random_state= 2017 + i)
    print('sampled data rate %s/%s' % (len(ReSampledTrainData), len(TrainData)))
    ## random holdout split for local validation: each row lands in the
    ## holdout with probability HRate
    np.random.seed(2017 + i)
    msk = np.random.rand(len(ReSampledTrainData)) < HRate
    holdout = ReSampledTrainData[msk]
    train = ReSampledTrainData[~msk]
    print('LOO data rate %s/%s' % (len(holdout), len(ReSampledTrainData)))
    ## CV before engineering in case of data-leak for feature/model selection
    kf = model_selection.KFold(n_splits = K, random_state = 2017 + i, shuffle = True)
    OutputDir = '%s/l0' % BoostrapOutputDir
    if(not os.path.exists('%s/kfold' % OutputDir)):
        os.makedirs('%s/kfold' % OutputDir)
    # The timer starts once per bootstrap round, so the elapsed times printed
    # below accumulate across folds.
    start = time.time()
    for fold, (train_index, test_index) in enumerate(kf.split(train)):

        print('train/test = %s/%s' % (len(train_index), len(test_index)))
        FoldTrain, FoldValid = train.iloc[train_index].copy(), train.iloc[test_index].copy()
        FoldHoldout = holdout.copy()
        FoldTest = TestData.copy()
        print('valid data rate %s/%s' % (len(FoldValid), len(train)))
        ## TODO
        # log1p length
        FoldTrain['total_log1p_len'] = np.log1p(FoldTrain[COMMENT].str.len())
        # none label: 1 when no label column is set
        FoldTrain['none'] = 1 - FoldTrain[label_cols].max(axis=1)
        # fill null comment text in every split that is fed to the vectorizer;
        # plain assignment avoids chained in-place modification of a column
        FoldTrain[COMMENT] = FoldTrain[COMMENT].fillna("unknown")
        FoldValid[COMMENT] = FoldValid[COMMENT].fillna("unknown")
        FoldHoldout[COMMENT] = FoldHoldout[COMMENT].fillna("unknown")
        FoldTest[COMMENT] = FoldTest[COMMENT].fillna("unknown")
        # vocabulary and idf weights are learned from the fold's training part only
        TFIDF = feature_extraction.text.TfidfVectorizer(ngram_range= (1,2),
                              tokenizer= lambda x: x.split(' '),
                              min_df= 20,
                              max_df= 0.8,
                              strip_accents='unicode',
                              max_features = 5000,
                              use_idf= True,
                              smooth_idf= True,
                              sublinear_tf= True)
        TFIDF.fit(FoldTrain[COMMENT])
        # get_feature_names() is gone in recent scikit-learn releases; use the
        # replacement when it exists and fall back to the old name otherwise
        if(hasattr(TFIDF, 'get_feature_names_out')):
            feats = list(TFIDF.get_feature_names_out())
        else:
            feats = TFIDF.get_feature_names()
        # feature columns are named by position; the position -> term mapping
        # is written to feat.csv below
        feat_cols = [str(feat_idx) for feat_idx in range(len(feats))]
        # for valid
        tmpdf = pd.DataFrame(data= TFIDF.transform(FoldValid[COMMENT]).todense(), index= FoldValid.index, columns = feat_cols)
        FoldValid = pd.concat([FoldValid.drop([COMMENT], axis= 1), tmpdf], axis= 1)
        # for holdout
        tmpdf = pd.DataFrame(data= TFIDF.transform(FoldHoldout[COMMENT]).todense(), index= FoldHoldout.index, columns = feat_cols)
        FoldHoldout = pd.concat([FoldHoldout.drop([COMMENT], axis= 1), tmpdf], axis= 1)
        # for test
        tmpdf = pd.DataFrame(data= TFIDF.transform(FoldTest[COMMENT]).todense(), index= FoldTest.index, columns = feat_cols)
        FoldTest = pd.concat([FoldTest.drop([COMMENT], axis= 1), tmpdf], axis= 1)
        end = time.time()
        print('TFIDF transform done, time elapsed %.2fs.' % (end - start))
        ## save
        FoldOutputDir = '%s/kfold/%s/tfidf' % (OutputDir, fold)
        if(not os.path.exists(FoldOutputDir)):
            os.makedirs(FoldOutputDir)
        # five-row CSV previews of each exported frame
        FoldValid[:5].to_csv('%s/valid.csv' % FoldOutputDir, index= False)
        # the holdout preview must come from the holdout frame, not the test frame
        FoldHoldout[:5].to_csv('%s/holdout.csv' % FoldOutputDir, index= False)
        FoldTest[:5].to_csv('%s/test.csv' % FoldOutputDir, index= False)
        end = time.time()
        print('save demo for fold %s done, time elapsed %.2fs.' % (fold, (end - start)))
        DataUtil2.save(FoldValid, '%s/valid.%s' % (FoldOutputDir, 'hdf'), 'hdf')
        DataUtil2.save(FoldHoldout, '%s/holdout.%s' % (FoldOutputDir, 'hdf'), 'hdf')
        DataUtil2.save(FoldTest, '%s/test.%s' % (FoldOutputDir, 'hdf'), 'hdf')
        ## save feature index: columns 'index' (position) and 'tfidf' (term)
        featdf = pd.DataFrame({'tfidf': feats}).reset_index(drop= False)
        featdf.to_csv('%s/feat.csv' % FoldOutputDir, index= False)
        end = time.time()
        print('fold %s done, time elapsed %.2fs.' % (fold, (end - start)))
    print('boostrap %s done' % i)
    # NOTE(review): this break ends the loop after the first bootstrap round,
    # so only round 0 is ever produced -- presumably left in for a quick trial
    # run; remove it to process every round.
    break

sampled data rate 8626/9585
LOO data rate 845/8626
train/test = 6224/1557
valid data rate 1557/7781
TFIDF transform done, time elapsed 2.90s.
save demo for fold 0 done, time elapsed 2.92s.
fold 0 done, time elapsed 3.83s.
train/test = 6225/1556
valid data rate 1556/7781
TFIDF transform done, time elapsed 6.78s.
save demo for fold 1 done, time elapsed 6.80s.
fold 1 done, time elapsed 7.68s.
train/test = 6225/1556
valid data rate 1556/7781
TFIDF transform done, time elapsed 10.53s.
save demo for fold 2 done, time elapsed 10.55s.
fold 2 done, time elapsed 11.50s.
train/test = 6225/1556
valid data rate 1556/7781
TFIDF transform done, time elapsed 14.31s.
save demo for fold 3 done, time elapsed 14.33s.
fold 3 done, time elapsed 15.22s.
train/test = 6225/1556
valid data rate 1556/7781
TFIDF transform done, time elapsed 18.01s.
save demo for fold 4 done, time elapsed 18.04s.
fold 4 done, time elapsed 18.94s.
boostrap 0 done
