In [22]:
import json
import pandas as pd
import dill as pickle
import scipy.sparse

class DataUtil2:
    """Load and save datasets in one of several on-disk formats.

    Supported ``format`` values are ``'csv'``, ``'json'``, ``'pkl'``,
    ``'hdf'`` and ``'npz'``. Both entry points are classmethods, so the
    class is used as a namespace and never needs to be instantiated.
    """

    # Key under which objects are written to / read from HDF files.
    _HDF_KEY = 'undefined'

    @classmethod
    def load(cls, file, format, date_cols= None):
        """Read ``file`` and return its content.

        Parameters
        ----------
        file : path of the file to read.
        format : one of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``, ``'npz'``.
        date_cols : forwarded to ``pandas.read_csv(parse_dates=...)``;
            only used when ``format == 'csv'``.

        Returns
        -------
        The loaded object: whatever ``pandas.read_csv`` / ``json.load`` /
        ``pickle.load`` / ``pandas.read_hdf`` / ``scipy.sparse.load_npz``
        returns for the chosen format. An unrecognised ``format`` returns
        the empty string ``''`` (legacy behaviour, kept for compatibility).
        """
        data = ''
        if format == 'csv':
            data = pd.read_csv(file, parse_dates= date_cols)
        elif format == 'json':
            with open(file, 'r') as i_file:
                # json.load needs the open handle, not the path string
                data = json.load(i_file)
        elif format == 'pkl':
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
        elif format == 'hdf':
            data = pd.read_hdf(path_or_buf= file, key= cls._HDF_KEY)
        elif format == 'npz':
            data = scipy.sparse.load_npz(file)

        return data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """Write ``data`` to ``file`` in the requested ``format``.

        Parameters
        ----------
        data : object to persist. ``'csv'`` and ``'hdf'`` call ``data.to_csv`` /
            ``data.to_hdf``; ``'json'`` passes it to ``json.dump``; ``'pkl'``
            pickles it; ``'npz'`` passes it to ``scipy.sparse.save_npz``.
        file : destination path.
        format : one of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``, ``'npz'``.
        precision : number of decimals written for floats; only used when
            ``format == 'csv'``.

        Returns
        -------
        None. An unrecognised ``format`` writes nothing (legacy behaviour).
        """
        if format == 'csv':
            # builds a float format such as '%.8f'; the index is not written
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif format == 'json':
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
        elif format == 'pkl':
            with open(file, 'wb') as o_file:
                # -1 selects the highest pickle protocol available
                pickle.dump(data, o_file, -1)
        elif format == 'hdf':
            data.to_hdf(path_or_buf= file, key= cls._HDF_KEY, mode='w', complib='blosc')
        elif format == 'npz':
            scipy.sparse.save_npz(file, data)
        return

In [23]:
# Read the train and test frames from the `bow` directory into DataSet,
# keyed by split name.
iformat = 'hdf'
DataBase = '../data'
DataSet = {}
for mod in ('train', 'test'):
    bow_path = '{}/bow/{}.{}'.format(DataBase, mod, iformat)
    DataSet[mod] = DataUtil2.load(bow_path, iformat)
print('load data done.')

load data done.


In [24]:
from sklearn import model_selection
import numpy as np
import sys,os,time,gc
from scipy import sparse

# TfidfVectorizer lives in sklearn.feature_extraction, which the imports above
# do not bring in; without this line the cell fails on a fresh kernel.
from sklearn import feature_extraction

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

## resampling bootstrap
R = 3 # bootstrap number
RRate = 0.95 # resampling rate
HRate = 0.1 # holdout rate
K = 5 # folds number
COMMENT = 'comment_text'
TrainData = DataSet['train']

# Columns written to every *_y.csv file: the row id followed by all labels.
# Loop-invariant, so built once.
y_cols = ['id'] + label_cols

# `start` is taken once, so every "time elapsed" below is cumulative.
start = time.time()
for i in range(R):
    BoostrapOutputDir = '%s/bootstrap/%s' % (DataBase, i)
    ## sampling (seeded per round so each bootstrap is reproducible)
    ReSampledTrainData = TrainData.sample(frac= RRate, random_state= 2017 + i)
    print('sampled data rate %s/%s' % (len(ReSampledTrainData), len(TrainData)))
    ## random holdout split for local validation: each row lands in the
    ## holdout with probability HRate
    np.random.seed(2017 + i)
    msk = np.random.rand(len(ReSampledTrainData)) < HRate
    holdout = ReSampledTrainData[msk]
    train = ReSampledTrainData[~msk]
    test = DataSet['test']
    print('LOO data rate %s/%s' % (len(holdout), len(ReSampledTrainData)))
    ## split into x (raw comment text) and y (id + labels)
    train_x = train[COMMENT]
    train_y = train[y_cols]
    holdout_x = holdout[COMMENT]
    holdout_y = holdout[y_cols]
    test_x = test[COMMENT]
    test_y = test[y_cols]
    del train, holdout, test
    gc.collect()

    ## k-fold
    kf = model_selection.KFold(n_splits = K, random_state = 2017 + i, shuffle = True)
    OutputDir = '%s/l0' % BoostrapOutputDir
    os.makedirs('%s/kfold' % OutputDir, exist_ok= True)

    ## tfidf transformation (fitted on the training part only)
    TFIDF = feature_extraction.text.TfidfVectorizer(ngram_range= (1,2),
                              tokenizer= lambda x: x.split(' '),
                              min_df= 20,
                              max_df= 0.8,
                              strip_accents='unicode',
                              max_features = 5000,
                              # boolean parameters: pass real bools rather than 1
                              use_idf= True,
                              smooth_idf= True,
                              sublinear_tf= True)
    TFIDF.fit(train_x)
    # get_feature_names() is gone from current scikit-learn releases; prefer
    # its replacement and fall back to the old name on older installations.
    if hasattr(TFIDF, 'get_feature_names_out'):
        feats = list(TFIDF.get_feature_names_out())
    else:
        feats = TFIDF.get_feature_names()
    feat_cols = [str(idx) for idx in range(len(feats))]
    end = time.time()
    print('fitting for tfidf done, time elapsed %.2f' % (end - start))

    holdout_x_tfidf = TFIDF.transform(holdout_x)
    test_x_tfidf = TFIDF.transform(test_x)
    # save for holdout and test: the same matrices are written into every
    # fold directory
    for fold in range(K):
        FoldOutputDir = '%s/kfold/%s/tfidf' % (OutputDir, fold)
        os.makedirs(FoldOutputDir, exist_ok= True)
        DataUtil2.save(holdout_x_tfidf, '%s/holdout_x_tfidf.%s' % (FoldOutputDir, 'npz'), 'npz')
        DataUtil2.save(holdout_y, '%s/holdout_y.%s' % (FoldOutputDir, 'csv'), 'csv')
        DataUtil2.save(test_x_tfidf, '%s/test_x_tfidf.%s' % (FoldOutputDir, 'npz'), 'npz')
        DataUtil2.save(test_y, '%s/test_y.%s' % (FoldOutputDir, 'csv'), 'csv')
        end = time.time()
        print('saving for fold %s of holdout/test done, time elapsed %.2fs' % ( fold, (end - start)))
    del holdout_x_tfidf, test_x_tfidf
    gc.collect()
    end = time.time()
    print('save holdout/test done, time elapsed %.2fs' % (end - start))

    train_x_tfidf = TFIDF.transform(train_x)
    # save for valid: only the validation rows of each fold are written
    for fold, (train_index, test_index) in enumerate(kf.split(train_x_tfidf)):
        # Select rows directly on the sparse matrix; converting the whole
        # training matrix to a dense array on every fold is very memory-hungry.
        valid_x_tfidf = sparse.csr_matrix(train_x_tfidf[test_index])
        valid_y = train_y.iloc[test_index]
        FoldOutputDir = '%s/kfold/%s/tfidf' % (OutputDir, fold)
        os.makedirs(FoldOutputDir, exist_ok= True)
        DataUtil2.save(valid_x_tfidf, '%s/valid_x_tfidf.%s' % (FoldOutputDir, 'npz'), 'npz')
        DataUtil2.save(valid_y, '%s/valid_y.%s' % (FoldOutputDir, 'csv'), 'csv')
        end = time.time()
        print('saving fold %s of valid done, time elapsed %.2fs' % (fold, (end - start)))

    end = time.time()
    print('save valid done, time elapsed %.2f' % (end - start))

    print('boostrap %s done' % i)
    # NOTE(review): this break stops after the first round although R rounds
    # are configured - confirm whether that is intentional.
    break

sampled data rate 91058/95851
LOO data rate 9130/91058
fitting for tfidf done, time elapsed 10.21
saving for fold 0 of holdout/test done, time elapsed 32.30s
saving for fold 1 of holdout/test done, time elapsed 41.79s
saving for fold 2 of holdout/test done, time elapsed 51.20s
saving for fold 3 of holdout/test done, time elapsed 60.64s
saving for fold 4 of holdout/test done, time elapsed 70.41s
save holdout/test done, time elapsed 70.64s
(16386, 5000)
saving fold 0 of valid done, time elapsed 77.27s
(16386, 5000)
saving fold 1 of valid done, time elapsed 80.12s
(16386, 5000)
saving fold 2 of valid done, time elapsed 83.02s
(16385, 5000)
saving fold 3 of valid done, time elapsed 85.91s
(16385, 5000)
saving fold 4 of valid done, time elapsed 88.96s
save valid done, time elapsed 88.97
boostrap 0 done
