In [53]:
import json
import pandas as pd
import dill as pickle
import scipy.sparse

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')
#         elif(format == 'csr'):
#             loader = np.load(file)
#             data = csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
        elif(format == 'npz'):
            data = scipy.sparse.load_npz(file)

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')
#         elif(format == 'csr'):
#             np.savez(file, data= data['data'], indices= data['indices'], indptr= data['indptr'], shape= data['shape'])
        elif(format == 'npz'):
            scipy.sparse.save_npz(file, data)
        return

In [54]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [55]:
def get_mdl(x, y):
    y = y.values
    p_1 = x[y == 1].sum(0)
    pr_1 = (p_1 + 1) / ((y == 1).sum() + 1)
    p_0 = x[y == 0].sum(0)
    pr_0 = (p_0 + 1) / ((y == 0).sum() + 1)
    r = np.log(pr_1 / pr_0)
    m = LogisticRegression(C= 40, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

def ComputeAUC(truth, predict):
    ''''''
    n = len(truth)
    #
    pos_num = np.sum(truth)
    neg_num = len(truth) - pos_num
    #
    pairs = zip(truth, predict)
    sorted_pairs = sorted(pairs, key= lambda x: x[1])
    sorted_truth = [s[0] for s in sorted_pairs]
    #
    auc = 0.0
    x = np.zeros((n), dtype= 'float')
    y = np.zeros((n), dtype= 'float')
    x[0] = 1.0
    y[0] = 1.0
    for i in range(1, n):
        a = (n - i - np.sum(sorted_truth[i:n]))/neg_num
        b = np.sum(sorted_truth[i:n])/pos_num
        x[i] = a
        y[i] = b
        #print(auc)
        auc += ((y[i] + y[i - 1]) * (x[i - 1] - x[i]))/2.0
    auc += (y[n - 1] * x[n - 1])/2.0
    
    return auc

In [56]:
import gc
import time, os, sys
from sklearn.metrics import roc_auc_score
import numpy as np
import datetime

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

DataBase = '../data'
R = 3
K = 5
iformat = 'npz'
oformat = 'csv'
algo = 'nbsvm'
strategy = 'tfidf'
level = 0
ilevel = 'l%s' % level
olevel = 'l%s' % (level + 1)
start = time.time()
for bid in range(R):
    BootstrapInputDir = '%s/bootstrap/%s' % (DataBase, bid)
    # load data
    valid_feats = []
    holdout_feats = []
    test_feats = []
    valid_xy = []
    holdout_xy = []
    test_xy = []
    for fold in range(K):
        FoldInputDir = '%s/%s/kfold/%s' % (BootstrapInputDir, ilevel, fold)
        TFIDFInputDir = '%s/%s' % (FoldInputDir, strategy)
        # for features
        valid = DataUtil2.load('%s/valid_feats.%s' % (TFIDFInputDir, iformat), iformat)
        valid_feats.append(valid)
        holdout = DataUtil2.load('%s/holdout_feats.%s' % (TFIDFInputDir, iformat), iformat)
        holdout_feats.append(holdout)
        test = DataUtil2.load('%s/test_feats.%s' % (TFIDFInputDir, iformat), iformat)
        test_feats.append(test)
        end = time.time()
        #print('loading features for fold %s done. time elapsed %.2fs' % (fold, (end - start)))
        # for xy
        valid = DataUtil2.load('%s/valid_xy.%s' % (TFIDFInputDir, 'csv'), 'csv')
        valid['fold'] = fold
        valid_xy.append(valid)
        holdout = DataUtil2.load('%s/holdout_xy.%s' % (TFIDFInputDir, 'csv'), 'csv')
        holdout_xy.append(holdout)
        test = DataUtil2.load('%s/test_xy.%s' % (TFIDFInputDir, 'csv'), 'csv')
        test_xy.append(test)
        end = time.time()
        #print('loading xy for fold %s done. time elapsed %.2fs' % (fold, (end - start)))
    valid_xy_df = pd.concat(valid_xy, axis= 0, ignore_index= True)
    del valid_xy
    gc.collect()
    print('load data done.')
    # scores for evaluation
    cv_score = .0
    holdout_score = .0
    y_test_pred = pd.DataFrame(index= range(len(test_xy[0])))
    cv_auc_score = {}
    holdout_auc_score = {}
    for l in label_cols:
        cv_auc_score[l] = .0
        holdout_auc_score[l] = .0
        y_test_pred[l] = .0
    ## training
    for fold in range(K):
        #
        FoldXData = {
            'valid': valid_feats[fold],
            'holdout': holdout_feats[fold],
            'test': test_feats[fold]
        }
        FoldXData['train'] = sparse.vstack([valid_feats[i] for i in range(K) if(i != fold)], format= 'csr')
#         print('train/valid = %s/%s' % (FoldXData['train'].shape[0], FoldXData['valid'].shape[0]))
        #
        FoldYData = {
            'train': valid_xy_df[valid_xy_df['fold'] != fold],
            'valid': valid_xy_df[valid_xy_df['fold'] == fold],
            'holdout': holdout_xy[fold],
            'test': test_xy[fold]
        }
        print('train/valid = %s/%s' % (len(FoldYData['train']), len(FoldYData['valid'])))
        targets = []
        cv_logerror = 0.0
        holdout_logerror = 0.0
        for i in range(len(label_cols)):
            # train
            m, r = get_mdl(FoldXData['train'], FoldYData['train'][label_cols[i]])
#             print('fitting for %s done.' % label_cols[i])
            target = '%s_%s_%s' % (algo, strategy, label_cols[i])
            targets.append(target)
            # for valid
            FoldYData['valid'][target] = m.predict_proba(FoldXData['valid'].multiply(r))[:,1]
            cv_logerror += -np.sum(np.log(FoldYData['valid'][target]) * FoldYData['valid'][label_cols[i]])
            cv_auc_score[label_cols[i]] += roc_auc_score(FoldYData['valid'][label_cols[i]], FoldYData['valid'][target])
            
            # for valid
            FoldYData['holdout'][target] = m.predict_proba(FoldXData['holdout'].multiply(r))[:,1]
            holdout_logerror += -np.sum(np.log(FoldYData['holdout'][target]) * FoldYData['holdout'][label_cols[i]])
            holdout_auc_score[label_cols[i]] += roc_auc_score(FoldYData['holdout'][label_cols[i]], FoldYData['holdout'][target])

            # for test
            FoldYData['test'][target] = m.predict_proba(FoldXData['test'].multiply(r))[:,1]
            y_test_pred[label_cols[i]] += FoldYData['test'][target]
            
        print('training for fold %s done.' % fold)
        # evaluation
        cv_logerror /= (len(FoldYData['valid']) * len(label_cols))
        cv_score += cv_logerror
        holdout_logerror /= (len(FoldYData['holdout']) * len(label_cols))
        holdout_score += holdout_logerror
        # save
        FoldOutputDir = '%s/%s/kfold/%s' % (BootstrapInputDir, olevel, fold)
        if(os.path.exists(FoldOutputDir) == False):
            os.makedirs(FoldOutputDir)
        for mod in FoldYData.keys():
            if(mod == 'train'):
                continue
            OutputFile = '%s/%s_%s_%s.%s' % (FoldOutputDir, mod, algo, strategy, oformat)
            DataUtil2.save(FoldYData[mod][targets], OutputFile, oformat)
        print('saving for fold %s done. cv score %.4f, holdout score %.4f' % (fold, cv_logerror, holdout_logerror))
    cv_score /= K
    holdout_score /= K
    for l in label_cols:
        cv_auc_score[l] /= K
        holdout_auc_score[l] /= K
        y_test_pred[l] /= K
        
    # Create submission file
    sub = pd.DataFrame()
    sub['id'] = test_xy[0]['id']
    for l in label_cols:
        sub[l] = y_test_pred[l]
    OutputFileName = '%s_%s_submit_%s' % (algo, strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
    SubmitDir = '%s/%s/submit' % (BootstrapInputDir, olevel)
    if(os.path.exists(SubmitDir) == False):
        os.makedirs(SubmitDir)
    sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.6f', index=False)
    print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
    os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
    
    print('==========================================================================')
    print('bootrap %s done!' % bid)
    print('cv score %.3f, cv auc score %s' % (cv_score, cv_auc_score))
    print('holdout score %.3f, holdout auc score %s' % (holdout_score, holdout_auc_score))
    print('==========================================================================')
    break

load data done.
train/valid = 65542/16386


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


training for fold 0 done.
saving for fold 0 done. cv score 0.1219, holdout score 0.1354
train/valid = 65542/16386
training for fold 1 done.
saving for fold 1 done. cv score 0.1345, holdout score 0.1442
train/valid = 65542/16386
training for fold 2 done.
saving for fold 2 done. cv score 0.1394, holdout score 0.1514
train/valid = 65543/16385
training for fold 3 done.
saving for fold 3 done. cv score 0.1351, holdout score 0.1448
train/valid = 65543/16385
training for fold 4 done.
saving for fold 4 done. cv score 0.1104, holdout score 0.1212
zip ../data/bootstrap/0/l1/submit/nbsvm_tfidf_submit_2018-01-01.zip ../data/bootstrap/0/l1/submit/nbsvm_tfidf_submit_2018-01-01.csv
bootrap 0 done!
cv score 0.128, cv auc score {'toxic': 0.71710678951681672, 'severe_toxic': 0.74179767777877748, 'obscene': 0.72052372893970651, 'threat': 0.64503414865299791, 'insult': 0.71807572980270229, 'identity_hate': 0.67398329690005776}
holdout score 0.139, holdout auc score {'toxic': 0.71775940863658305, 'severe_t