In [13]:
import pandas as pd
import numpy as np
from contextlib import contextmanager
import os,sys,datetime, time
from sklearn.metrics import roc_auc_score
from sklearn import linear_model
from sklearn import svm

@contextmanager
def timer(name):
    """
    Context manager that reports how long the wrapped ``with`` body took.

    When the body finishes normally, prints ``[<name>] done in <N> s`` where
    N is the elapsed wall-clock time formatted with no decimals. Nothing is
    printed if the body raises, because the exception propagates out of the
    ``yield`` before the print is reached.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

# Number of cross-validation folds; fold indices 0..kfold-1 name the
# per-fold sub-directories that are read and written below.
kfold = 4

# Directory layout: level-1 model outputs are read from <base>/l1 and the
# stacked (level-2) outputs are written to <base>/l2.
DataBaseDir = '../../data/version2'
InputDir = f'{DataBaseDir}/l1'
OutputDir = f'{DataBaseDir}/l2'

# Label columns; one stacking model is fitted per label.
targets = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Prefix used for this stacker's prediction columns and output file names.
strategy = 'l1-lr'

# Level-1 models whose '<model>_<target>' columns are the stacking features.
meta_models = [
    'nbsvm',
    'bi-gru',
    'lr',
    'fm-ftrl',
    'bi-gru-glove',
    'bi-gru-cnn',
    'lgb',
]

# NOTE(review): not referenced anywhere in this cell -- presumably the
# non-feature columns; confirm whether other cells still use it.
drop_cols = ['id', 'fold']

# load data
# For every fold, read each meta model's valid/test CSV and gather the
# '<model>_<target>' prediction columns into one frame per fold.
valid_dfs = []
test_dfs = []
with timer('load data'):
    for fold in range(kfold):
        FoldInputDir = '%s/kfold/%s' % (InputDir, fold)
        for i, model_name in enumerate(meta_models):
            valid_path = '%s/valid_%s.csv' % (FoldInputDir, model_name)
            test_path = '%s/test_%s.csv' % (FoldInputDir, model_name)
            valid = pd.read_csv(valid_path).reset_index(drop=True)
            test = pd.read_csv(test_path).reset_index(drop=True)
            if i == 0:
                # The first model's frames are kept whole and become the
                # base that the other models' columns are attached to.
                FoldValid, FoldTest = valid, test
                continue
            # Later models contribute only their per-target prediction columns.
            # Assignment aligns on the freshly reset index -- assumes every
            # model's file lists the rows in the same order (TODO confirm).
            for t in targets:
                target = '%s_%s' % (model_name, t)
                FoldValid[target] = valid[target].copy()
                FoldTest[target] = test[target].copy()
        # Tag validation rows with their fold so they can be split again
        # after concatenation.
        FoldValid['fold'] = fold
        valid_dfs.append(FoldValid)
        test_dfs.append(FoldTest)
        print('fold %s done.' % fold)
    # Stack the per-fold validation frames into one training table.
    TrainData = pd.concat(valid_dfs, axis=0, ignore_index=True)
    print('load data done, train %s' % (len(TrainData)))

## CV score
# Running sum of the per-fold validation AUC; divided by kfold after the loop.
cv_score = .0
# predict
# NOTE(review): y_test_pred is never read in this cell; kept in case other
# cells reference it.
y_test_pred = 0
pred_targets = ['%s_%s' % (strategy, t) for t in targets]
# Accumulator for test predictions: starts at zero, every fold's model adds
# its prediction, and the sum is divided by kfold at the end.
TestData = test_dfs[0][['id']].copy()
for t in pred_targets:
    TestData[t] = .0
for fold in range(kfold):
    FoldData = {
        # Read-only below, so the filtered selection is used as-is.
        'train': TrainData[TrainData['fold'] != fold],
        # Prediction columns are written into this frame; take an explicit
        # copy so the writes do not hit a filtered slice of TrainData
        # (which raises pandas' SettingWithCopyWarning).
        'valid': TrainData[TrainData['fold'] == fold].copy(),
        'test': test_dfs[fold].copy()
    }
    # Create the prediction columns up front so they exist for every target.
    for t in pred_targets:
        FoldData['valid'][t] = .0
        FoldData['test'][t] = .0
    for target in targets:
        with timer('train for %s' % target):
            # Features: every meta model's prediction for this one target.
            meta_feats = ['%s_%s' % (c, target) for c in meta_models]
            # train
            model = linear_model.LogisticRegression(C=0.00000004,
                                                    max_iter=100,
                                                    tol=1e-6,
                                                    class_weight='balanced')#, warm_start= True)
#             model = svm.SVC(C= 100, class_weight= 'balanced', max_iter= 10, probability= True)
            model.fit(FoldData['train'][meta_feats], FoldData['train'][target])
            # for valid: column 1 of predict_proba is the positive class
            pred_target = '%s_%s' % (strategy, target)
            FoldData['valid'][pred_target] = model.predict_proba(FoldData['valid'][meta_feats])[:, 1]
            # for test
            pred_test = model.predict_proba(FoldData['test'][meta_feats])[:, 1]
            FoldData['test'][pred_target] = pred_test
            # Added positionally -- assumes every fold's test frame has the
            # same row order as test_dfs[0] (TODO confirm).
            TestData[pred_target] += pred_test

    # One AUC over all targets at once for this fold's held-out rows.
    score = roc_auc_score(FoldData['valid'][targets], FoldData['valid'][pred_targets])
    cv_score += score

    ## output
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(FoldOutputDir, exist_ok=True)
    for mod in ['valid', 'test']:
        if mod == 'test':
            # Test rows have no labels: write id + predictions.
            out_cols = ['id'] + pred_targets
        else:
            # Validation rows: write predictions followed by the true labels.
            out_cols = pred_targets + targets
        FoldData[mod][out_cols].to_csv('%s/%s_%s.csv' % (FoldOutputDir, mod, strategy), float_format='%.8f', index=False)
    print('fold %s, score %.5f' % (fold, score))

# Turn the summed test predictions into the mean over folds.
for t in pred_targets:
    TestData[t] /= kfold
cv_score /= kfold  # mean of the per-fold validation AUC scores
print('\n================')
print('cv score %.5f' % (cv_score))
print('================')

## submit
# Build the submission frame: ids plus the fold-averaged predictions stored
# under the plain target names instead of the '<strategy>_<target>' names.
sub = TestData[['id']].copy()
sub[targets] = TestData[pred_targets]
# File name carries the run date only, so a rerun on the same day overwrites
# the previous submission.
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
# NOTE(review): the submission is written under InputDir (l1), not OutputDir
# (l2) -- confirm this is the intended location.
SubmitDir = '%s/submit' % InputDir
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(SubmitDir, exist_ok=True)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.8f', index=False)
# Build the shell command once so the echoed text is exactly what is run.
# Paths come from the constants above; they are not quoted, so they must not
# contain spaces or shell metacharacters.
zip_cmd = 'zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName)
print(zip_cmd)
os.system(zip_cmd)

fold 0 done.
fold 1 done.
fold 2 done.
fold 3 done.
load data done, train 159571
[load data] done in 9 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[train for toxic] done in 0 s
[train for severe_toxic] done in 0 s
[train for obscene] done in 0 s
[train for threat] done in 0 s
[train for insult] done in 0 s
[train for identity_hate] done in 0 s
fold 0, score 0.99082
[train for toxic] done in 0 s
[train for severe_toxic] done in 0 s
[train for obscene] done in 0 s
[train for threat] done in 0 s
[train for insult] done in 0 s
[train for identity_hate] done in 0 s
fold 1, score 0.99052
[train for toxic] done in 0 s
[train for severe_toxic] done in 0 s
[train for obscene] done in 0 s
[train for threat] done in 0 s
[train for insult] done in 0 s
[train for identity_hate] done in 0 s
fold 2, score 0.99014
[train for toxic] done in 0 s
[train for severe_toxic] done in 0 s
[train for obscene] done in 0 s
[train for threat] done in 0 s
[train for insult] done in 0 s
[train for identity_hate] done in 0 s
fold 3, score 0.99134

cv score 0.99071
zip ../../data/version2/l1/submit/l1-lr_submit_2018-03-11.zip ../../data/version2/l1/submit/l1-lr_

0