In [None]:
##################################
#Forward Greedy Feature selection#
##################################

import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
from itertools import combinations
import math
from sklearn.metrics import roc_auc_score
from scipy.special import erfinv
from contextlib import contextmanager

@contextmanager
def timer(name):
    """
    Report the wall-clock duration of the enclosed ``with`` body.

    When the body finishes without raising, prints ``[name] done in N s``
    where N is the elapsed time formatted with zero decimals.  Nothing is
    printed when the body raises, because the code after ``yield`` is then
    never reached.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

# ------------------------------------------------------------------
# Run configuration
# ------------------------------------------------------------------
DataBaseDir = '../../data/version2'
InputDir = '%s/l1/kfold' % DataBaseDir
# MetaInputDir = '%s/meta/kfold' % DataBaseDir
kfold = 4
seed_num = 1
verbose = True
has_snapshot = False
attention = 'meta'
datestr = '%s' % datetime.datetime.now().strftime("%Y-%m-%d")
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
strategy = 'lr'
meta_models = ['bi-gru', 'bi-lstm', 'bi-gru-cnn',
               'fm-ftrl', 'lgb', 'lr', 'nbsvm', 'bi-lstm-attention', 'bi-gru-capsule', 'bi-lstm-cnn-2']
drop_cols = ['id', 'fold']

# ------------------------------------------------------------------
# Load the per-fold validation predictions and numeric features
# ------------------------------------------------------------------
num_cols = []
valid_dfs = []
# Columns of the numeric-feature file that are never copied as features.
excluded_num_cols = drop_cols + targets
with timer('load data'):
    for fold in range(kfold):
        FoldInputDir = '%s/%s' % (InputDir, fold)
        for model_idx, model_name in enumerate(meta_models):
            valid_path = '%s/valid_%s.csv' % (FoldInputDir, model_name)
            valid = pd.read_csv(valid_path).reset_index(drop=True)
            if model_idx == 0:
                # The first model's frame is kept whole and becomes the fold's base frame.
                FoldValid = valid
                continue
            # From every later model only its '<model>_<target>' columns are copied over.
            for t in targets:
                target = '%s_%s' % (model_name, t)
                FoldValid[target] = valid[target].copy()

        # Numeric features live in a separate file; both frames carry a fresh
        # 0..n-1 index, so the column assignment joins them by row position.
        NumFoldInputDir = '%s/num/kfold/%s' % (DataBaseDir, fold)
        NumValid = pd.read_csv('%s/valid.csv' % NumFoldInputDir).reset_index(drop=True)
        TmpNumCols = [c for c in NumValid.columns if c not in excluded_num_cols]
        for c in TmpNumCols:
            FoldValid[c] = NumValid[c]
        # Overwritten on every fold: the list from the last fold is the one kept.
        num_cols = TmpNumCols

        FoldValid['fold'] = fold
        valid_dfs.append(FoldValid)
        print('fold %s done.' % fold)
    TrainData = pd.concat(valid_dfs, axis=0, ignore_index=True)
    print('load data done, train %s' % (len(TrainData)))
print('==================================')
print('loading data done.')
print('==================================\n')

####
# Candidate pool for the greedy search: only the numeric features are searched.
all_feats = []
#all_feats.extend(['meta_%s' % (c) for c in meta_models])
all_feats.extend(num_cols)


def _gauss_rank(column, epsilon=1e-6):
    """Return the GaussRank transform of a numeric pandas Series.

    Steps: rank the values (ties share their average rank), rescale the
    ranks linearly onto [-1, 1], pull the end points inwards by ``epsilon``
    so that ``erfinv`` stays finite, apply ``erfinv`` and centre the result
    on zero.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to transform.  Missing values stay missing.
    epsilon : float, optional
        Margin kept between the scaled ranks and -1 / +1.

    Returns
    -------
    pandas.Series
        Transformed values on the index of ``column``.  A column without at
        least two distinct ranks (constant, empty or all missing) carries no
        ordering information and is returned as all zeros.
    """
    ranks = column.rank(method='average')
    lowest, highest = ranks.min(), ranks.max()
    if not highest > lowest:
        return pd.Series(0.0, index=column.index)
    # linear normalization of the ranks to 0-1, then to the open interval (-1, 1)
    unit = (ranks - lowest) / (highest - lowest)
    symmetric = (2.0 * unit - 1.0).clip(-1.0 + epsilon, 1.0 - epsilon)
    # gauss normalization
    gaussian = pd.Series(erfinv(symmetric.values.astype(float)), index=column.index)
    return gaussian - gaussian.mean()


## GaussRank Normalization
# np.argsort yields sort positions rather than ranks, so the ranks are taken
# from Series.rank inside the helper and the raw values are never fed to erfinv.
for c in num_cols:
    TrainData[c] = _gauss_rank(TrainData[c])

pred_targets = ['%s_%s' % (strategy, t) for t in targets]
def EvaluateFeature(cur_feat, feats):
    """Cross-validated ROC AUC of a linear stacker restricted to ``feats``.

    For every fold in ``range(kfold)`` one ``SGDClassifier`` per target is
    fitted on the rows of ``TrainData`` outside the fold and scored on the
    rows inside it.  The fold score is ``roc_auc_score`` over all targets at
    once; the function returns the mean of the fold scores.

    Parameters
    ----------
    cur_feat : str
        Name of the candidate being evaluated.  Not used by the computation;
        kept so existing callers keep working.
    feats : iterable of str
        Feature identifiers.  A name starting with ``'meta'`` is expanded to
        the per-target column ``'<name[5:]>_<target>'``; any other name is
        used as a column of ``TrainData`` directly.

    Returns
    -------
    float
        Sum of the fold scores divided by ``kfold``.
    """
    pred_cols = ['%s_%s' % (strategy, t) for t in targets]
    cv_score = .0
    for fold in range(kfold):
        in_fold = TrainData['fold'] == fold
        train_part = TrainData[~in_fold]
        valid_part = TrainData[in_fold]
        # Predictions go into their own frame: assigning new columns onto a
        # filtered slice of TrainData triggers pandas' SettingWithCopyWarning.
        fold_preds = pd.DataFrame(index=valid_part.index)
        for target in targets:
            cols = [
                '%s_%s' % (c[5:], target) if c.startswith('meta') else c
                for c in feats
            ]
            # random_state pins the SGD shuffling so that scores of different
            # candidates are compared under identical training conditions.
            # NOTE(review): recent scikit-learn releases spell this loss
            # 'log_loss' - confirm against the installed version.
            model = linear_model.SGDClassifier(loss='log',
                                               alpha=0.004,
                                               l1_ratio=0.15,
                                               max_iter=5,
                                               random_state=seed_num)
            model.fit(train_part[cols], train_part[target])
            pred_target = '%s_%s' % (strategy, target)
            fold_preds[pred_target] = model.predict_proba(valid_part[cols])[:, 1]
        cv_score += roc_auc_score(valid_part[targets], fold_preds[pred_cols])
    return cv_score / kfold

print('====================================')
print('total features size %s, sample size %s' % (len(all_feats), len(TrainData)))
print('==================================\n')


def _save_snapshot(output_dir, features, history):
    """Write the selected features and the score history to their dated text files."""
    with open('%s/good_features_%s.txt' % (output_dir, datestr), 'w') as o_feat:
        o_feat.writelines('%s\n' % feat for feat in features)
    with open('%s/score_history_%s.txt' % (output_dir, datestr), 'w') as o_score:
        o_score.writelines('%s,%s\n' % (str(score), feat) for score, feat in history)


start = time.time()
score_history = []
good_features = set()
OutputDir = '%s/gfs/%s' % (DataBaseDir, attention)
os.makedirs(OutputDir, exist_ok=True)
if has_snapshot:
    # Resume from the snapshot files written under today's date stamp.
    with open('%s/good_features_%s.txt' % (OutputDir, datestr), 'r') as o_feat:
        good_features.update(line.rstrip() for line in o_feat if line.strip())
    with open('%s/score_history_%s.txt' % (OutputDir, datestr), 'r') as o_score:
        for line in o_score:
            if not line.strip():
                continue
            # Split on the first comma only, so a feature name may contain commas.
            score_text, feat_name = line.rstrip().split(',', 1)
            score_history.append((float(score_text), feat_name))
    print('loading good feature snapshot done.')

# Forward greedy search: each round adds the candidate with the best CV score
# and the search stops once the latest round no longer beats the previous one.
while len(score_history) < 2 or score_history[-1][0] > score_history[-2][0]:
    scores = []
    for feature in all_feats:
        if feature in good_features:
            continue
        # Sorted so the column order does not depend on set iteration order.
        selected_features = sorted(good_features) + [feature]
        # Only 'meta_'-prefixed names carry a prefix worth stripping.
        label = feature[5:] if feature.startswith('meta') else feature
        scores.append((EvaluateFeature(label, selected_features), feature))
    if not scores:
        # Every candidate has already been selected.
        break
    selected = max(scores)
    current_score, current_feat = selected
    good_features.add(current_feat)
    score_history.append(selected)
    end = time.time()
    if verbose:
        improved_score = .0
        if len(score_history) > 1:
            improved_score = score_history[-1][0] - score_history[-2][0]
        print('====================================')
        print('Current master %s, improve score %.5f, time elapsed %.2fs' % (current_feat, improved_score, (end - start)))
        print('====================================\n')
    # Snapshot after every round so an interrupted run can be resumed.
    _save_snapshot(OutputDir, good_features, score_history)

# Remove the last added feature if necessary.  The length guard covers runs
# with fewer than two rounds; discard() covers a resumed snapshot in which
# that feature had already been removed.
if len(score_history) > 1 and score_history[-1][0] < score_history[-2][0]:
    good_features.discard(score_history[-1][1])
good_features = sorted(good_features)
if verbose:
    print("Selected Features : ", good_features)

_save_snapshot(OutputDir, good_features, score_history)

fold 0 done.
fold 1 done.
fold 2 done.
fold 3 done.
load data done, train 159571
[load data] done in 4 s
loading data done.

total features size 44, sample size 159571



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
