In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics

In [2]:
random_state = 42
np.random.seed(random_state)

In [3]:
def gen_fake_norm_dateset(column_size=20, instance_size=100000):
    """
    Input size: total batch size
    Distribution: gen a fake dataset for test, 20 coloumns is normal distributaion.
    """
    dataset = {}
    for i in range(column_size):
        dataset['col_{}'.format(i)] = np.random.normal(0,1,instance_size)
    df = pd.DataFrame(dataset)
    train = df[:instance_size//2]
    test = df[instance_size//2:]
    # add drift to column 0
    test['col_0'] += np.random.normal(0.1,0.5,len(test))
    return train, test

In [4]:
batch1, batch2 = gen_fake_norm_dateset()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [5]:
def train_test_split(X, y, test_size, random_state=2018):
    """
    split data to train and test
    """
    sss = list(StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state).split(X, y))
    X_train = np.take(X, sss[0][0], axis=0)
    X_test = np.take(X, sss[0][1], axis=0)
    y_train = np.take(y, sss[0][0], axis=0)
    y_test = np.take(y, sss[0][1], axis=0)
    return [X_train, X_test, y_train, y_test]

In [13]:
def get_fea_importance(clf, feature_name):
    """
    get feature importance from lightGBM
    """
    gain = clf.feature_importance('gain')
    importance_df = pd.DataFrame({
        'feature':clf.feature_name(),
        'split': clf.feature_importance('split'),
        'gain': gain, # * gain / gain.sum(),
        'gain_percent':100 *gain / gain.sum(),
        }).sort_values('gain',ascending=False)
    return importance_df

In [14]:
def adversial_validation(batch1, batch2):
    """
    split two batch to get importance
    """
    feature_name = list(batch1.columns)
    train_X = batch1
    train_Y = np.ones(train_X.shape[0])
    test_X = batch2
    test_Y = np.zeros(test_X.shape[0])
    X = np.concatenate((train_X.values,test_X.values),axis=0)
    y = np.concatenate((train_Y,test_Y),axis=0)
    test_size = int(len(X)/5) 
    X, X_test, y, y_test = train_test_split(X, y, test_size, random_state = 42)
    para = {
        'num_leaves': 6,
        'learning_rate': 0.1,
        'bagging_fraction': 0.2, 
        'feature_fraction': 0.5,
        'max_depth': 3, 
        "objective": "binary", 
        "metric":"auc", 
        'verbose': -1, 
        "seed": 42, 
        'num_threads': 8,
    }
    lgb_train = lgb.Dataset(X, y, free_raw_data=True)
    lgb_val = lgb.Dataset(X_test, y_test, free_raw_data=True, reference=lgb_train)
    lgb_model = lgb.train(para, lgb_train, valid_sets=lgb_val, valid_names='eval',feature_name=feature_name,
                                verbose_eval=False, early_stopping_rounds=10, num_boost_round=50)
    fpr, tpr, thresholds = metrics.roc_curve(
        y_test, lgb_model.predict(X_test, num_iteration = lgb_model.best_iteration))
    auc = metrics.auc(fpr, tpr)
    print("----Adversial Score is {}------".format(auc))
    fea_importance_adversial = get_fea_importance(lgb_model, feature_name)
    print(fea_importance_adversial.head(10))
    return fea_importance_adversial, auc

### get the batch split result, feature importance and auc

In [15]:
fea_imp, auc_true = adversial_validation(batch1, batch2)

----Adversial Score is 0.5430079299999999------
   feature         gain  gain_percent  split
0    col_0  1535.210706     82.929543     13
8   col_16    39.181170      2.116502      5
6   col_14    37.343861      2.017254      4
7   col_15    32.069911      1.732364      3
19   col_9    29.686660      1.603624      3
4   col_12    27.492129      1.485079      3
13   col_3    24.361560      1.315971      3
17   col_7    23.504041      1.269649      2
18   col_8    22.153960      1.196720      3
3   col_11    20.220530      1.092280      3


### Estimate the threshold. We could run more to get a distribution 

In [17]:
estimate_thres_auc = []
estimate_thres_gain = []
for i in range(5):
    len_batch1 = len(batch1) 
    base_df = batch1.append(batch2).reset_index(drop = False).sample(frac=1)
    fea_base, auc_base = adversial_validation(base_df[:len_batch1], base_df[len_batch1:])
    estimate_thres_auc.append(auc_base)
    estimate_thres_gain.append(fea_base['gain'].values[0])

----Adversial Score is 0.5032942699999999------
   feature       gain  gain_percent  split
6   col_13  28.866321     56.871471      3
4   col_11  11.377600     22.415771      1
1    col_0  10.513200     20.712759      1
0    index   0.000000      0.000000      0
12  col_19   0.000000      0.000000      0
19   col_8   0.000000      0.000000      0
18   col_7   0.000000      0.000000      0
17   col_6   0.000000      0.000000      0
16   col_5   0.000000      0.000000      0
15   col_4   0.000000      0.000000      0
----Adversial Score is 0.49869012499999993------
   feature       gain  gain_percent  split
14   col_3  27.136200     29.117084      2
7   col_14  15.723741     16.871540      2
12  col_19  11.812400     12.674680      1
11  col_18   8.082130      8.672109      1
13   col_2   8.061890      8.650390      1
8   col_15   7.570190      8.122798      1
15   col_4   7.545360      8.096155      1
1    col_0   7.264920      7.795244      1
0    index   0.000000      0.000000      0


In [18]:
#auc threashold
np.mean(estimate_thres_auc)

0.5026494249999999

In [19]:
# drift threashold
np.mean(estimate_thres_gain)

23.750864219665527