In [26]:
from numba import jit

import lightgbm as lgb
import pandas as pd
import numpy as np

from datetime import datetime
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

In [27]:
random_state = 42
np.random.seed(random_state)
df_train = pd.read_csv('data/train.csv').drop("ID_code",axis=1)
df_test = pd.read_csv('data/test.csv')

In [12]:
# @jit
def augment(train, num_n=1, num_p=2):
    newtrain = [train]
    
    n = train[train.target == 0]
    for i in range(num_n):
        newtrain.append(n.apply(lambda x: x.values.take(np.random.permutation(len(n)))))
    
    for i in range(num_p):
        p = train[train.target > 0]
        newtrain.append(p.apply(lambda x: x.values.take(np.random.permutation(len(p)))))
    return pd.concat(newtrain)

In [13]:
param = {
   "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : 10,
    "verbosity" : 1,
}

In [19]:
X_test = df_test.drop("ID_code",axis=1).values
result = np.zeros(X_test.shape[0])
val_aucs = []


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
for counter,(train_index, valid_index) in enumerate(skf.split(df_train, df_train.target),1):
    print (counter)
    
    #Train data
    X_train = df_train.iloc[train_index]
    X_train = augment(X_train)
    train_dataset = lgb.Dataset(X_train.drop("target",axis=1), label=X_train.target)
    
    #Validation data
    X_val = df_train.iloc[valid_index]
    val_dataset = lgb.Dataset(X_val.drop("target",axis=1), label=X_val.target)
    
    #Training
    model = lgb.train(param, train_dataset, 1000, valid_sets=[val_dataset], verbose_eval=50)
    
    p_valid = model.predict(X_val.drop("target",axis=1))
    val_score = roc_auc_score(X_val.target, p_valid)
    val_aucs.append(val_score)
    
    result += model.predict(X_test)


1
[50]	valid_0's auc: 0.8293
[100]	valid_0's auc: 0.849992
[150]	valid_0's auc: 0.859488
[200]	valid_0's auc: 0.864044
[250]	valid_0's auc: 0.864273
[300]	valid_0's auc: 0.865931
[350]	valid_0's auc: 0.868991
[400]	valid_0's auc: 0.870172
[450]	valid_0's auc: 0.871215
[500]	valid_0's auc: 0.872231
[550]	valid_0's auc: 0.873358
[600]	valid_0's auc: 0.874269
[650]	valid_0's auc: 0.875194
[700]	valid_0's auc: 0.87542
[750]	valid_0's auc: 0.876655
[800]	valid_0's auc: 0.877433
[850]	valid_0's auc: 0.878334
[900]	valid_0's auc: 0.879377
[950]	valid_0's auc: 0.880116
[1000]	valid_0's auc: 0.880744
2
[50]	valid_0's auc: 0.833619
[100]	valid_0's auc: 0.852693
[150]	valid_0's auc: 0.861595
[200]	valid_0's auc: 0.866189
[250]	valid_0's auc: 0.867299
[300]	valid_0's auc: 0.868546
[350]	valid_0's auc: 0.872549
[400]	valid_0's auc: 0.873376
[450]	valid_0's auc: 0.874677
[500]	valid_0's auc: 0.875703
[550]	valid_0's auc: 0.87664
[600]	valid_0's auc: 0.877456
[650]	valid_0's auc: 0.87843
[700]	valid_

In [22]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
print('Mean auc: %.9f, std: %.9f' % (mean_auc, std_auc))

Mean auc: 0.880620958, std: 0.003518397


LB score is 0.880

In [25]:
sub_df = pd.DataFrame({'ID_code': df_test['ID_code'].values})
sub_df['target'] = result / counter
sub_df.to_csv('submission.csv', index=False)