In [1]:
from numba import jit

import lightgbm as lgb
import pandas as pd
import numpy as np

from datetime import datetime
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [2]:
random_state = 42
np.random.seed(random_state)
df_train = pd.read_csv('data/train.csv').drop("ID_code",axis=1)
df_test = pd.read_csv('data/test.csv')

In [3]:
df_train.shape, df_test.shape

((200000, 201), (200000, 201))

In [4]:
df_train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [5]:
df_train.head()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [6]:
# # @jit
# def augment(train, num_n=1, num_p=2):
#     newtrain = [train]
    
#     n = train[train.target == 0]
#     for i in range(num_n):
#         newtrain.append(n.apply(lambda x: x.values.take(np.random.permutation(len(n)))))
    
#     for i in range(num_p):
#         p = train[train.target > 0]
#         newtrain.append(p.apply(lambda x: x.values.take(np.random.permutation(len(p)))))
#     return pd.concat(newtrain)

In [7]:
# def shuffle_col_vals(x1):
#     rand_x = np.array([np.random.choice(x1.shape[0], size=x1.shape[0], replace=False) for i in range(x1.shape[1])]).T
#     grid = np.indices(x1.shape)
#     rand_y = grid[1]
#     return x1[(rand_x, rand_y)]

# @jit
# def augment(x,y,t=2):
#     xs,xn = [],[]
#     for i in range(t):
#         mask = y>0
#         x1 = x[mask].copy()
#         x1 = shuffle_col_vals(x1)
#         xs.append(x1)

#     for i in range(t//2):
#         mask = y==0
#         x1 = x[mask].copy()
#         x1 = shuffle_col_vals(x1)
#         xn.append(x1)

#     xs = np.vstack(xs); xn = np.vstack(xn)
#     ys = np.ones(xs.shape[0]);yn = np.zeros(xn.shape[0])
#     x = np.vstack([x,xs,xn]); y = np.concatenate([y,ys,yn])
#     return x,y

In [8]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [9]:
param = {
   "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : 10,
    "verbosity" : 1,
}

In [10]:
# X_train = df_train.copy()
# X_train.drop('target', axis=1, inplace=True)

In [11]:
# X_train['new_f'] = X_train.abs().sum(axis=1) - X_train.sum(axis=1)

In [12]:
cat_features = ['var_12','var_13','var_108','var_126','var_68']
features = [col for col in df_train.columns if col not in ['target', 'ID_code']]
X_test = df_test[features].values

In [13]:
def decode_cat_features(data, cat_features):
    for f in cat_features:
        hist, bin_edges = np.histogram(data[f].values, bins=1000, density=True)
        data['test_' + f] = [hist[np.searchsorted(bin_edges, elem) - 1] for elem in data[f].values]

In [14]:
result = np.zeros(X_test.shape[0])
val_aucs = []


skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=random_state)
for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, df_train['target'])):
    print(fold)
    
    X_train, y_train = df_train.iloc[trn_idx][features], df_train.iloc[trn_idx]['target']
    X_valid, y_valid = df_train.iloc[val_idx][features], df_train.iloc[val_idx]['target']
    
    X_t, y_t = augment(X_train.values, y_train.values)
    X_t = pd.DataFrame(X_t)
    X_t = X_t.add_prefix('var_')
    
#     decode_cat_features(X_t, cat_features)
#     decode_cat_features(X_valid, cat_features)
    #Train data
    train_dataset = lgb.Dataset(X_t, label=y_t, categorical_feature=cat_features)
    
    #Validation data
    val_dataset = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)
    
    #Training
    model = lgb.train(param, train_dataset, 1000, valid_sets=[val_dataset], verbose_eval=50)
    
    p_valid = model.predict(X_valid)
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    result += model.predict(X_test)


0




[50]	valid_0's auc: 0.821342
[100]	valid_0's auc: 0.854616
[150]	valid_0's auc: 0.862857
[200]	valid_0's auc: 0.865884
[250]	valid_0's auc: 0.870943
[300]	valid_0's auc: 0.87168
[350]	valid_0's auc: 0.872938
[400]	valid_0's auc: 0.874216
[450]	valid_0's auc: 0.876016
[500]	valid_0's auc: 0.87703
[550]	valid_0's auc: 0.878479
[600]	valid_0's auc: 0.879556
[650]	valid_0's auc: 0.880614
[700]	valid_0's auc: 0.881174
[750]	valid_0's auc: 0.881921
[800]	valid_0's auc: 0.882654
[850]	valid_0's auc: 0.883099
[900]	valid_0's auc: 0.883697
[950]	valid_0's auc: 0.884168
[1000]	valid_0's auc: 0.88452
1
[50]	valid_0's auc: 0.811002
[100]	valid_0's auc: 0.847466
[150]	valid_0's auc: 0.857514
[200]	valid_0's auc: 0.858922
[250]	valid_0's auc: 0.861787
[300]	valid_0's auc: 0.863036
[350]	valid_0's auc: 0.865033
[400]	valid_0's auc: 0.866733
[450]	valid_0's auc: 0.868926
[500]	valid_0's auc: 0.870228
[550]	valid_0's auc: 0.871735
[600]	valid_0's auc: 0.87319
[650]	valid_0's auc: 0.87416
[700]	valid_0'

In [16]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
print('Mean auc: %.9f, std: %.9f' % (mean_auc, std_auc))

Mean auc: 0.881015453, std: 0.004628135


best LB score of this model is 0.880

CV os this model Mean auc: 0.880620958, std: 0.003518397

In [10]:
sub_df = pd.DataFrame({'ID_code': df_test['ID_code'].values})
sub_df['target'] = result / counter
sub_df.to_csv('submission.csv', index=False)