In [1]:
from numba import jit

import lightgbm as lgb
import pandas as pd
import numpy as np

from datetime import datetime
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [9]:
random_state = 42
np.random.seed(random_state)
df_train = pd.read_csv('data/train.csv').drop("ID_code",axis=1)
df_test = pd.read_csv('data/test.csv')

In [10]:
df_train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [11]:
df_train.head()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [12]:
# @jit
def augment(train, num_n=1, num_p=2):
    newtrain = [train]
    
    n = train[train.target == 0]
    for i in range(num_n):
        newtrain.append(n.apply(lambda x: x.values.take(np.random.permutation(len(n)))))
    
    for i in range(num_p):
        p = train[train.target > 0]
        newtrain.append(p.apply(lambda x: x.values.take(np.random.permutation(len(p)))))
    return pd.concat(newtrain)

In [13]:
param = {
   "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : 10,
    "verbosity" : 1,
}

In [57]:
X_train = df_train.copy()
X_train.drop('target', axis=1, inplace=True)

In [58]:
a = X_train.iloc[0]
print(a.abs().sum() - a.sum())

430.28959999999984


In [59]:
X_train['new_f'] = X_train.abs().sum(axis=1) - X_train.sum(axis=1)

In [61]:
X_test = df_test.drop("ID_code",axis=1).values
result = np.zeros(X_test.shape[0])
val_aucs = []


skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=random_state)
for counter,(train_index, valid_index) in enumerate(skf.split(df_train, df_train.target),1):
    print (counter)
    
    #Train data
    X_train = df_train.iloc[train_index]
    X_train = augment(X_train)
    train_dataset = lgb.Dataset(X_train.drop("target",axis=1), label=X_train.target)
    
    #Validation data
    X_val = df_train.iloc[valid_index]
    val_dataset = lgb.Dataset(X_val.drop("target",axis=1), label=X_val.target)
    
    #Training
    model = lgb.train(param, train_dataset, 1000, valid_sets=[val_dataset], verbose_eval=50)
    
    p_valid = model.predict(X_val.drop("target",axis=1))
    val_score = roc_auc_score(X_val.target, p_valid)
    val_aucs.append(val_score)
    
    result += model.predict(X_test)


1
[50]	valid_0's auc: 0.833533
[100]	valid_0's auc: 0.854914
[150]	valid_0's auc: 0.862656
[200]	valid_0's auc: 0.867639
[250]	valid_0's auc: 0.868043
[300]	valid_0's auc: 0.868612
[350]	valid_0's auc: 0.871151
[400]	valid_0's auc: 0.871721
[450]	valid_0's auc: 0.872754
[500]	valid_0's auc: 0.873783
[550]	valid_0's auc: 0.874598
[600]	valid_0's auc: 0.875344
[650]	valid_0's auc: 0.876206
[700]	valid_0's auc: 0.876553
[750]	valid_0's auc: 0.8776
[800]	valid_0's auc: 0.878474
[850]	valid_0's auc: 0.879233
[900]	valid_0's auc: 0.880406
[950]	valid_0's auc: 0.881005
[1000]	valid_0's auc: 0.881774
2
[50]	valid_0's auc: 0.827539
[100]	valid_0's auc: 0.847607
[150]	valid_0's auc: 0.858412
[200]	valid_0's auc: 0.861388
[250]	valid_0's auc: 0.862987
[300]	valid_0's auc: 0.863634
[350]	valid_0's auc: 0.868018
[400]	valid_0's auc: 0.869378
[450]	valid_0's auc: 0.870709
[500]	valid_0's auc: 0.871943
[550]	valid_0's auc: 0.872826
[600]	valid_0's auc: 0.873531
[650]	valid_0's auc: 0.874233
[700]	val

In [63]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
print('Mean auc: %.9f, std: %.9f' % (mean_auc, std_auc))

Mean auc: 0.880451702, std: 0.005131516


LB score is 0.880

Mean auc: 0.880620958, std: 0.003518397

In [64]:
sub_df = pd.DataFrame({'ID_code': df_test['ID_code'].values})
sub_df['target'] = result / counter
sub_df.to_csv('submission.csv', index=False)