This code was provided by [Giba](https://www.kaggle.com/titericz) who is one of top-3 (2/109643) kagglers.

[Giba Single Model Public 0.9245/ Private 0.9234 | Kaggle](https://www.kaggle.com/titericz/giba-single-model-public-0-9245-private-0-9234)

In [2]:
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata

In [5]:
data_folder = 'data'

train_df, test_df = pd.read_csv(data_folder+'/train.csv'), pd.read_csv(data_folder+'/test.csv')

In [6]:
features = [x for x in train_df.columns if x.startswith("var")]

In [7]:
#Reverse features
for var in features:
    if np.corrcoef( train_df['target'], train_df[var] )[1][0] < 0:
        train_df[var] = train_df[var] * -1
        test_df[var]  = test_df[var]  * -1

In [8]:
#count all values
var_stats = {}
hist_df = pd.DataFrame()

for var in features:
    var_stats = train_df[var].append(test_df[var]).value_counts()
    hist_df[var] = pd.Series(test_df[var]).map(var_stats)
    hist_df[var] = hist_df[var] > 1

#remove fake test rows
ind = hist_df.sum(axis=1) != 200

In [9]:
#recount values without fake rows
var_stats = {}
for var in features:
    var_stats[var] = train_df[var].append(test_df[ind][var]).value_counts()

In [10]:
def logit(p):
    return np.log(p) - np.log(1 - p)

def var_to_feat(vr, var_stats, feat_id ):
    new_df = pd.DataFrame()
    new_df["var"] = vr.values
    new_df["hist"] = pd.Series(vr).map(var_stats)
    new_df["feature_id"] = feat_id
    new_df["var_rank"] = new_df["var"].rank()/200000.
    return new_df.values

In [11]:
TARGET = np.array( list(train_df['target'].values) * 200 )

TRAIN = []
var_mean = {}
var_var  = {}
for var in features:
    tmp = var_to_feat(train_df[var], var_stats[var], int(var[4:]) )
    var_mean[var] = np.mean(tmp[:,0]) 
    var_var[var]  = np.var(tmp[:,0])
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    TRAIN.append( tmp )
TRAIN = np.vstack( TRAIN )

del train_df
_=gc.collect()

print( TRAIN.shape, len( TARGET ) )

(40000000, 4) 40000000


In [13]:
model = lgb.LGBMClassifier(**{
     'learning_rate': 0.04,
     'num_leaves': 31,
     'max_bin': 1023,
     'min_child_samples': 1000,
     'reg_alpha': 0.1,
     'reg_lambda': 0.2,
     'feature_fraction': 1.0,
     'bagging_freq': 1,
     'bagging_fraction': 0.85,
     'objective': 'binary',
     'n_jobs': -1,
     'n_estimators':200,})

MODELS = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=11111)
for fold_, (train_indexes, valid_indexes) in enumerate(skf.split(TRAIN, TARGET)):
    print('Fold:', fold_ )
    model = model.fit( TRAIN[train_indexes], TARGET[train_indexes],
                      eval_set = (TRAIN[valid_indexes], TARGET[valid_indexes]),
                      verbose = 10,
                      eval_metric='auc',
                      early_stopping_rounds=25,
                      categorical_feature = [2] )
    MODELS.append( model )

del TRAIN, TARGET
_=gc.collect()

Fold: 0


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325586	valid_0's auc: 0.528006
[20]	valid_0's binary_logloss: 0.325356	valid_0's auc: 0.528165
[30]	valid_0's binary_logloss: 0.325252	valid_0's auc: 0.528277
[40]	valid_0's binary_logloss: 0.325202	valid_0's auc: 0.528362
[50]	valid_0's binary_logloss: 0.325176	valid_0's auc: 0.528408
[60]	valid_0's binary_logloss: 0.325162	valid_0's auc: 0.528432
[70]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.528396
[80]	valid_0's binary_logloss: 0.325151	valid_0's auc: 0.528365
Early stopping, best iteration is:
[57]	valid_0's binary_logloss: 0.325165	valid_0's auc: 0.528438
Fold: 1


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325591	valid_0's auc: 0.526915
[20]	valid_0's binary_logloss: 0.325363	valid_0's auc: 0.527153
[30]	valid_0's binary_logloss: 0.325262	valid_0's auc: 0.527289
[40]	valid_0's binary_logloss: 0.325212	valid_0's auc: 0.527352
[50]	valid_0's binary_logloss: 0.325187	valid_0's auc: 0.527413
[60]	valid_0's binary_logloss: 0.325174	valid_0's auc: 0.527445
[70]	valid_0's binary_logloss: 0.325166	valid_0's auc: 0.527479
[80]	valid_0's binary_logloss: 0.325161	valid_0's auc: 0.527457
[90]	valid_0's binary_logloss: 0.325159	valid_0's auc: 0.527472
[100]	valid_0's binary_logloss: 0.325157	valid_0's auc: 0.527465
[110]	valid_0's binary_logloss: 0.325156	valid_0's auc: 0.527468
Early stopping, best iteration is:
[92]	valid_0's binary_logloss: 0.325158	valid_0's auc: 0.52748
Fold: 2


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325584	valid_0's auc: 0.528265
[20]	valid_0's binary_logloss: 0.325349	valid_0's auc: 0.528566
[30]	valid_0's binary_logloss: 0.325243	valid_0's auc: 0.528682
[40]	valid_0's binary_logloss: 0.325193	valid_0's auc: 0.528655
[50]	valid_0's binary_logloss: 0.325168	valid_0's auc: 0.528676
Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.325243	valid_0's auc: 0.528682
Fold: 3


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325585	valid_0's auc: 0.527374
[20]	valid_0's binary_logloss: 0.325355	valid_0's auc: 0.527708
[30]	valid_0's binary_logloss: 0.325254	valid_0's auc: 0.52786
[40]	valid_0's binary_logloss: 0.325207	valid_0's auc: 0.527901
[50]	valid_0's binary_logloss: 0.325182	valid_0's auc: 0.52792
[60]	valid_0's binary_logloss: 0.325169	valid_0's auc: 0.527955
[70]	valid_0's binary_logloss: 0.325161	valid_0's auc: 0.527955
[80]	valid_0's binary_logloss: 0.325157	valid_0's auc: 0.527955
[90]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.527993
[100]	valid_0's binary_logloss: 0.325152	valid_0's auc: 0.528004
[110]	valid_0's binary_logloss: 0.325151	valid_0's auc: 0.528
[120]	valid_0's binary_logloss: 0.32515	valid_0's auc: 0.528001
[130]	valid_0's binary_logloss: 0.32515	valid_0's auc: 0.528009
[140]	valid_0's binary_logloss: 0.325149	valid_0's auc: 0.528014
[150]	valid_0's binary_logloss: 0.325149	val

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.32558	valid_0's auc: 0.528613
[20]	valid_0's binary_logloss: 0.325342	valid_0's auc: 0.528906
[30]	valid_0's binary_logloss: 0.325234	valid_0's auc: 0.529017
[40]	valid_0's binary_logloss: 0.325181	valid_0's auc: 0.529075
[50]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.529135
[60]	valid_0's binary_logloss: 0.32514	valid_0's auc: 0.529146
[70]	valid_0's binary_logloss: 0.325131	valid_0's auc: 0.529168
[80]	valid_0's binary_logloss: 0.325126	valid_0's auc: 0.529188
[90]	valid_0's binary_logloss: 0.325123	valid_0's auc: 0.529205
[100]	valid_0's binary_logloss: 0.325121	valid_0's auc: 0.529194
[110]	valid_0's binary_logloss: 0.32512	valid_0's auc: 0.529181
Early stopping, best iteration is:
[94]	valid_0's binary_logloss: 0.325122	valid_0's auc: 0.529214
Fold: 5


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325597	valid_0's auc: 0.528167
[20]	valid_0's binary_logloss: 0.325371	valid_0's auc: 0.528415
[30]	valid_0's binary_logloss: 0.325268	valid_0's auc: 0.528583
[40]	valid_0's binary_logloss: 0.32522	valid_0's auc: 0.528617
[50]	valid_0's binary_logloss: 0.325195	valid_0's auc: 0.528716
[60]	valid_0's binary_logloss: 0.325182	valid_0's auc: 0.528732
[70]	valid_0's binary_logloss: 0.325175	valid_0's auc: 0.528742
[80]	valid_0's binary_logloss: 0.32517	valid_0's auc: 0.528767
[90]	valid_0's binary_logloss: 0.325168	valid_0's auc: 0.528754
[100]	valid_0's binary_logloss: 0.325166	valid_0's auc: 0.528737
Early stopping, best iteration is:
[78]	valid_0's binary_logloss: 0.325171	valid_0's auc: 0.528776
Fold: 6


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325577	valid_0's auc: 0.528617
[20]	valid_0's binary_logloss: 0.325339	valid_0's auc: 0.528893
[30]	valid_0's binary_logloss: 0.325232	valid_0's auc: 0.529009
[40]	valid_0's binary_logloss: 0.325179	valid_0's auc: 0.5291
[50]	valid_0's binary_logloss: 0.325153	valid_0's auc: 0.529101
[60]	valid_0's binary_logloss: 0.325138	valid_0's auc: 0.529077
[70]	valid_0's binary_logloss: 0.325129	valid_0's auc: 0.529082
Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.325162	valid_0's auc: 0.529112
Fold: 7


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325588	valid_0's auc: 0.52843
[20]	valid_0's binary_logloss: 0.325357	valid_0's auc: 0.528756
[30]	valid_0's binary_logloss: 0.325256	valid_0's auc: 0.52879
[40]	valid_0's binary_logloss: 0.325207	valid_0's auc: 0.528828
[50]	valid_0's binary_logloss: 0.325182	valid_0's auc: 0.528896
[60]	valid_0's binary_logloss: 0.325168	valid_0's auc: 0.528898
[70]	valid_0's binary_logloss: 0.32516	valid_0's auc: 0.528916
[80]	valid_0's binary_logloss: 0.325156	valid_0's auc: 0.528905
[90]	valid_0's binary_logloss: 0.325153	valid_0's auc: 0.528885
Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.325161	valid_0's auc: 0.528919
Fold: 8


New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325588	valid_0's auc: 0.527749
[20]	valid_0's binary_logloss: 0.325357	valid_0's auc: 0.527982
[30]	valid_0's binary_logloss: 0.325254	valid_0's auc: 0.528093
[40]	valid_0's binary_logloss: 0.325205	valid_0's auc: 0.528161
[50]	valid_0's binary_logloss: 0.325179	valid_0's auc: 0.528225
[60]	valid_0's binary_logloss: 0.325166	valid_0's auc: 0.528289
[70]	valid_0's binary_logloss: 0.325158	valid_0's auc: 0.528297
[80]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.528293
[90]	valid_0's binary_logloss: 0.325152	valid_0's auc: 0.528314
[100]	valid_0's binary_logloss: 0.32515	valid_0's auc: 0.528304
[110]	valid_0's binary_logloss: 0.325148	valid_0's auc: 0.528324
[120]	valid_0's binary_logloss: 0.325147	valid_0's auc: 0.528341
[130]	valid_0's binary_logloss: 0.325147	valid_0's auc: 0.528346
[140]	valid_0's binary_logloss: 0.325146	valid_0's auc: 0.52835
[150]	valid_0's binary_logloss: 0.32514

New categorical_feature is [2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 25 rounds.
[10]	valid_0's binary_logloss: 0.325586	valid_0's auc: 0.528233
[20]	valid_0's binary_logloss: 0.325355	valid_0's auc: 0.528428
[30]	valid_0's binary_logloss: 0.325251	valid_0's auc: 0.528602
[40]	valid_0's binary_logloss: 0.325202	valid_0's auc: 0.52866
[50]	valid_0's binary_logloss: 0.325176	valid_0's auc: 0.52869
[60]	valid_0's binary_logloss: 0.325163	valid_0's auc: 0.52873
[70]	valid_0's binary_logloss: 0.325154	valid_0's auc: 0.528744
[80]	valid_0's binary_logloss: 0.325149	valid_0's auc: 0.528766
[90]	valid_0's binary_logloss: 0.325146	valid_0's auc: 0.528779
[100]	valid_0's binary_logloss: 0.325144	valid_0's auc: 0.528788
[110]	valid_0's binary_logloss: 0.325144	valid_0's auc: 0.528771
[120]	valid_0's binary_logloss: 0.325143	valid_0's auc: 0.528789
[130]	valid_0's binary_logloss: 0.325142	valid_0's auc: 0.528801
[140]	valid_0's binary_logloss: 0.325142	valid_0's auc: 0.528824
[150]	valid_0's binary_logloss: 0.325141

In [14]:
ypred = np.zeros( (200000,200) )

for feat,var in enumerate(features):
    tmp = var_to_feat(test_df[var], var_stats[var], int(var[4:]) )
    tmp[:,0] = (tmp[:,0]-var_mean[var])/var_var[var]
    for model_id in range(10):
        model = MODELS[model_id]
        ypred[:,feat] += model.predict_proba( tmp )[:,1] / 10.
ypred = np.mean( logit(ypred), axis=1 )

sub = test_df[['ID_code']]
sub['target'] = ypred
sub['target'] = sub['target'].rank() / 200000.
sub.to_csv('golden_sub.csv', index=False)
print( sub.head(10) )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


  ID_code    target
0  test_0  0.823130
1  test_1  0.883000
2  test_2  0.858360
3  test_3  0.858905
4  test_4  0.751590
5  test_5  0.064065
6  test_6  0.104270
7  test_7  0.639755
8  test_8  0.062035
9  test_9  0.163630
