In [1]:
! free

              total        used        free      shared  buff/cache   available
Mem:       16432484      688724    15472584        5360      271176    15417664
Swap:             0           0           0


In [2]:
from keras.layers import Dense, Dropout, Embedding, Flatten, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from time import time
import datetime
from keras.models import Model
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from util import Gini, interaction_features
from itertools import combinations
from util import proj_num_on_cat
from scipy import sparse
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [3]:
cv_only = True
save_cv = True

NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

train = pd.read_csv("/input/data/train.csv")
train_label = train['target']
train_id = train['id']
del train['target'], train['id']

test = pd.read_csv("/input/data/test.csv")
test_id = test['id']
del test['id']

In [4]:
cat_fea = [x for x in list(train) if 'cat' in x]
bin_fea = [x for x in list(train) if 'bin' in x]

train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)

In [5]:
# include interactions
for e, (x, y) in enumerate(combinations(['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01'], 2)):
    train, test = interaction_features(train, test, x, y, e)

num_features = [c for c in list(train) if ('cat' not in c and 'calc' not in c)]
num_features.append('missing')
inter_fea = [x for x in list(train) if 'inter' in x]

feature_names = list(train)
ind_features = [c for c in feature_names if 'ind' in c]
count = 0
for c in ind_features:
    if count == 0:
        train['new_ind'] = train[c].astype(str)
        count += 1
    else:
        train['new_ind'] += '_' + train[c].astype(str)

ind_features = [c for c in feature_names if 'ind' in c]
count = 0
for c in ind_features:
    if count == 0:
        test['new_ind'] = test[c].astype(str)
        count += 1
    else:
        test['new_ind'] += '_' + test[c].astype(str)

reg_features = [c for c in feature_names if 'reg' in c]
count = 0
for c in reg_features:
    if count == 0:
        train['new_reg'] = train[c].astype(str)
        count += 1
    else:
        train['new_reg'] += '_' + train[c].astype(str)

reg_features = [c for c in feature_names if 'reg' in c]
count = 0
for c in reg_features:
    if count == 0:
        test['new_reg'] = test[c].astype(str)
        count += 1
    else:
        test['new_reg'] += '_' + test[c].astype(str)

car_features = [c for c in feature_names if 'car' in c]
count = 0
for c in car_features:
    if count == 0:
        train['new_car'] = train[c].astype(str)
        count += 1
    else:
        train['new_car'] += '_' + train[c].astype(str)

car_features = [c for c in feature_names if 'car' in c]
count = 0
for c in car_features:
    if count == 0:
        test['new_car'] = test[c].astype(str)
        count += 1
    else:
        test['new_car'] += '_' + test[c].astype(str)


In [6]:
train_cat = train[cat_fea]
train_num = train[[x for x in list(train) if x in num_features]]
test_cat = test[cat_fea]
test_num = test[[x for x in list(train) if x in num_features]]

max_cat_values = []
for c in cat_fea:
    le = LabelEncoder()
    x = le.fit_transform(pd.concat([train_cat, test_cat])[c])
    train_cat[c] = le.transform(train_cat[c])
    test_cat[c] = le.transform(test_cat[c])
    max_cat_values.append(np.max(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# xgboost prediction
train_fea0, test_fea0 = pickle.load(open("/input/data/fea0.pk"))

cat_count_features = []
for c in cat_fea + ['new_ind','new_reg','new_car']:
    d = pd.concat([train[c],test[c]]).value_counts().to_dict()
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)


print(train_num.dtypes)
train_list = [train_num.replace([np.inf, -np.inf, np.nan], 0), train[cat_count_features], train_fea0]
test_list = [test_num.replace([np.inf, -np.inf, np.nan], 0), test[cat_count_features], test_fea0]

del train_num,test_num
del train_fea0,test_fea0

ps_ind_01          int64
ps_ind_03          int64
ps_ind_06_bin      int64
ps_ind_07_bin      int64
ps_ind_08_bin      int64
ps_ind_09_bin      int64
ps_ind_10_bin      int64
ps_ind_11_bin      int64
ps_ind_12_bin      int64
ps_ind_13_bin      int64
ps_ind_14          int64
ps_ind_15          int64
ps_ind_16_bin      int64
ps_ind_17_bin      int64
ps_ind_18_bin      int64
ps_reg_01        float64
ps_reg_02        float64
ps_reg_03        float64
ps_car_11          int64
ps_car_12        float64
ps_car_13        float64
ps_car_14        float64
ps_car_15        float64
missing          float64
inter_0*         float64
inter_0/         float64
inter_1*         float64
inter_1/         float64
inter_2*         float64
inter_2/         float64
inter_3*         float64
inter_3/         float64
inter_4*         float64
inter_4/         float64
inter_5*         float64
inter_5/         float64
inter_6*           int64
inter_6/         float64
inter_7*         float64
inter_7/         float64


In [8]:
#feature aggregation
for t in ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01']:
    for g in ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01', 'ps_ind_05_cat']:
        if t != g:
            s_train, s_test = proj_num_on_cat(train, test, target_column=t, group_column=g)
            train_list.append(s_train)
            test_list.append(s_test)
            del s_train,s_test

((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))
((595212, 6), (892816, 6))


In [9]:
! free

              total        used        free      shared  buff/cache   available
Mem:       16432484     7255776     7057624        5380     2119084     8841504
Swap:             0           0           0


In [13]:
sparse.hstack(train_list)

<595212x325 sparse matrix of type '<type 'numpy.float64'>'
	with 170600225 stored elements in COOrdinate format>

X = sparse.hstack(train_list).tocsr()
X_test = sparse.hstack(test_list).tocsr()

all_data = np.vstack([X.toarray(), X_test.toarray()])
scaler = StandardScaler()
scaler.fit(all_data)
X = scaler.transform(X.toarray())
X_test = scaler.transform(X_test.toarray())
print(X.shape, X_test.shape)