In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost
from sklearn.cross_validation import train_test_split, KFold
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import log_loss
%matplotlib inline

import lasagne
import theano
import theano.tensor as T
from lasagne.layers import *

In [2]:
# Both competition files are semicolon-separated; `data` holds the labelled
# training rows and `Xtest` the rows to predict.
data, Xtest = (pd.read_csv(path, sep=';') for path in ('train.csv', 'test.csv'))

In [3]:
# Test rows are addressed by their 'id'; for training, 'cardio' is the target
# and neither it nor 'id' is kept as a feature.
Xtest = Xtest.set_index('id')
y = data['cardio']
X = data.drop(['cardio', 'id'], axis=1)

In [5]:
# Divide age by 365 and truncate to an integer in both frames
# (presumably converting days to whole years -- confirm against the data description).
for frame in (X, Xtest):
    frame['age'] = frame['age'].div(365).map(int)

In [6]:
# Flag rows whose raw blood-pressure readings fall outside the accepted window
# (ap_hi outside [60, 300] or ap_lo outside [40, 500]) before any cleaning is applied.
for frame in (X, Xtest):
    frame['strange'] = (
        frame['ap_hi'].lt(60)
        | frame['ap_hi'].gt(300)
        | frame['ap_lo'].lt(40)
        | frame['ap_lo'].gt(500)
    )

In [7]:
# Hand-curated corrections for mistyped blood-pressure readings.
#
# replace1 maps a raw (ap_hi, ap_lo) pair to the corrected pair.
# replace2 maps a raw ap_hi value to its corrected value.
# replace3 maps a raw ap_lo value to its corrected value.
#
# Duplicate keys that used to appear in these literals have been removed; in a
# dict literal the last occurrence wins, so the effective mappings are unchanged.
replace1 = {
    (1, 2088): (120, 88),
    (1, 1088): (110, 88),
    (10, 160): (100, 60),
    (11, 120): (120, 80),
    (12, 0): (120, 80),
    (12, 140): (120, 80),
    # (13, 0) was listed twice, as (130, 90) and later as (130, 80); the later
    # value is the one that was actually in effect and is kept here.
    (13, 0): (130, 80),
    (13, 585): (135, 85),
    (16, 10): (160, 100),
    (20, 170): (120, 70),
    (24, 20): (120, 80),
    (309, 0): (130, 90),
    (806, 0): (80, 60),
    (906, 0): (90, 60),
    (130, 0): (130, 90),
    (138, 0): (130, 80),
    (149, 0): (140, 90),
    (140, 0): (140, 90),
    (907, 0): (90, 70),
    (108, 0): (100, 80),
    (121, 0): (120, 80),
    (120, 0): (120, 80),
    (117, 0): (120, 80),
    (70, 15): (120, 80),
    (1, 1099): (110, 99),
    (1, 30): (120, 80),
    (10, 0): (100, 60),
    (11, 570): (115, 70),
    (17, 12): (170, 120),
    (400, 60): (120, 80),
    (509, 0): (150, 90),
    (16020, 70): (120, 80),
    (180, 0): (100, 80),
}
replace2 = {
    -150: 150,
    -140: 140,
    -120: 120,
    -115: 115,
    -100: 100,
    10: 100,
    11: 110,
    12: 120,
    13: 130,
    14: 140,
    15: 150,
    16: 160,
    17: 170,
    19: 190,
    20: 120,
    401: 140,
    701: 170,
    902: 90,
    906: 90,
    1110: 110,
    1130: 130,
    1202: 120,
    1205: 120,
    1300: 130,
    1400: 140,
    1409: 140,
    1420: 120,
    1500: 150,
    1620: 120,
    2000: 200,
    11020: 110,
    11500: 115,
    13010: 130,
    14020: 140,
    16020: 160,
    -130: 130,
    -12: 120,
    907: 90,
    957: 95,
    1407: 140,
    1502: 150,
    1608: 160,
    12008: 120,
    14900: 140,
}
replace3 = {
    -70: 70,
    1: 80,
    6: 60,
    7: 70,
    8: 80,
    9: 90,
    10: 100,
    20: 80,
    30: 80,
    190: 90,
    602: 60,
    700: 70,
    708: 70,
    709: 70,
    710: 70,
    800: 80,
    801: 80,
    802: 80,
    809: 80,
    810: 80,
    820: 80,
    850: 80,
    870: 80,
    880: 80,
    900: 90,
    901: 90,
    902: 90,
    910: 90,
    1000: 100,
    1001: 100,
    1002: 100,
    1003: 100,
    1007: 100,
    1004: 100,
    1099: 100,
    1066: 100,
    1009: 100,
    1008: 100,
    1011: 100,
    1022: 100,
    1033: 100,
    1044: 100,
    1077: 100,
    1088: 88,
    1100: 110,
    1101: 110,
    1110: 110,
    1111: 110,
    1120: 120,
    1125: 110,
    1130: 110,
    1139: 140,
    1140: 140,
    1177: 117,
    1200: 120,
    1211: 121,
    1300: 130,
    1400: 140,
    1900: 90,
    5700: 57,
    6800: 68,
    7099: 100,
    7100: 100,
    8000: 80,
    8044: 80,
    8077: 80,
    8079: 80,
    8099: 80,
    8100: 100,
    8200: 80,
    8500: 80,
    9011: 90,
    9100: 100,
    9800: 80,
    10000: 100,
    11000: 110,
    -90: 90,
    12: 120,
    19: 80,
    701: 70,
    808: 80,
    4100: 100,
    4700: 70,
    8022: 80,
}

In [8]:
def _fix_pressure_pairs(frame, pair_fixes):
    """Rewrite known-bad (ap_hi, ap_lo) pairs of `frame` in place.

    Every row whose (ap_hi, ap_lo) pair is a key of `pair_fixes` gets both
    columns replaced by the mapped pair; all other rows are left as they are.

    The columns are reassigned as a whole instead of writing through
    `Series.values`, which only works while pandas hands out a writable view
    of the underlying data (not the case under Copy-on-Write).
    """
    corrected = [
        pair_fixes.get(pair, pair)
        for pair in zip(frame['ap_hi'].tolist(), frame['ap_lo'].tolist())
    ]
    if not corrected:
        # Empty frame: nothing to fix, and keep the column dtypes untouched.
        return
    frame['ap_hi'] = [hi for hi, _ in corrected]
    frame['ap_lo'] = [lo for _, lo in corrected]


# Apply the pairwise corrections to both the training and the test features.
for frame in (X, Xtest):
    _fix_pressure_pairs(frame, replace1)

In [9]:
# Column-wise corrections: replace2 fixes ap_hi values, replace3 fixes ap_lo values.
for frame in (X, Xtest):
    frame['ap_hi'] = frame['ap_hi'].replace(replace2)
    frame['ap_lo'] = frame['ap_lo'].replace(replace3)

In [10]:
# True where the (cleaned) systolic reading is a multiple of ten.
for frame in (X, Xtest):
    frame['weird_feature'] = frame['ap_hi'].mod(10).eq(0)

In [11]:
# Where ap_hi is lower than ap_lo the two readings are swapped, so that
# ap_hi >= ap_lo holds for every training row afterwards.
index = X['ap_hi'].lt(X['ap_lo'])
tmp = X.loc[index, 'ap_hi']  # boolean selection yields a copy, safe to reuse below
X.loc[index, 'ap_hi'] = X.loc[index, 'ap_lo']
X.loc[index, 'ap_lo'] = tmp

In [12]:
# Same swap as for the training frame: rows with ap_hi below ap_lo get the
# two readings exchanged.
index = Xtest['ap_hi'].lt(Xtest['ap_lo'])
tmp = Xtest.loc[index, 'ap_hi']  # boolean selection yields a copy, safe to reuse below
Xtest.loc[index, 'ap_hi'] = Xtest.loc[index, 'ap_lo']
Xtest.loc[index, 'ap_lo'] = tmp

In [13]:
def create_linear(X):
    """Build the feature matrix used by the neural network.

    'cholesterol', 'gluc', 'ap_hi' and 'age' are one-hot encoded with
    `pd.get_dummies`; the original numeric 'age' column is appended again so
    it is available both as dummies and as a single numeric feature. The
    combined frame is passed through `minmax_scale` and that result is returned.
    """
    categorical = ['cholesterol', 'gluc', 'ap_hi', 'age']
    raw_age = X[['age']]
    encoded = pd.get_dummies(X, columns=categorical)
    combined = pd.concat([encoded, raw_age], axis=1)
    return minmax_scale(combined)

In [14]:
# In the test set these flags may hold the literal string 'None'; substitute a
# fixed value (0 for alco and smoke, 1 for active) and cast to int64.
for column, fill_value in (('alco', 0), ('smoke', 0), ('active', 1)):
    Xtest[column] = Xtest[column].replace('None', fill_value).astype('int64')

In [16]:
# Encode and scale train and test together so both share the same dummy
# columns and scaling, then cut the result back at the train/test boundary.
X_tmp = create_linear(pd.concat([X, Xtest]))
train_rows = len(X)
X_linear, Xtest_linear = X_tmp[:train_rows], X_tmp[train_rows:]

In [17]:
def iterate_minibatches(*arrays, **kwargs):
    """Yield aligned mini-batches from one or more equally indexed arrays.

    Keyword arguments:
        batchsize -- number of rows per batch (default 100).
        shuffle   -- when true (default), rows are visited in a random order
                     drawn with `np.random.shuffle`; otherwise in input order.

    Each yielded item is a list holding one slice per input array, all taken
    with the same row selection. Only full batches are produced: rows left
    over after the last complete batch are not yielded.
    """
    step = kwargs.get("batchsize", 100)
    randomize = kwargs.get("shuffle", True)
    total = len(arrays[0])

    if randomize:
        order = np.arange(total)
        np.random.shuffle(order)

    for lo in range(0, total - step + 1, step):
        hi = lo + step
        picker = order[lo:hi] if randomize else slice(lo, hi)
        yield [array[picker] for array in arrays]

def proba_from_nn(Xtrain, Xtest, ytrain, num_epochs=100, batch_size=50, hidden_units=5):
    """Train a small feed-forward network and return its class-1 probabilities.

    The network is: input -> dense sigmoid layer (`hidden_units` units) ->
    dense softmax layer (2 units). It is trained with Adamax on the mean
    categorical cross-entropy, using mini-batches from `iterate_minibatches`.

    Parameters:
        Xtrain       -- 2-D training feature matrix (its second dimension sets
                        the input width).
        Xtest        -- feature matrix to predict for, same width as Xtrain.
        ytrain       -- training labels; cast to int32 class indices.
        num_epochs   -- number of passes over the training data (default 100).
        batch_size   -- mini-batch size (default 50).
        hidden_units -- width of the hidden layer (default 5).

    Returns:
        1-D array with the second softmax output (class 1) for each row of Xtest.
    """
    ytrain = np.array(ytrain).astype('int32')
    input_X = T.matrix()
    target_y = T.vector(dtype='int32')
    input_shape = (None, Xtrain.shape[1])

    nn = InputLayer(shape=input_shape, input_var=input_X)
    nn = DenseLayer(nn, num_units=hidden_units, nonlinearity=lasagne.nonlinearities.sigmoid)
    nn = DenseLayer(nn, num_units=2, nonlinearity=lasagne.nonlinearities.softmax)

    y_predicted = get_output(nn)
    all_weights = get_all_params(nn, trainable=True)
    loss = lasagne.objectives.categorical_crossentropy(y_predicted, target_y).mean()
    updates = lasagne.updates.adamax(loss, all_weights)
    # Only the training step and the predictor are compiled; a separate
    # loss-evaluation function was never called and is not built.
    train_fun = theano.function([input_X, target_y], loss, updates=updates)
    pred = theano.function([input_X], y_predicted)

    for _ in range(num_epochs):
        for inputs, targets in iterate_minibatches(Xtrain, ytrain, batchsize=batch_size):
            train_fun(inputs, targets)
    return pred(Xtest)[:, 1]

In [19]:
# Hyper-parameter grid for the XGBoost ensemble. Every combination of the
# values below is cross-validated once (1 * 3 * 3 * 2 * 3 = 54 configurations).
param_eta = [0.01]
param_max_depth = [4, 5, 6]
param_subsample = [0.7, 0.8, 0.9]
param_colsample_bytree = [0.8, 0.9]
param_seed = [1, 2, 3]

In [20]:
%%time
# Cross-validate every grid configuration and record, per configuration, the
# number of boosting rounds that xgboost.cv reports (length of its result table
# when run with early stopping). `all_params` ends up with one dict per
# configuration: the booster settings plus a 'boost_round' entry.
all_params = []
# Settings shared by every configuration; this dict is never mutated, so no
# value from one configuration can leak into the next one.
params = {'objective':'binary:logistic',
         'booster':'gbtree',
         'nthread':8,
         'eval_metric':'logloss'}
xgall = xgboost.DMatrix(X, label=y)
# Same visiting order as five nested loops: eta outermost, seed innermost.
grid = [(eta, max_depth, subsample, colsample_bytree, seed)
        for eta in param_eta
        for max_depth in param_max_depth
        for subsample in param_subsample
        for colsample_bytree in param_colsample_bytree
        for seed in param_seed]
for i, (eta, max_depth, subsample, colsample_bytree, seed) in enumerate(grid):
    cv_params = dict(params,
                     seed=seed,
                     eta=eta,
                     max_depth=max_depth,
                     subsample=subsample,
                     colsample_bytree=colsample_bytree)
    dataframe = xgboost.cv(params=list(cv_params.items()),
                          early_stopping_rounds=50,
                          dtrain=xgall,
                          nfold=10,
                          num_boost_round=10000)
    num_boost_round = len(dataframe)
    # 'boost_round' is bookkeeping for the final fit; it is stored only in the
    # recorded copy and is not passed to xgboost.cv.
    all_params.append(dict(cv_params, boost_round=num_boost_round))
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
CPU times: user 8h 41min 47s, sys: 52min 15s, total: 9h 34min 2s
Wall time: 2h 4min 51s


In [21]:
# Each call builds and trains a fresh network; the class-1 probabilities of
# 20 such runs are averaged element-wise.
p1_test = np.mean(
    [proba_from_nn(X_linear, Xtest_linear, y) for _ in range(20)],
    axis=0,
)

In [22]:
%%time
# Refit one booster per recorded configuration on the full training set and
# collect its predictions for the test set.
proba_test = []
# The matrices do not depend on the configuration, so they are built once.
xgtrain = xgboost.DMatrix(X, label=y)
xgtest = xgboost.DMatrix(Xtest)
for i, params in enumerate(all_params):
    # 'boost_round' is the round count found by cross-validation, not a booster
    # parameter: use it for num_boost_round and keep it out of the params list.
    train_params = {key: value for key, value in params.items() if key != 'boost_round'}
    model = xgboost.train(params=list(train_params.items()),
                         dtrain=xgtrain,
                         num_boost_round=params['boost_round'])
    proba_test.append(model.predict(xgtest))
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
CPU times: user 48min 29s, sys: 4min 13s, total: 52min 43s
Wall time: 11min 13s


In [23]:
# Element-wise average of the predictions of all fitted XGBoost models.
p2_test = np.asarray(proba_test).mean(axis=0)

In [24]:
# Blend the two models with weights 4:1, normalised to sum to one
# (0.8 for the network average p1_test, 0.2 for the XGBoost average p2_test).
a = np.array([4, 1])
a = a / a.sum()
blended = np.dot(a, [p1_test, p2_test])
p = pd.DataFrame(blended)

In [25]:
# Write the blended probabilities with no header row and no index column.
# `header` and `index` are boolean flags, so pass False explicitly instead of
# relying on None being falsy.
p.to_csv('solution.csv', header=False, index=False)