In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# All four input files are keyed by the same 'ID' column, so they are read
# uniformly: raw train/test first, then the stacked train/test features.
xtrain, xtest, train_st, test_st = (
    pd.read_csv(csv_path, index_col='ID')
    for csv_path in (
        'data/train.csv.gz',
        'data/test.csv.gz',
        'data/train_stack_c.csv',
        'data/test_stack_c.csv',
    )
)
# Labels from the raw training frame, as a plain NumPy array.
target = xtrain['TARGET'].values

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [4]:
# Ten stratified folds over the training labels; this splitter is passed as
# `cv=` to every cross_val_score call in the notebook.
# NOTE(review): no `shuffle=` is passed — confirm whether `random_state`
# has any effect on the folds without it.
skf = StratifiedKFold(
    target,
    n_folds=10,
    random_state=42,
)

In [6]:
# Per-fold ROC AUC of a class-weighted ('balanced') logistic regression on the
# unscaled stacked features.
cw = cross_val_score(
    LogisticRegression(class_weight='balanced'),
    X=train_st,
    y=target,
    scoring='roc_auc',
    cv=skf,
)

In [7]:
# Average ROC AUC over the folds (class-weighted, unscaled features).
np.mean(cw)

0.84193036342854377

In [8]:
# Per-fold ROC AUC of a default (unweighted) logistic regression on the
# unscaled stacked features.
cnw = cross_val_score(
    LogisticRegression(),
    X=train_st,
    y=target,
    scoring='roc_auc',
    cv=skf,
)

In [9]:
# Average ROC AUC over the folds (unweighted, unscaled features).
np.mean(cnw)

0.84229804862641733

In [10]:
# Standardise the stacked training features. `sc` is a shared scaler object
# that later cells fit again on other frames.
# NOTE(review): the scaler is fitted on all training rows before the
# cross-validation below, so each fold's held-out rows contributed to the
# scaling statistics.
sc = StandardScaler()
sc.fit(train_st)
train_sc = sc.transform(train_st)

In [11]:
# Per-fold ROC AUC of a class-weighted ('balanced') logistic regression on the
# standardised stacked features.
cws = cross_val_score(
    LogisticRegression(class_weight='balanced'),
    X=train_sc,
    y=target,
    scoring='roc_auc',
    cv=skf,
)

In [12]:
# Average ROC AUC over the folds (class-weighted, standardised features).
np.mean(cws)

0.84186738145819062

In [13]:
# Per-fold ROC AUC of a default (unweighted) logistic regression on the
# standardised stacked features.
cnws = cross_val_score(
    LogisticRegression(),
    X=train_sc,
    y=target,
    scoring='roc_auc',
    cv=skf,
)

In [14]:
# Average ROC AUC over the folds (unweighted, standardised features).
np.mean(cnws)

0.84231972213527206

In [21]:
# NOTE(review): `tr` is only assigned further down the notebook (the cell
# marked In [17]); this cell depends on that cell having been executed first
# and raises NameError on a fresh top-to-bottom run.
# Re-fit the shared scaler on `tr` and standardise it.
sc.fit(tr)
tr_sc = sc.transform(tr)

In [22]:
# Per-fold ROC AUC of a default logistic regression on the standardised `tr`
# frame; the array of fold scores is the cell's displayed output.
cross_val_score(
    LogisticRegression(),
    X=tr_sc,
    y=target,
    scoring='roc_auc',
    cv=skf,
)

array([ 0.83957451,  0.8382048 ,  0.81839287,  0.84335259,  0.8489917 ,
        0.84539095,  0.84902787,  0.86284203,  0.84995069,  0.82994247])

In [15]:
import xgboost as xgb

In [25]:
# Booster settings handed to xgb.cv further down.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    eta=0.0202048,
    max_depth=3,
    subsample=1.0,         # every row is used for each tree
    colsample_bytree=1.0,  # every column is used for each tree
    silent=1,
    seed=0,
)

In [17]:
# Independent (deep) copy of the stacked training features, so that columns
# added to `tr` afterwards leave `train_st` unchanged.
tr = train_st.copy(deep=True)

In [18]:
# Pairwise interaction features for the training frame: for every unordered
# pair of the original stacked columns, append their sum ('@+'), difference
# ('@-') and product ('@*') to `tr`.
#
# Columns are selected by position with .iloc; the older .ix indexer used
# here previously is deprecated and no longer exists in current pandas.
# Base column names are captured once up front, so they cannot be affected by
# the columns appended inside the loop.
n_base = train_st.shape[1]
base_names = list(tr.columns[:n_base])
for i in range(n_base):
    left = tr.iloc[:, i]
    for j in range(i + 1, n_base):
        right = tr.iloc[:, j]
        first_name, second_name = base_names[i], base_names[j]
        tr[first_name + '@+' + second_name] = left + right
        tr[first_name + '@-' + second_name] = left - right
        tr[first_name + '@*' + second_name] = left * right
        tr.loc[:, tr.columns[i]+'@*'+tr.columns[j]] = tr.ix[:, i] * tr.ix[:, j]

In [19]:
# Per-row training weights: rows whose label equals 1 get weight 25, every
# other row gets weight 1.
tr_w = np.where(target == 1, 25., 1.)

In [26]:
# Bundle the expanded training features, labels and per-row weights into
# xgboost's own matrix container.
dtrain = xgb.DMatrix(
    data=tr,
    label=target,
    weight=tr_w,
)

In [27]:
# 10-fold stratified cross-validation of the booster, with up to 2500 rounds
# and early stopping after 150 rounds; per-round progress is printed and the
# result returned by xgb.cv is the cell's displayed output.
xgb.cv(
    params,
    dtrain,
    num_boost_round=2500,
    nfold=10,
    stratified=True,
    early_stopping_rounds=150,
    verbose_eval=True,
)

Will train until cv error hasn't decreased in 150 rounds.
[0]	cv-test-auc:0.8356220000000001+0.010661403500477786	cv-train-auc:0.8407762999999999+0.0014512053645159956
[1]	cv-test-auc:0.8361438999999999+0.011034106682917303	cv-train-auc:0.8412272+0.0016183987642111044
[2]	cv-test-auc:0.8368285999999999+0.011029402125228724	cv-train-auc:0.8418903999999999+0.0017296365051651863
[3]	cv-test-auc:0.8370866000000001+0.011097329986983344	cv-train-auc:0.8421294000000001+0.0016443932741287868
[4]	cv-test-auc:0.8374801999999999+0.01130866823989458	cv-train-auc:0.8424422000000001+0.0015215344097324881
[5]	cv-test-auc:0.8380411999999999+0.010986995392735894	cv-train-auc:0.8428305999999999+0.001684642466519215
[6]	cv-test-auc:0.8383584+0.011394243328979783	cv-train-auc:0.8432502+0.001238844849042855
[7]	cv-test-auc:0.8385617999999999+0.011342740989725549	cv-train-auc:0.8434303999999999+0.0012700465503279752
[8]	cv-test-auc:0.8385811999999999+0.011144853420301236	cv-train-auc:0.8435501999999999+0.00

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.835622,0.010661,0.840776,0.001451
1,0.836144,0.011034,0.841227,0.001618
2,0.836829,0.011029,0.841890,0.001730
3,0.837087,0.011097,0.842129,0.001644
4,0.837480,0.011309,0.842442,0.001522
5,0.838041,0.010987,0.842831,0.001685
6,0.838358,0.011394,0.843250,0.001239
7,0.838562,0.011343,0.843430,0.001270
8,0.838581,0.011145,0.843550,0.001308
9,0.838937,0.011372,0.843750,0.001184


In [25]:
# Independent (deep) copy of the stacked test features, so that columns added
# to `ts` afterwards leave `test_st` unchanged.
ts = test_st.copy(deep=True)

In [26]:
# Pairwise interaction features for the test frame, built the same way as for
# the training frame: for every unordered pair of the original stacked
# columns, append their sum ('@+'), difference ('@-') and product ('@*').
#
# Columns are selected by position with .iloc; the older .ix indexer used
# here previously is deprecated and no longer exists in current pandas.
# Base column names are captured once up front, so they cannot be affected by
# the columns appended inside the loop.
n_base_ts = test_st.shape[1]
base_names_ts = list(ts.columns[:n_base_ts])
for i in range(n_base_ts):
    left = ts.iloc[:, i]
    for j in range(i + 1, n_base_ts):
        right = ts.iloc[:, j]
        first_name, second_name = base_names_ts[i], base_names_ts[j]
        ts[first_name + '@+' + second_name] = left + right
        ts[first_name + '@-' + second_name] = left - right
        ts[first_name + '@*' + second_name] = left * right

In [27]:
# Fit the shared scaler on the expanded training frame only, then apply the
# same fitted scaling to both the training and the test frame.
sc.fit(tr)
tr_sc = sc.transform(tr)
ts_sc = sc.transform(ts)

In [28]:
# Train the final logistic regression on all standardised training rows.
lr = LogisticRegression().fit(tr_sc, target)
# Keep the second probability column for the test rows — presumably the
# probability of label 1; this follows the ordering of lr.classes_.
class_probs = lr.predict_proba(ts_sc)
preds = class_probs[:, 1]

In [29]:
# Write the raw (not post-processed) predictions in the sample-submission
# layout, keyed by ID.
sample = pd.read_csv('data/sample_submission.csv', index_col='ID')
# Item assignment always creates or overwrites the 'TARGET' column. Attribute
# assignment (sample.TARGET = ...) would only set a plain Python attribute if
# the column were missing, and the predictions would never reach the CSV.
sample['TARGET'] = preds
sample.to_csv('submission/xgb_lrst_nopost.csv', index_label='ID')

In [30]:
# Rule-based post-processing: force the prediction to 0 for every test row
# matching any of the hard-coded threshold rules below. `preds` is modified
# in place.
var38 = xtest['var38']
V21 = xtest['var21']

# NV is the row-wise sum of six columns, accumulated left to right.
NV = xtest['num_var33']
for extra_col in ('saldo_medio_var33_ult3',
                  'saldo_medio_var44_hace2',
                  'saldo_medio_var44_hace3',
                  'saldo_medio_var33_ult1',
                  'saldo_medio_var44_ult1'):
    NV = NV + xtest[extra_col]

zero_rules = (
    xtest['var15'] < 23,
    xtest['saldo_medio_var5_hace2'] > 160000,
    xtest['saldo_var33'] > 0,
    var38 > 3988596,
    NV > 0,
    V21 > 7500,
)
for rule in zero_rules:
    # .values yields a plain boolean array, so the mask is applied by position.
    preds[rule.values] = 0

# Names of `xtest` columns used by the loop that follows: any test row with a
# non-zero value in one of these columns has its prediction forced to 0.
fbinfeats = ['ind_var6_0', 'ind_var6', 'ind_var13_medio_0', 'ind_var13_medio',
       'ind_var18_0', 'ind_var18', 'ind_var20_0', 'ind_var20',
       'ind_var29_0', 'ind_var29', 'ind_var33_0', 'ind_var33',
       'ind_var34_0', 'ind_var34', 'num_var6_0', 'num_var6',
       'num_var13_medio_0', 'num_var13_medio', 'num_var18_0', 'num_var18',
       'num_var20_0', 'num_var20', 'num_op_var40_hace3', 'num_var29_0',
       'num_var29', 'num_var33_0', 'num_var33', 'num_var34_0', 'num_var34',
       'saldo_var6', 'saldo_var13_medio', 'saldo_var18', 'saldo_var20',
       'saldo_var29', 'saldo_var33', 'saldo_var34',
       'delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3',
       'delta_imp_aport_var33_1y3', 'delta_imp_reemb_var33_1y3',
       'delta_imp_trasp_var17_in_1y3', 'delta_imp_trasp_var17_out_1y3',
       'delta_imp_trasp_var33_in_1y3', 'delta_imp_trasp_var33_out_1y3',
       'delta_imp_venta_var44_1y3', 'delta_num_aport_var33_1y3',
       'delta_num_reemb_var33_1y3', 'delta_num_trasp_var17_in_1y3',
       'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_in_1y3',
       'delta_num_trasp_var33_out_1y3', 'delta_num_venta_var44_1y3',
       'imp_amort_var18_ult1', 'imp_amort_var34_ult1',
       'imp_aport_var17_hace3', 'imp_aport_var33_hace3',
       'imp_aport_var33_ult1', 'imp_var7_emit_ult1',
       'imp_compra_var44_hace3', 'imp_reemb_var17_hace3',
       'imp_reemb_var33_ult1', 'imp_trasp_var17_in_hace3',
       'imp_trasp_var17_in_ult1', 'imp_trasp_var17_out_ult1',
       'imp_trasp_var33_in_hace3', 'imp_trasp_var33_in_ult1',
       'imp_trasp_var33_out_ult1', 'imp_venta_var44_hace3',
       'imp_venta_var44_ult1', 'ind_var7_emit_ult1',
       'num_aport_var17_hace3', 'num_aport_var33_hace3',
       'num_aport_var33_ult1', 'num_var7_emit_ult1',
       'num_compra_var44_hace3', 'num_meses_var13_largo_ult3',
       'num_meses_var13_medio_ult3', 'num_meses_var29_ult3',
       'num_meses_var33_ult3', 'num_reemb_var17_hace3',
       'num_reemb_var33_ult1', 'num_trasp_var17_in_hace3',
       'num_trasp_var17_in_ult1', 'num_trasp_var17_out_ult1',
       'num_trasp_var33_in_hace3', 'num_trasp_var33_in_ult1',
       'num_trasp_var33_out_ult1', 'num_venta_var44_hace3',
       'num_venta_var44_ult1', 'saldo_medio_var13_largo_hace2',
       'saldo_medio_var13_largo_hace3', 'saldo_medio_var13_largo_ult1',
       'saldo_medio_var13_largo_ult3', 'saldo_medio_var13_medio_hace2',
       'saldo_medio_var13_medio_ult1', 'saldo_medio_var13_medio_ult3',
       'saldo_medio_var17_hace2', 'saldo_medio_var17_hace3',
       'saldo_medio_var29_hace2', 'saldo_medio_var29_hace3',
       'saldo_medio_var29_ult1', 'saldo_medio_var29_ult3',
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3']

# Force the prediction to 0 for every test row that has a non-zero value in
# at least one of the `fbinfeats` columns (`preds` is modified in place).
any_flag_set = (xtest[fbinfeats] != 0).any(axis=1).values
preds[any_flag_set] = 0



In [31]:
# Write the post-processed predictions in the sample-submission layout,
# keyed by ID.
sample = pd.read_csv('data/sample_submission.csv', index_col='ID')
# Item assignment always creates or overwrites the 'TARGET' column. Attribute
# assignment (sample.TARGET = ...) would only set a plain Python attribute if
# the column were missing, and the predictions would never reach the CSV.
sample['TARGET'] = preds
sample.to_csv('submission/xgb_lrst.csv', index_label='ID')