In [1]:
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM



In [2]:
df_train = pd.read_csv('new_train.csv')

In [3]:
names = df_train.columns.values.tolist()[:-1]

In [4]:
df_test = pd.read_csv('test.csv', names=names)

In [5]:
# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

In [7]:
# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1,len(c)):
        if np.array_equal(v,df_train[c[j]].values):
            remove.append(c[j])
df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

In [9]:
y_train = df_train['TARGET'].values
X_train = df_train.drop(['TARGET'], axis=1).values

X_test = df_test.values

# length of dataset
len_train = len(X_train)
len_test  = len(X_test)

In [10]:
print (len_train, len_test)

60020 16000


In [12]:
# classifier
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5,
                        n_estimators=350, learning_rate=0.03, 
                        nthread=-1, subsample=0.95, colsample_bytree=0.85, seed=4242)

X_fit, X_eval, y_fit, y_eval= train_test_split(X_train, y_train, test_size=0.3)

In [13]:
# fitting
clf.fit(X_train, y_train, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])

[0]	validation_0-auc:0.818978
Will train until validation_0-auc hasn't improved in 20 rounds.
[1]	validation_0-auc:0.828184
[2]	validation_0-auc:0.833406
[3]	validation_0-auc:0.833371
[4]	validation_0-auc:0.834036
[5]	validation_0-auc:0.833722
[6]	validation_0-auc:0.834198
[7]	validation_0-auc:0.839992
[8]	validation_0-auc:0.841152
[9]	validation_0-auc:0.841298
[10]	validation_0-auc:0.841235
[11]	validation_0-auc:0.84277
[12]	validation_0-auc:0.84268
[13]	validation_0-auc:0.84435
[14]	validation_0-auc:0.843755
[15]	validation_0-auc:0.845637
[16]	validation_0-auc:0.846405
[17]	validation_0-auc:0.847229
[18]	validation_0-auc:0.847426
[19]	validation_0-auc:0.847847
[20]	validation_0-auc:0.847999
[21]	validation_0-auc:0.848085
[22]	validation_0-auc:0.849025
[23]	validation_0-auc:0.84947
[24]	validation_0-auc:0.849518
[25]	validation_0-auc:0.84974
[26]	validation_0-auc:0.84975
[27]	validation_0-auc:0.849513
[28]	validation_0-auc:0.849626
[29]	validation_0-auc:0.850056
[30]	validation_0-auc:

[259]	validation_0-auc:0.884311
[260]	validation_0-auc:0.884328
[261]	validation_0-auc:0.884345
[262]	validation_0-auc:0.884373
[263]	validation_0-auc:0.884464
[264]	validation_0-auc:0.884507
[265]	validation_0-auc:0.884692
[266]	validation_0-auc:0.884715
[267]	validation_0-auc:0.884762
[268]	validation_0-auc:0.884841
[269]	validation_0-auc:0.884873
[270]	validation_0-auc:0.884894
[271]	validation_0-auc:0.884902
[272]	validation_0-auc:0.885075
[273]	validation_0-auc:0.885256
[274]	validation_0-auc:0.885318
[275]	validation_0-auc:0.885344
[276]	validation_0-auc:0.885359
[277]	validation_0-auc:0.885494
[278]	validation_0-auc:0.885692
[279]	validation_0-auc:0.885715
[280]	validation_0-auc:0.885783
[281]	validation_0-auc:0.885861
[282]	validation_0-auc:0.886056
[283]	validation_0-auc:0.886243
[284]	validation_0-auc:0.88638
[285]	validation_0-auc:0.886416
[286]	validation_0-auc:0.886511
[287]	validation_0-auc:0.886646
[288]	validation_0-auc:0.886725
[289]	validation_0-auc:0.886732
[290]	val

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=350, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.95)

In [14]:
print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))

Overall AUC: 0.888682817864


In [15]:
# predicting
y_pred= clf.predict_proba(X_test)[:,1]

In [20]:
submission = pd.DataFrame(data=y_pred)
submission.to_csv("submission.csv", index=False, header=None)

In [21]:
y_pred

array([ 0.01587451,  0.00440938,  0.02592113, ...,  0.02741839,
        0.00851961,  0.03069277], dtype=float32)

In [22]:
y_pred.shape

(16000,)

In [23]:
submission

Unnamed: 0,0
0,0.015875
1,0.004409
2,0.025921
3,0.105370
4,0.084023
5,0.052264
6,0.002626
7,0.045652
8,0.005847
9,0.008421
