In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
#from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.ensemble import ExtraTreesClassifier


In [7]:
df_train = pd.read_csv("Data/train.csv")
df_test = pd.read_csv("Data/test.csv")

print(df_test.shape)
print(df_train.shape)

(75818, 370)
(76020, 371)


In [8]:
# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

print(df_test.shape)
print(df_train.shape)

(75818, 336)
(76020, 337)


In [9]:
df_train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,39205.17,0
1,3,2,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,49278.03,0
2,4,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,67333.77,0
3,8,2,37,0,195,195,0,0,0,0,...,0,0,0,0,0,0,0,0,64007.97,0
4,10,2,39,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,117310.979016,0


In [10]:
# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1,len(c)):
        if np.array_equal(v,df_train[c[j]].values):
            remove.append(c[j])

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

y_train = df_train['TARGET'].values
X_train = df_train.drop(['ID','TARGET'], axis=1).values

id_test = df_test['ID']
X_test = df_test.drop(['ID'], axis=1).values

print(df_test.shape)
print(df_train.shape)

(75818, 307)
(76020, 308)


In [11]:
pol_feat = PolynomialFeatures(degree=2)
X_train_all = pol_feat.fit_transform(X_train)

print (X_train_all.shape)


(76020, 47278)


In [12]:
## Feature selection

clf = ExtraTreesClassifier()
clf = clf.fit(X_train_all, y_train)
clf.feature_importances_  

print ("ExtraTrees done")

ExtraTrees done


In [13]:
feature_importance = clf.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]

print (sorted_idx[0:25])
feature_importance [sorted_idx[0:1000]]
feature_selected = sorted_idx[0:1000]

X_train = np.copy(X_train_all[:,feature_selected])

print (X_train.shape)

[30624 14637   917 27774 47277  6807   306   612 19547 35496 44649 18116
 17387 30441 29886     2   662   720   764   613   845   735   633   308
   682]
(76020, 1000)


In [14]:
del X_train_all
print (X_train.shape)

(76020, 1000)


In [15]:
pol_feat = PolynomialFeatures(2)
X_test_all = pol_feat.fit_transform(X_test)

X_test =  np.copy(X_test_all[:,feature_selected])
del X_test_all

print(X_test.shape)

(75818, 1000)


In [28]:
# length of dataset
len_train = len(X_train)
len_test  = len(X_test)

In [29]:
# classifier
clf = xgb.XGBClassifier(missing=np.nan, max_depth=3, n_estimators=350, learning_rate=0.03, nthread=8, subsample=0.95, colsample_bytree=0.85, seed=4242)

X_fit, X_eval_valid, y_fit, y_eval_valid = train_test_split(X_train, y_train, test_size=0.5, random_state = 84)

X_eval, X_valid, y_eval, y_valid = train_test_split(X_eval_valid, y_eval_valid, test_size=0.5, random_state = 84)

# fitting
clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval_valid, y_eval_valid)])

print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))

('Overall AUC:', 0.85354765758776474)


In [30]:
# predicting
y_pred= clf.predict_proba(X_test)[:,1]

submission = pd.DataFrame({"ID":id_test, "TARGET":y_pred})
submission.to_csv("predictions/submission_xgb.csv", index=False)

print('Completed!')

Completed!


In [None]:
for c_max_depth in range (2,15,1):
    clf = xgb.XGBClassifier(missing=np.nan, max_depth = c_max_depth, n_estimators=350, learning_rate=0.03, nthread=-1, subsample=0.95, colsample_bytree=0.85, seed=4242)
    clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])
    
    print(c_max_depth,'  Overall AUC:', roc_auc_score(y_fit, clf.predict_proba(X_fit)[:,1]), roc_auc_score(y_eval, clf.predict_proba(X_eval)[:,1]))

(2, '  Overall AUC:', 0.79680410423403769, 0.78919688835323765)
(3, '  Overall AUC:', 0.8731979784119428, 0.83466936528294255)
(4, '  Overall AUC:', 0.88618379458910945, 0.83434158202853448)
(5, '  Overall AUC:', 0.9036899063426691, 0.83337632289557462)
(6, '  Overall AUC:', 0.92457437967621448, 0.8314306683160263)

In [None]:
# C_n_estimators_range = [0.65,0.7,0.73,0.74,0.75,0.76,0.77,0.8,0.85,0.9,0.95]
C_n_estimators_range = [0.8,0.81,0.82,0.83,0.84,0.85,0.86,0.87,0.88,0.9,0.95]

for c_n_estim  in C_n_estimators_range:
    clf = xgb.XGBClassifier(missing=np.nan, max_depth = 3, n_estimators=350, learning_rate=0.03, nthread=-1, subsample=0.75, colsample_bytree=0.87, seed=4242)
    clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])
    
    print(c_n_estim,'  Overall AUC:', roc_auc_score(y_fit, clf.predict_proba(X_fit)[:,1]), roc_auc_score(y_eval, clf.predict_proba(X_eval)[:,1]))

In [22]:
C_feat_numb = [30,40,50,70,80,90,100,150,200,250,300,350,400,450,500,550,600,650]

for c_feat  in C_feat_numb:
    clf = xgb.XGBClassifier(missing=np.nan, max_depth = 3, n_estimators=1550, learning_rate=0.03, nthread=-1, subsample=0.75, colsample_bytree=0.87, seed=4242)
    clf.fit(X_fit[:,1:c_feat], y_fit, early_stopping_rounds=50, eval_metric="auc", eval_set=[(X_eval[:,1:c_feat], y_eval)])
    
    print(c_feat,'  Overall AUC:', roc_auc_score(y_fit, clf.predict_proba(X_fit[:,1:c_feat])[:,1]), roc_auc_score(y_eval, clf.predict_proba(X_eval[:,1:c_feat])[:,1]))

(30, '  Overall AUC:', 0.83780763645115874, 0.80242453106970923)
(40, '  Overall AUC:', 0.87374266972663617, 0.82438535260024359)
(50, '  Overall AUC:', 0.87230451275021303, 0.83224438194747685)
(70, '  Overall AUC:', 0.86935837368694813, 0.83105893504252826)
(80, '  Overall AUC:', 0.86862705385409744, 0.83154300529428138)
(90, '  Overall AUC:', 0.87317910576131741, 0.83021243214372364)
(100, '  Overall AUC:', 0.87557587693730488, 0.83199530752393691)
(150, '  Overall AUC:', 0.87425042905705008, 0.83046683163181712)
(200, '  Overall AUC:', 0.87050891220722049, 0.83076040317012034)
(250, '  Overall AUC:', 0.87923877930808503, 0.83092631905131575)
(300, '  Overall AUC:', 0.87451042246289801, 0.8316189057006933)
(350, '  Overall AUC:', 0.87015052593234354, 0.83079581849670658)
(400, '  Overall AUC:', 0.87463323332553711, 0.83317090670676552)
(450, '  Overall AUC:', 0.87224474319507395, 0.8313149758176418)
(500, '  Overall AUC:', 0.87242049359874263, 0.83383234537580875)
(550, '  Overall A

In [23]:
clf = xgb.XGBClassifier(missing=np.nan, max_depth = 3, n_estimators=350, learning_rate=0.03, nthread=-1, subsample=0.75, colsample_bytree=0.87, seed=4242)
clf.fit(X_fit[:,1:650], y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval[:,1:650], y_eval)])
  

# predicting
y_pred= clf.predict_proba(X_test[:,1:650])[:,1]

submission = pd.DataFrame({"ID":id_test, "TARGET":y_pred})
submission.to_csv("predictions/submission_xgb.csv", index=False)

print('Completed!')

Completed!


In [32]:
print (sum(y_fit==1)*1.0/len(y_fit))
print (len(y_fit), sum(y_fit==1))
print (sum(y_eval==1)*1.0/len(y_eval))
print (len(y_eval), sum(y_eval==1))

0.0389634306761
(38010, 1481)
0.039515916864
(19005, 751)
