In [137]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
#from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedKFold 
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import describe

In [138]:
df_train = pd.read_csv("Data/train.csv")
df_test = pd.read_csv("Data/test.csv")

print(df_test.shape)
print(df_train.shape)

(75818, 370)
(76020, 371)


In [139]:
# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

print(df_test.shape)
print(df_train.shape)

(75818, 336)
(76020, 337)


In [140]:
df_train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,39205.17,0
1,3,2,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,49278.03,0
2,4,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,67333.77,0
3,8,2,37,0,195,195,0,0,0,0,...,0,0,0,0,0,0,0,0,64007.97,0
4,10,2,39,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,117310.979016,0


In [141]:
# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1,len(c)):
        if np.array_equal(v,df_train[c[j]].values):
            remove.append(c[j])

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

y_train = df_train['TARGET'].values
X_train = df_train.drop(['ID','TARGET'], axis=1).values

id_test = df_test['ID']
X_test = df_test.drop(['ID'], axis=1).values

print(df_test.shape)
print(df_train.shape)

(75818, 307)
(76020, 308)


In [147]:
num_vals = (X_train[:,0]==-999999)
print (sum(num_vals))
X_train[:,0] = 2

0


In [143]:
# Adding log features

c = X_train.shape[1]

for i in range(c):
    if sum(X_train[:,i]<0)==0:
        new_col = np.log(X_train[:,i]+1)
        X_train = np.column_stack((X_train, new_col))
        
        new_col = np.log(X_test[:,i]+1)
        X_test = np.column_stack((X_test, new_col))
        
print(X_test.shape)
print(X_train.shape)

(75818, 585)
(76020, 585)


In [144]:
## Feature selection

clf = ExtraTreesClassifier()
clf = clf.fit(X_train, y_train)
feature_selected = clf.feature_importances_

print ("ExtraTrees done")

ExtraTrees done


In [145]:
feature_selected = np.argsort(-feature_selected)
print (feature_selected[0:30])

[584 305 307   1 270 268 266 553 264 551 267 263 550 143 269 221 128 150
 508 510 552 223 265 224 506 422 218 452 219 505]


In [146]:
# length of dataset
len_train = len(X_train)
len_test  = len(X_test)

In [149]:
# Looking for outlier
num_vals = (X_train[:,0]==-999999)
print (sum(num_vals))

# classifier
clf = xgb.XGBClassifier(missing=np.nan, max_depth=3, n_estimators=350, learning_rate=0.03, nthread=8, subsample=0.95, colsample_bytree=0.85, seed=4242)

X_fit, X_eval, y_fit, y_eval = train_test_split(X_train, y_train, test_size=0.5, random_state = 84)

# fitting
#clf.fit(X_train, y_train, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])

clf.fit(X_train, y_train, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])

print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]), roc_auc_score(y_eval, clf.predict_proba(X_eval)[:,1]))

0
('Overall AUC:', 0.85570427034145358, 0.85818548567829711)


In [151]:
# predicting
y_pred= clf.predict_proba(X_test)[:,1]

submission = pd.DataFrame({"ID":id_test, "TARGET":y_pred})
submission.to_csv("predictions/submission_xgb.csv", index=False)

print('Completed!')

Completed!


In [11]:
## Learning with cross_validation

skf = StratifiedKFold (y_train,n_folds=5, random_state = 48)

rc_eval_dt = []
for fit_index, test_index in skf:
        # print("TRAIN:", fit_index, "TEST:", test_index)
        X_fit, X_eval = X_train[fit_index], X_train[test_index]
        y_fit, y_eval = y_train[fit_index], y_train[test_index]
        
        clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])
        
        rc_fit  = roc_auc_score(y_fit, clf.predict_proba(X_fit)[:,1])
        rc_eval = roc_auc_score(y_eval, clf.predict_proba(X_eval)[:,1])
        rc_train  = roc_auc_score(y_train, clf.predict_proba(X_train)[:,1])
        
        rc_eval_dt.append(rc_eval)
        print (rc_fit,rc_eval,rc_train)

rc_eval_dt = np.array(rc_eval_dt)
print ("\n")
print ("results: ", rc_eval_dt.min(),rc_eval_dt.max(),rc_eval_dt.mean())

(0.82925183744912734, 0.81684513695019678, 0.82676223764149526)
(0.8387333682605963, 0.81403982661370033, 0.83378117410530594)
(0.85722421034670082, 0.842733223637958, 0.85432520938338907)
(0.85429906580611092, 0.8525390044123603, 0.85394502559547192)
(0.85567309707810935, 0.83600849244319786, 0.8517707345870571)


('results: ', 0.81403982661370033, 0.8525390044123603, 0.83243313681148268)


In [12]:
d_var = [50,75,100,125,150,175,200,225,250,275,300,325]

for d in d_var:
    skf = StratifiedKFold (y_train,n_folds=5, random_state = 48)

    rc_eval_dt = []
    for fit_index, test_index in skf:

            # print("TRAIN:", fit_index, "TEST:", test_index)
            X_fit, X_eval = X_train[fit_index][:,feature_selected[0:d]], X_train[test_index][:,feature_selected[0:d]]
            y_fit, y_eval = y_train[fit_index], y_train[test_index]
            
            clf = xgb.XGBClassifier(missing=np.nan, max_depth=3, n_estimators=550, learning_rate=0.03, nthread=8, subsample=0.95, colsample_bytree=0.85, seed=4242)
            clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])    
            
            rc_fit  = roc_auc_score(y_fit, clf.predict_proba(X_fit)[:,1])
            rc_eval = roc_auc_score(y_eval, clf.predict_proba(X_eval)[:,1])
            rc_train  = roc_auc_score(y_train, clf.predict_proba(X_train[:,feature_selected[0:d]])[:,1])
            rc_eval_dt.append(rc_eval)
            
            print (rc_fit,rc_eval,rc_train)

    rc_eval_dt = np.array(rc_eval_dt)

    #print ("\n")
    print ("results: ",d," -- ",rc_eval_dt.min(),rc_eval_dt.max(),rc_eval_dt.mean())

(0.82581993002108967, 0.81269680625857832, 0.82319398494389151)
(0.86104645919428469, 0.82507184046967996, 0.85385450564596777)
(0.83039059245682523, 0.82318304141652632, 0.82894476104773207)
(0.83222389229268878, 0.84593146016740117, 0.83496609071694428)
(0.85769822822451824, 0.83669988224438052, 0.85352193817454669)
('results: ', 50, ' -- ', 0.81269680625857832, 0.84593146016740117, 0.82871660611131337)
(0.82733447416914574, 0.81358111915746611, 0.82458376213440876)
(0.85998455910990956, 0.82430600092867645, 0.85286227631919442)
(0.82385337019831295, 0.81647925396830456, 0.82237803274614718)
(0.85563373163944945, 0.85193541285457441, 0.85489688293370025)
(0.85948815896567154, 0.83640634781869516, 0.85489201316076291)
('results: ', 75, ' -- ', 0.81358111915746611, 0.85193541285457441, 0.82854162694554334)
(0.86151882117940148, 0.83478608705306312, 0.85617356255048715)
(0.86176085492922494, 0.82525998730975725, 0.85446189541780371)
(0.82026233281146843, 0.81221392099839773, 0.818650520

In [22]:
print (sum(y_fit==1)*1.0/len(y_fit))
print (len(y_fit), sum(y_fit==1))
print (sum(y_eval==1)*1.0/len(y_eval))
print (len(y_eval), sum(y_eval==1))

0.0389634306761
(38010, 1481)
0.0401736385162
(38010, 1527)


In [23]:
X_train.max(axis=1)

array([  39205.17      ,   49278.03      ,   67333.77      , ...,
         74028.15      ,   84278.16      ,  117310.97901649])

In [24]:
X_train.min(axis=1)

array([ 0., -1.,  0., ...,  0.,  0.,  0.])

In [25]:
%matplotlib inline  
import matplotlib.pyplot as plt

data_plt = X_train[:,28]
data_plt = data_plt[data_plt>=0]            
data_plt = np.log(data_plt+1)
#data_plt = data_plt[data_plt<25]
plt = hist(data_plt,10)

for j in unique(data_plt):
    print round(j,2), sum(data_plt==j), sum(y_train[data_plt==j])

NameError: name 'hist' is not defined