In [4]:
import pandas as pd
import numpy as np
import gc

# Gradient Boosting
import lightgbm as lgb
import xgboost as xgb

# Scikit-learn
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

# Graphics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization If the last two positions at which the objective has been evaluated are less than delta

# Hyperparameters distributions
from scipy.stats import randint
from scipy.stats import uniform

# Metrics
from sklearn.metrics import average_precision_score, roc_auc_score, mean_absolute_error

import os
import warnings


In [9]:
# Load the train and test CSV files from the current working directory.
santander_data = pd.read_csv('train.csv')
santander_data_test = pd.read_csv('test.csv')


In [10]:
# Keep the `target` column as the label vector; the following cell removes it
# from the feature frame.
label_df = santander_data['target']

In [11]:
# Strip the identifier (and, for train, the label) so only feature columns remain.
santander_data = santander_data.drop(columns=['ID_code', 'target'])
santander_data_test = santander_data_test.drop(columns='ID_code')

santander_data.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [12]:
# List the columns whose dtype is not numeric (the recorded output is an empty Index).
santander_data.select_dtypes(exclude=np.number).columns

Index([], dtype='object')

In [13]:
# Number of training rows; used later to split the merged frame back into
# its train and test parts by position.
len_train = len(santander_data)
len_train

200000

In [14]:
# Stack train on top of test so the engineered features are computed once for both.
# The original row labels are kept (no `ignore_index`), so labels repeat across the
# two halves; later cells split the frame by position with `iloc`.
merged = pd.concat([santander_data, santander_data_test])
# Save the original feature names in `original_features`.
original_features = merged.columns
merged.shape

(400000, 200)

In [15]:
# Row-wise summary statistics over the first 200 columns of `merged`,
# appended as eight new feature columns.
idx = features = merged.columns.values[0:200]
df = merged  # same binding the original single-element loop left behind
raw = df[idx]  # only the 200 base columns, so newly added stats never feed back in
for stat_name, reducer in [
    ('sum', raw.sum),
    ('min', raw.min),
    ('max', raw.max),
    ('mean', raw.mean),
    ('std', raw.std),
    ('skew', raw.skew),
    ('kurt', raw.kurtosis),
    ('med', raw.median),
]:
    df[stat_name] = reducer(axis=1)

In [16]:
# The first `len_train` rows of the merged frame are the training rows.
train_df = merged.iloc[:len_train]
train_df.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_198,var_199,sum,min,max,mean,std,skew,kurt,med
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,12.7803,-1.0914,1456.3182,-21.4494,43.1127,7.281591,9.33154,0.10158,1.331023,6.7704
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,18.356,1.9518,1415.3636,-47.3797,40.5632,7.076818,10.33613,-0.351734,4.110215,7.22315
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,14.7222,0.3965,1240.8966,-22.4038,33.882,6.204483,8.753387,-0.056957,0.546438,5.8994
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,17.9697,-8.9996,1288.2319,-35.1659,38.1015,6.44116,9.594064,-0.480116,2.630499,6.7026
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,17.9974,-8.8104,1354.231,-65.4863,41.1037,6.771155,11.287122,-1.463426,9.787399,6.94735


In [17]:
# Everything after the first `len_train` rows is the test part of the merged frame.
X_test = merged.iloc[len_train:]
X_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_198,var_199,sum,min,max,mean,std,skew,kurt,med
0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,8.81,...,15.4722,-8.7197,1416.6404,-31.9891,42.0248,7.083202,9.910632,-0.088518,1.871262,7.3144
1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,5.9739,...,19.1293,-20.976,1249.686,-41.1924,35.602,6.24843,9.541267,-0.559785,3.391068,6.4396
2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,8.3442,...,19.8956,-23.1794,1430.2599,-34.3488,39.3654,7.1513,9.967466,-0.135084,2.326901,7.26355
3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,13.0168,-4.2108,1411.4447,-21.4797,40.3383,7.057224,8.257204,-0.167741,2.253054,6.89675
4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,7.1437,...,13.926,-9.1846,1423.7364,-24.8254,45.551,7.118682,10.043542,0.293484,2.044943,6.83375


In [18]:
def augment(x, y, t=2, rng=None):
    """Oversample both classes by shuffling each feature column within its class.

    For every synthetic copy, the rows of one class are taken and each column
    is permuted independently, so every synthetic row mixes feature values
    from different real rows of the same class.

    Parameters
    ----------
    x : np.ndarray
        2-D feature matrix (rows are samples, columns are features).
    y : np.ndarray
        1-D label vector aligned with the rows of ``x``. Rows with ``y > 0``
        are treated as positives, rows with ``y == 0`` as negatives.
    t : int, default 2
        Number of shuffled copies of the positive rows. The negative rows get
        ``t // 2`` copies. Values below 2 (no negative copies) and 0 (no
        copies at all) are handled instead of raising.
    rng : np.random.Generator or np.random.RandomState, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, exactly as before, so ``np.random.seed`` still controls it.

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``x`` followed by the synthetic positives and then the synthetic
        negatives, and the matching label vector (1.0 for synthetic
        positives, 0.0 for synthetic negatives).
    """
    shuffle = np.random.shuffle if rng is None else rng.shuffle

    def _shuffled_copies(class_rows, n_copies):
        # Build `n_copies` column-wise shuffled versions of `class_rows`.
        copies = []
        for _ in range(n_copies):
            x1 = class_rows.copy()
            ids = np.arange(x1.shape[0])
            for c in range(x1.shape[1]):
                shuffle(ids)
                # Index only column `c`; avoids copying the whole matrix per column.
                x1[:, c] = x1[ids, c]
            copies.append(x1)
        return copies

    # Positives are generated first, then negatives, to keep the order in
    # which random numbers are consumed unchanged.
    xs = _shuffled_copies(x[y > 0], t)
    xn = _shuffled_copies(x[y == 0], t // 2)

    n_pos = sum(part.shape[0] for part in xs)
    n_neg = sum(part.shape[0] for part in xn)

    # Stack everything in one call so an empty list of copies is harmless.
    x_out = np.vstack([x] + xs + xn)
    y_out = np.concatenate([y, np.ones(n_pos), np.zeros(n_neg)])
    return x_out, y_out

In [32]:
# Drop the names of the raw frames and run a garbage-collection pass;
# later cells use `merged`, `train_df` and `X_test` instead.
del santander_data
del santander_data_test
gc.collect()

122

In [20]:
# Stratified 5-fold splitter shared by all cross-validation cells below.
# `random_state` is only meaningful together with `shuffle=True`; passing it with
# `shuffle=False` has no effect on the folds and is rejected with a ValueError by
# current scikit-learn releases, so it is omitted. The folds themselves are unchanged.
skf_three = StratifiedKFold(n_splits=5, shuffle=False)

In [21]:
# LightGBM parameters handed to `lgb.train` in the cross-validation cell:
# a binary objective evaluated with AUC.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [22]:
# Cross-validated LightGBM run.
# `oof_preds` collects out-of-fold predictions for the training rows;
# `sub_preds` accumulates the fold-averaged predictions for the test rows.
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(len(X_test))
feats = list(train_df.columns)
# Average over however many folds the splitter actually produces (was a literal 5).
n_folds = skf_three.get_n_splits()

for n_fold, (train_idx, valid_idx) in enumerate(skf_three.split(train_df[feats], label_df)):
    X_train, y_train = train_df.iloc[train_idx][feats], label_df.iloc[train_idx]
    X_valid, y_valid = train_df.iloc[valid_idx][feats], label_df.iloc[valid_idx]

    # Only the training part of the fold is augmented; validation rows stay real.
    X_tr, y_tr = augment(X_train.values, y_train.values)
    # `augment` returns a bare ndarray; restore the column names so the training
    # and validation sets carry the same feature names.
    X_tr = pd.DataFrame(X_tr, columns=feats)

    print("Fold idx:{}".format(n_fold + 1))
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments of
    # older LightGBM releases; newer releases expect callbacks instead — confirm
    # against the installed version before upgrading.
    clf = lgb.train(param, trn_data, 40000, valid_sets=[trn_data, val_data],
                    verbose_eval=1000, early_stopping_rounds=3000)

    oof_preds[valid_idx] = clf.predict(X_valid, num_iteration=clf.best_iteration)
    sub_preds += clf.predict(X_test[feats], num_iteration=clf.best_iteration) / n_folds


print('Full AUC score %.6f' % roc_auc_score(label_df, oof_preds))

pred3 = sub_preds

Fold idx:1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.897362	valid_1's auc: 0.883352
[2000]	training's auc: 0.903706	valid_1's auc: 0.888257
[3000]	training's auc: 0.908066	valid_1's auc: 0.891693
[4000]	training's auc: 0.911694	valid_1's auc: 0.894252
[5000]	training's auc: 0.914577	valid_1's auc: 0.89572
[6000]	training's auc: 0.917117	valid_1's auc: 0.896854
[7000]	training's auc: 0.919384	valid_1's auc: 0.897752
[8000]	training's auc: 0.92144	valid_1's auc: 0.898346
[9000]	training's auc: 0.923367	valid_1's auc: 0.898771
[10000]	training's auc: 0.925202	valid_1's auc: 0.899045
[11000]	training's auc: 0.926941	valid_1's auc: 0.899121
[12000]	training's auc: 0.928653	valid_1's auc: 0.899245
[13000]	training's auc: 0.93028	valid_1's auc: 0.89931
[14000]	training's auc: 0.93191	valid_1's auc: 0.899357
[15000]	training's auc: 0.933497	valid_1's auc: 0.89935
[16000]	training's auc: 0.935038	valid_1's auc: 0.899304
Early stopping, best iterati

In [24]:
# Put the fold-averaged test predictions into the sample submission's
# `target` column and write the result to disk.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['target'] = pred3
sample_submission.to_csv('submission.csv', index=False)

In [28]:
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor

# Forecasting with decomposable model
from sklearn.linear_model import LinearRegression, RidgeCV

In [29]:
# Empty placeholder list; no visible cell appends to it or reads it.
droppable_features = []

In [31]:
# Reload the raw CSV files (with `ID_code` and `target` still present)
# for the exploratory cells that follow.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [33]:
# Fraction of missing values per column, largest first.
(train.isnull().sum()/train.shape[0]).sort_values(ascending=False)

var_199    0.0
var_61     0.0
var_71     0.0
var_70     0.0
var_69     0.0
var_68     0.0
var_67     0.0
var_66     0.0
var_65     0.0
var_64     0.0
var_63     0.0
var_62     0.0
var_60     0.0
var_48     0.0
var_59     0.0
var_58     0.0
var_57     0.0
var_56     0.0
var_55     0.0
var_54     0.0
var_53     0.0
var_52     0.0
var_51     0.0
var_50     0.0
var_72     0.0
var_73     0.0
var_74     0.0
var_75     0.0
var_96     0.0
var_95     0.0
          ... 
var_104    0.0
var_103    0.0
var_102    0.0
var_101    0.0
var_122    0.0
var_124    0.0
var_147    0.0
var_125    0.0
var_146    0.0
var_145    0.0
var_144    0.0
var_143    0.0
var_142    0.0
var_141    0.0
var_140    0.0
var_139    0.0
var_138    0.0
var_137    0.0
var_136    0.0
var_135    0.0
var_134    0.0
var_133    0.0
var_132    0.0
var_131    0.0
var_130    0.0
var_129    0.0
var_128    0.0
var_127    0.0
var_126    0.0
ID_code    0.0
Length: 202, dtype: float64

In [34]:
# Changes the global pandas float display format for the rest of the session.
pd.options.display.float_format = '{:,.4f}'.format
# Per-column summary: `uniq` is the number of distinct values; `skewness` is the
# percentage of rows taken by the first entry of `value_counts(normalize=True)`
# (the most frequent value under the default descending sort), not a
# statistical skewness.
sk_df = pd.DataFrame([{'column': c, 'uniq': train[c].nunique(), 'skewness': train[c].value_counts(normalize=True).values[0] * 100} for c in train.columns])
sk_df = sk_df.sort_values('skewness', ascending=False)
sk_df

Unnamed: 0,column,skewness,uniq
1,target,89.9510,2
70,var_68,0.5420,451
110,var_108,0.1565,8525
128,var_126,0.1525,32411
14,var_12,0.1015,9561
93,var_91,0.0330,7962
105,var_103,0.0305,9376
150,var_148,0.0295,10608
73,var_71,0.0270,13527
163,var_161,0.0260,11071


In [35]:
# Share of NaN entries in every column of `train` (0.0 means no missing values).
null_counts = train.isnull().mean()

In [36]:
# Display the per-column missing-value fractions computed in the previous cell.
null_counts

ID_code   0.0000
target    0.0000
var_0     0.0000
var_1     0.0000
var_2     0.0000
var_3     0.0000
var_4     0.0000
var_5     0.0000
var_6     0.0000
var_7     0.0000
var_8     0.0000
var_9     0.0000
var_10    0.0000
var_11    0.0000
var_12    0.0000
var_13    0.0000
var_14    0.0000
var_15    0.0000
var_16    0.0000
var_17    0.0000
var_18    0.0000
var_19    0.0000
var_20    0.0000
var_21    0.0000
var_22    0.0000
var_23    0.0000
var_24    0.0000
var_25    0.0000
var_26    0.0000
var_27    0.0000
           ...  
var_170   0.0000
var_171   0.0000
var_172   0.0000
var_173   0.0000
var_174   0.0000
var_175   0.0000
var_176   0.0000
var_177   0.0000
var_178   0.0000
var_179   0.0000
var_180   0.0000
var_181   0.0000
var_182   0.0000
var_183   0.0000
var_184   0.0000
var_185   0.0000
var_186   0.0000
var_187   0.0000
var_188   0.0000
var_189   0.0000
var_190   0.0000
var_191   0.0000
var_192   0.0000
var_193   0.0000
var_194   0.0000
var_195   0.0000
var_196   0.0000
var_197   0.00

In [37]:
# Peek at the training frame, including the engineered row-statistics columns.
train_df.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_198,var_199,sum,min,max,mean,std,skew,kurt,med
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,12.7803,-1.0914,1456.3182,-21.4494,43.1127,7.2816,9.3315,0.1016,1.331,6.7704
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,18.356,1.9518,1415.3636,-47.3797,40.5632,7.0768,10.3361,-0.3517,4.1102,7.2232
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,14.7222,0.3965,1240.8966,-22.4038,33.882,6.2045,8.7534,-0.057,0.5464,5.8994
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,17.9697,-8.9996,1288.2319,-35.1659,38.1015,6.4412,9.5941,-0.4801,2.6305,6.7026
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,17.9974,-8.8104,1354.231,-65.4863,41.1037,6.7712,11.2871,-1.4634,9.7874,6.9474


In [43]:
from sklearn import linear_model

In [46]:
# Cross-validated BayesianRidge run on the augmented training folds.
# `oof_preds` collects out-of-fold predictions for the training rows;
# `sub_preds` accumulates the fold-averaged predictions for the test rows.
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(len(X_test))
feats = list(train_df.columns)
# Average over however many folds the splitter actually produces (was a literal 5).
n_folds = skf_three.get_n_splits()

for n_fold, (train_idx, valid_idx) in enumerate(skf_three.split(train_df[feats], label_df)):
    X_train, y_train = train_df.iloc[train_idx][feats], label_df.iloc[train_idx]
    X_valid, y_valid = train_df.iloc[valid_idx][feats], label_df.iloc[valid_idx]

    # Only the training part of the fold is augmented; validation rows stay real.
    X_tr, y_tr = augment(X_train.values, y_train.values)
    # `augment` returns a bare ndarray; restore the column names so the model is
    # fitted and queried with identically named features.
    X_tr = pd.DataFrame(X_tr, columns=feats)

    print("Fold idx:{}".format(n_fold + 1))
    # NOTE(review): `n_iter` is the iteration-limit keyword of older scikit-learn
    # releases; newer releases may expect `max_iter` — confirm against the
    # installed version.
    clf = linear_model.BayesianRidge(alpha_1=0.0001, alpha_2=0.0001, lambda_1=0.0000001,
                                     compute_score=True, n_iter=1750)
    clf.fit(X_tr, y_tr)

    # The regressor's raw outputs are used as ranking scores for the AUC below.
    oof_preds[valid_idx] = clf.predict(X_valid)
    sub_preds += clf.predict(X_test[feats]) / n_folds


print('Full AUC score %.6f' % roc_auc_score(label_df, oof_preds))

pred3 = sub_preds

Fold idx:1
Fold idx:2
Fold idx:3
Fold idx:4
Fold idx:5
Full AUC score 0.860042


In [None]:
# Write whatever `pred3` currently holds into the sample submission's `target`
# column and save it under a second file name.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['target'] = pred3
sample_submission.to_csv('submission2.csv', index=False)

In [38]:
# Rebuild one fold's train/valid split and its augmented training frame for inspection.
# NOTE(review): `train_idx`, `valid_idx` and `feats` are not defined in this cell;
# they are whatever a previously executed cross-validation loop left behind, so
# this cell fails on a fresh kernel unless such a loop has run first.
X_train, y_train = train_df.iloc[train_idx][feats], label_df.iloc[train_idx]
X_valid, y_valid = train_df.iloc[valid_idx][feats], label_df.iloc[valid_idx]

X_tr, y_tr = augment(X_train.values, y_train.values)
X_tr = pd.DataFrame(X_tr)


In [39]:
# Peek at the augmented training frame; it was built from a bare ndarray, so its
# columns are integer positions rather than the original feature names.
X_tr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,198,199,200,201,202,203,204,205,206,207
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,12.7803,-1.0914,1456.3182,-21.4494,43.1127,7.2816,9.3315,0.1016,1.331,6.7704
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,18.356,1.9518,1415.3636,-47.3797,40.5632,7.0768,10.3361,-0.3517,4.1102,7.2232
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,14.7222,0.3965,1240.8966,-22.4038,33.882,6.2045,8.7534,-0.057,0.5464,5.8994
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,17.9697,-8.9996,1288.2319,-35.1659,38.1015,6.4412,9.5941,-0.4801,2.6305,6.7026
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,17.9974,-8.8104,1354.231,-65.4863,41.1037,6.7712,11.2871,-1.4634,9.7874,6.9474


0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [47]:
# Cross-validated Lasso run on the augmented training folds.
# `oof_preds` collects out-of-fold predictions for the training rows;
# `sub_preds` accumulates the fold-averaged predictions for the test rows.
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(len(X_test))
feats = list(train_df.columns)
# Average over however many folds the splitter actually produces (was a literal 5).
n_folds = skf_three.get_n_splits()

for n_fold, (train_idx, valid_idx) in enumerate(skf_three.split(train_df[feats], label_df)):
    X_train, y_train = train_df.iloc[train_idx][feats], label_df.iloc[train_idx]
    X_valid, y_valid = train_df.iloc[valid_idx][feats], label_df.iloc[valid_idx]

    # Only the training part of the fold is augmented; validation rows stay real.
    X_tr, y_tr = augment(X_train.values, y_train.values)
    # `augment` returns a bare ndarray; restore the column names so the model is
    # fitted and queried with identically named features.
    X_tr = pd.DataFrame(X_tr, columns=feats)

    print("Fold idx:{}".format(n_fold + 1))
    clf = linear_model.Lasso(alpha=0.001, fit_intercept=True, random_state=23)
    clf.fit(X_tr, y_tr)

    # The regressor's raw outputs are used as ranking scores for the AUC below.
    oof_preds[valid_idx] = clf.predict(X_valid)
    sub_preds += clf.predict(X_test[feats]) / n_folds


print('Full AUC score %.6f' % roc_auc_score(label_df, oof_preds))

pred3 = sub_preds

Fold idx:1
Fold idx:2
Fold idx:3
Fold idx:4
Fold idx:5
Full AUC score 0.858785
