In [1]:
import pandas as pd
import numpy as np


from sklearn import ensemble
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.cross_validation import train_test_split



In [2]:
train = pd.read_csv('./train.csv', sep='\t')
test = pd.read_csv('./test.csv', sep='\t')

In [3]:
features = list(train.columns)

#remove index and target
features.remove('Unnamed: 0')
features.remove('0')
features.remove('160')
features.remove('164')

print('Length of features {}'.format(len(features)))

Length of features 343


In [12]:
# removing features which have high percent of zeros
for col in features:
    if max(train[col].value_counts(normalize=True)) <= 0.98:
        features.remove(col)

#some information
print('Length of features {}'.format(len(features)))

Length of features 238


In [13]:
# removing correlated features
for col1 in features:
    for col2 in features:
        if col1 != col2 and abs(train[col1].corr(train[col2])) > 0.9:
            print('{} is highly correlated with {}, corr = {}'.format(col1, col2, train[col1].corr(train[col2])))
            features.remove(col2)
print('Length of features {}'.format(len(features)))

17 is highly correlated with 153, corr = 1.0
26 is highly correlated with 180, corr = 0.9938509737758875
115 is highly correlated with 119, corr = 0.9322885796273787
322 is highly correlated with 326, corr = -0.9653174982983618
324 is highly correlated with 334, corr = 0.951960389387698
Length of features 233


In [4]:
X = train[features] 
y = train['0']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gb = ensemble.GradientBoostingRegressor()


parameters = {
    'n_estimators': [100], 
    'min_samples_split': [10], 
    'max_depth': [5, 10]

}


grid_search = GridSearchCV(
    estimator=gb, 
    param_grid=parameters, 
    scoring ='roc_auc', 
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search = grid_search.fit(X_train, y_train)

print(grid_search.best_score_)
print(grid_search.best_estimator_)
print(grid_search.best_params_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] min_samples_split=10, max_depth=5, n_estimators=100 .............
[CV] min_samples_split=10, max_depth=5, n_estimators=100 .............
[CV] min_samples_split=10, max_depth=5, n_estimators=100 .............
[CV] min_samples_split=10, max_depth=5, n_estimators=100 .............
[CV]  min_samples_split=10, max_depth=5, n_estimators=100, total= 1.4min
[CV] min_samples_split=10, max_depth=5, n_estimators=100 .............
[CV]  min_samples_split=10, max_depth=5, n_estimators=100, total= 1.5min
[CV] min_samples_split=10, max_depth=10, n_estimators=100 ............
[CV]  min_samples_split=10, max_depth=5, n_estimators=100, total= 1.5min
[CV] min_samples_split=10, max_depth=10, n_estimators=100 ............
[CV]  min_samples_split=10, max_depth=5, n_estimators=100, total= 1.5min
[CV] min_samples_split=10, max_depth=10, n_estimators=100 ............
[CV]  min_samples_split=10, max_depth=5, n_estimators=100, total= 1.6min
[CV] mi

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.4min finished


0.7233824869005818
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)
{'min_samples_split': 10, 'max_depth': 5, 'n_estimators': 100}
CPU times: user 1min 11s, sys: 384 ms, total: 1min 11s
Wall time: 10min 34s


In [6]:
from sklearn.metrics import roc_auc_score

y_pred = grid_search.best_estimator_.predict(X_val)

print('Test Score : {}'.format(roc_auc_score(y_val, y_pred)))

Test Score : 0.7240840321338295


In [7]:
X_test = test[features]
y_pred_test = grid_search.best_estimator_.predict(X_test)

pred_df = pd.DataFrame(test['Unnamed: 0'])
pred_df['_VAL_'] = y_pred_test
pred_df.columns = ['_ID_', '_VAL_']
pred_df.to_csv('gb_regressor.csv', index=False)

Validation Score : **0.7130867295807001**  
Test Score :       **0.7099651776306071**  
LB Score :         **0.71820161**

* all features  
Validation Score : **0.7233824869005818**  
Test Score :       **0.7240840321338295**  
LB score : **0.73546356**
* non-zeroes features  
Validation Score : **0.7220223958083952**  
Test Score :       **0.7250863063812483**