In [1]:
import numpy as np 
import pandas as pd
import pandas_profiling

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [4]:
train = pd.read_csv("../data/train.csv", delimiter='\t')
test = pd.read_csv("../data/test.csv", delimiter='\t')

In [17]:
train.shape

(30500, 347)

In [16]:
test.shape

(4166, 347)

In [6]:
prof_report = pandas_profiling.ProfileReport(train)

In [7]:
prof_report.to_file("pp_report.html")

In [9]:
rejected_features = prof_report.get_rejected_variables()

In [49]:
test.shape[0] /train.shape[0] 

0.13659016393442622

# Validation

In [47]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [27]:
import lightgbm as lgbm

In [109]:
# Validation

new_train, valid = train_test_split(train, test_size=0.14, random_state=42)

In [718]:
train = train.drop(rejected_features, axis=1)

In [110]:
cv_idx = valid.index

In [719]:
X_train = train.drop(cv_idx)
X_train.drop('0', axis=1, inplace=True)

y_train = train.drop(cv_idx)['0']

In [707]:
rejected_features

['117',
 '119',
 '122',
 '146',
 '148',
 '153',
 '158',
 '165',
 '179',
 '180',
 '206',
 '323',
 '324',
 '325',
 '326',
 '327',
 '333',
 '334',
 '335',
 '341',
 '9']

In [708]:
# V2

X_train = X_train.drop(rejected_features, axis=1)

In [720]:
features = train.drop('0', axis=1).columns

In [712]:
features = train.drop(rejected_features, axis=1)
features = train.drop('0', axis=1).columns

In [714]:
lgbm_model = lgbm.LGBMClassifier(
    n_estimators=300, 
    colsample_bytree=0.6, 
    subsample=0.7,
    learning_rate=0.005, 
    subsample_for_bin=300000,
    num_leaves=21
)

In [715]:
lgbm_model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.6,
        learning_rate=0.005, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=300,
        n_jobs=-1, num_leaves=21, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.7,
        subsample_for_bin=300000, subsample_freq=1)

In [716]:
cv_pred = lgbm_model.predict_proba(train.loc[cv_idx, features])[:, 1]

ValueError: Number of features of the model must match the input. Model n_features_ is 325 and input n_features is 346 

In [717]:
print("CV roc_auc: {:.6f}".format(roc_auc_score(train.loc[cv_idx, '0'], cv_pred)))

CV roc_auc: 0.740545


In [434]:
train_pred = lgbm_model.predict_proba(train.drop(cv_idx, 0)[features])[:, 1]

In [435]:
print("Train roc_auc: {:.6f}".format(roc_auc_score(train.drop(cv_idx, 0)['0'], train_pred)))

Train roc_auc: 0.744921


In [448]:
params = "n_estimators=300, colsample_bytree=0.6, subsample=0.7, learning_rate=0.005, subsample_for_bin=300000, num_leaves=21"

In [449]:
model = []

In [425]:
cv_scores = []
train_scores = []
lb_scores = []

In [450]:
model.append(params)

In [436]:
cv_scores.append(roc_auc_score(train.loc[cv_idx, '0'], cv_pred))
train_scores.append(roc_auc_score(train.drop(cv_idx, 0)['0'], train_pred))

In [440]:
lb_scores.append(0.73535022)

In [452]:
results = pd.DataFrame()

In [453]:
results['cv_score'] = cv_scores

In [454]:
results['train_score'] = train_scores

In [455]:
results['public_lb'] = lb_scores

In [458]:
results['model'] = model

ValueError: Length of values does not match length of index

In [403]:
pred = (lgbm_model.predict_proba(test[features])[:, 1])

In [447]:
results

Unnamed: 0,cv_score,train_score
0,0.741477,0.763475
1,0.736375,0.744921


# XGBoost

max_depth=5, n_estimators=200, learning_rate=0.065

In [459]:
import xgboost as xgb

In [696]:
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.02,
    n_jobs=4,
    max_depth = 3,

    colsample_bytree = 0.75,
    colsample_bylevel = 0.7,
    
    max_leaves = 32,
    
    tree_method = 'hist',
    grow_policy = 'lossguide'
)

In [697]:
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=0.75, gamma=0, grow_policy='lossguide',
       learning_rate=0.02, max_delta_step=0, max_depth=3, max_leaves=32,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=4,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, tree_method='hist')

In [698]:
cv_pred = xgb_model.predict_proba(train.loc[cv_idx, features])[:, 1]

print("XGB CV roc_auc: {:.6f}".format(roc_auc_score(train.loc[cv_idx, '0'], cv_pred)))

train_pred =  xgb_model.predict_proba(train.drop(cv_idx, 0)[features])[:, 1]

print("XGB Train roc_auc: {:.6f}".format(roc_auc_score(train.drop(cv_idx, 0)['0'], train_pred)))

XGB CV roc_auc: 0.740545
XGB Train roc_auc: 0.797018


In [699]:
pred = (xgb_model.predict_proba(test[features])[:, 1])

Submission

In [700]:
submission = pd.DataFrame(test.iloc[:, 0])

In [701]:
submission.rename(index=str, columns={'Unnamed: 0': '_ID_'}, inplace=True)

In [702]:
submission['_VAL_'] = pred

In [703]:
submission

Unnamed: 0,_ID_,_VAL_
0,0,0.148049
1,1,0.430198
2,2,0.212835
3,3,0.305620
4,4,0.517141
5,5,0.282242
6,6,0.092889
7,7,0.145108
8,8,0.111738
9,9,0.316680


In [704]:
submission.to_csv("impr_xgb3.csv", index=False)

In [None]:
# lgbm: cv - 0.736375, pl - 0.73535022
# xgb: cv - 0.739591, train - 0.744921, lb - 0.76068500
# 0.741669 - 0.822175
"""
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
       colsample_bytree=0.7, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=4, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
"""

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.01,
    n_jobs=4,
    max_depth = 3,
    
    colsample_bytree = 0.7,
    colsample_bylevel = 0.7
)
XGB CV roc_auc: 0.736033
XGB Train roc_auc: 0.743417
LB: 0.72961263

xgb_model = xgb.XGBClassifier(
    n_estimators=363,
    learning_rate=0.02,
    n_jobs=4,
    max_depth = 3,

    colsample_bytree = 0.7,
    colsample_bylevel = 0.7,
    
    max_leaves = 32,
    
    tree_method = 'hist',
    grow_policy = 'lossguide'
)

XGB CV roc_auc: 0.739078
XGB Train roc_auc: 0.755951
LB: 0.73779452

xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.02,
    n_jobs=4,
    max_depth = 3,

    colsample_bytree = 0.75,
    colsample_bylevel = 0.7,
    
    max_leaves = 32,
    
    tree_method = 'hist',
    grow_policy = 'lossguide'
)

XGB CV roc_auc: 0.740545
XGB Train roc_auc: 0.797018