In [2]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
# The feature files have no header row, so pandas assigns integer column
# labels; add_prefix turns them into 'col0', 'col1', ...
# (The `prefix=` argument of read_csv that produced the same labels was
# deprecated in pandas 1.4 and removed in pandas 2.0.)
X_train = pd.read_csv('input/train.csv', header=None).add_prefix('col')
X_test = pd.read_csv('input/test.csv', header=None).add_prefix('col')
y_train = pd.read_csv('input/train-target.csv', header=None, names=['target'])
subm = pd.read_csv('input/sample-submission.csv', header=None)

# Training features and target side by side, joined on the default row index.
df = pd.concat([X_train, y_train], axis=1)

# Preprocessing

In [34]:
# Feature columns excluded from modelling; the same set is removed from the
# training and the test frame so both keep identical columns.
drop_cols = ['col17', 'col22', 'col26', 'col9', 'col15', 'col16']
X_train = X_train.drop(drop_cols, axis='columns')
X_test = X_test.drop(drop_cols, axis='columns')

# Model

In [5]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import time

In [93]:
def train_model(X, X_test, y, params, folds, model_type='lgb', plot_feature_importance=False,
                averaging='usual', model=None):
    """Train one LightGBM booster per cross-validation fold and average the test predictions.

    Parameters
    ----------
    X : array
        Training features. Must support indexing with integer index arrays
        (``X[train_index]``) and expose ``shape``.
    X_test : array
        Test features; every fold's booster predicts on it.
    y : array
        Training targets, indexable the same way as ``X``.
    params : dict
        Parameters forwarded to ``lgb.train``.
    folds : splitter
        Object whose ``split(X, y)`` yields ``(train_index, valid_index)`` pairs.
    model_type : str
        Only ``'lgb'`` is implemented.
    plot_feature_importance : bool
        When true, draw a bar plot of the 50 most important features and
        return the importance frame instead of the fold scores.
    averaging : str
        ``'usual'`` sums raw test predictions, ``'rank'`` sums their ranks;
        either way the sum is divided by the number of folds.
    model : object
        Unused; kept for backward compatibility (it is overwritten by the
        booster trained in each fold).

    Returns
    -------
    tuple
        ``(oof, prediction, scores)``, or ``(oof, prediction, feature_importance)``
        when ``plot_feature_importance`` is true. ``oof`` holds the out-of-fold
        predictions for ``X``, ``prediction`` the fold-averaged predictions for
        ``X_test`` and ``scores`` the per-fold validation ROC AUC values.

    Raises
    ------
    ValueError
        If ``model_type`` or ``averaging`` is not supported, or if ``folds``
        yields no splits.
    """
    # Fail early instead of crashing on an unbound variable (model_type) or
    # silently returning all-zero predictions (averaging).
    if model_type != 'lgb':
        raise ValueError(f"Unsupported model_type {model_type!r}; only 'lgb' is implemented.")
    if averaging not in ('usual', 'rank'):
        raise ValueError(f"Unsupported averaging {averaging!r}; expected 'usual' or 'rank'.")

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    fold_importances = []
    # X carries no column names here, so features are labelled by position.
    feature_names = [f'col{ncol}' for ncol in range(X.shape[-1])]

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_fit, X_valid = X[train_index], X[valid_index]
        y_fit, y_valid = y[train_index], y[valid_index]

        train_data = lgb.Dataset(X_fit, label=y_fit)
        valid_data = lgb.Dataset(X_valid, label=y_valid)

        # NOTE(review): verbose_eval / early_stopping_rounds are passed as
        # keyword arguments exactly as before; newer LightGBM releases appear
        # to expect callbacks instead - confirm before upgrading the library.
        model = lgb.train(params,
                          train_data,
                          num_boost_round=20000,
                          valid_sets=[train_data, valid_data],
                          verbose_eval=1000,
                          early_stopping_rounds=200)

        # Use the early-stopped iteration for validation and test alike.
        y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
        y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        else:  # 'rank'
            prediction += pd.Series(y_pred).rank().values

        fold_importances.append(pd.DataFrame({
            "feature": feature_names,
            "importance": model.feature_importance(),
            "fold": fold_n + 1,
        }))

    # Count the folds that actually ran rather than relying on a notebook
    # global, so the function works with any splitter and on a fresh kernel.
    n_folds_run = len(scores)
    if n_folds_run == 0:
        raise ValueError("folds.split(X, y) produced no folds.")

    prediction /= n_folds_run

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    # One concat after the loop instead of growing a frame fold by fold.
    feature_importance = pd.concat(fold_importances, axis=0)
    # Each per-fold row is scaled by the fold count (as before); the mean
    # taken below therefore equals the per-feature mean importance / n_folds.
    feature_importance["importance"] /= n_folds_run

    if plot_feature_importance:
        cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
            by="importance", ascending=False)[:50].index

        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12))
        sns.barplot(x="importance", y="feature",
                    data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LGB Features (avg over folds)')

        return oof, prediction, feature_importance

    return oof, prediction, scores

In [59]:
# Cross-validation setup: shuffled K-fold with a fixed seed so the splits are
# the same on every run.
n_fold = 5
folds = KFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=42,
)

In [100]:
# LightGBM training parameters for the binary-classification model, evaluated
# with AUC. 'feature_fraction' is intentionally not set here (a value of 0.5
# was tried and disabled).
params = dict(
    boost='gbdt',
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=21,
    num_leaves=150,
    num_threads=-1,
    verbosity=1,
    objective='binary',
)

In [101]:
# Result recorded from an earlier run of this cell:
# CV mean score: 0.8245, std: 0.0118.

# train_model indexes its inputs with integer arrays, so pass plain NumPy
# arrays rather than DataFrames.
X = X_train.to_numpy()
y = y_train.to_numpy().flatten()
X_test_np = X_test.to_numpy()

oof_lgb, prediction_lgb, scores = train_model(
    X, X_test_np, y,
    params=params,
    folds=folds,
    model_type='lgb',
    plot_feature_importance=False,
)

Fold 0 started at Sun Aug  2 18:07:05 2020
Training until validation scores don't improve for 200 rounds
[1000]	training's auc: 1	valid_1's auc: 0.822014
Early stopping, best iteration is:
[1019]	training's auc: 1	valid_1's auc: 0.822207
Fold 1 started at Sun Aug  2 18:07:22 2020
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[419]	training's auc: 0.998865	valid_1's auc: 0.825783
Fold 2 started at Sun Aug  2 18:07:31 2020
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[345]	training's auc: 0.99559	valid_1's auc: 0.841751
Fold 3 started at Sun Aug  2 18:07:39 2020
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[673]	training's auc: 1	valid_1's auc: 0.804916
Fold 4 started at Sun Aug  2 18:07:52 2020
Training until validation scores don't improve for 200 rounds
[1000]	training's auc: 1	valid_1's auc: 0.827758
Early stopping, best iteration 

In [102]:
# ROC AUC of the pooled out-of-fold predictions against the training targets.
roc_auc_score(y_true=y, y_score=oof_lgb)

0.822086529882569

In [103]:
# Display the fold-averaged test-set predictions returned by train_model.
prediction_lgb

array([0.09219256, 0.36178835, 0.12861747, ..., 0.2321156 , 0.16645781,
       0.84054615])

In [104]:
# Overwrite the first column of the sample-submission frame with the model's
# test predictions.
subm.iloc[:, 0] = prediction_lgb

In [105]:
# Write the submission without a header row and without the index column.
subm.to_csv('input/subm004.csv', header=False, index=False)