In [2]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [247]:
# Load the header-less CSV inputs. Feature columns are named col0, col1, ...
# via DataFrame.add_prefix: the `prefix=` argument of read_csv was deprecated
# in pandas 1.4 and removed in 2.0, while add_prefix yields the same names.
X_train = pd.read_csv('input/train.csv', header=None).add_prefix('col')
X_test = pd.read_csv('input/test.csv', header=None).add_prefix('col')
y_train = pd.read_csv('input/train-target.csv', header=None, names=['target'])
subm = pd.read_csv('input/sample-submission.csv', header=None)

# Features and target side by side (aligned on the default row index).
df = pd.concat([X_train, y_train], axis=1)

# Preprocessing

In [248]:
# Columns removed from both feature frames before modelling.
# NOTE: X_train / X_test are rebound here, so running this cell a second time
# raises KeyError because the columns are already gone.
drop_cols = ['col17', 'col22', 'col26', 'col9', 'col15', 'col16']
X_train = X_train.drop(drop_cols, axis='columns')
X_test = X_test.drop(drop_cols, axis='columns')

In [249]:
from sklearn.decomposition import FastICA

# NOTE: despite its name, `pca` holds a FastICA estimator.
# It is fitted on the training features only and then applied unchanged to
# the test features.
ica_settings = dict(n_components=9, max_iter=1500, random_state=12)
pca = FastICA(**ica_settings)
X_train_ica = pca.fit_transform(X_train)
X_test_ica = pca.transform(X_test)

In [267]:
# Design matrices: the remaining raw columns followed by the ICA components,
# as plain NumPy arrays (train_model indexes rows positionally).
train_parts = (X_train.to_numpy(), X_train_ica)
test_parts = (X_test.to_numpy(), X_test_ica)
X = np.concatenate(train_parts, axis=1)
X_test_np = np.concatenate(test_parts, axis=1)

# Target as a flat 1-D array.
y = y_train.to_numpy().flatten()

# Model

In [141]:
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import time

In [170]:
def train_model(X, X_test, y, params, folds, model_type='lgb', plot_feature_importance=False, 
                averaging='usual', model=None):
    """Cross-validate a LightGBM model and average its test-set predictions.

    One booster is trained per split produced by ``folds.split(X, y)``, with
    early stopping monitored on the validation part of the split. Each booster
    predicts its validation rows (out-of-fold predictions) and the full test
    set; test predictions are averaged over the folds.

    Parameters
    ----------
    X, X_test : array-like indexable by integer index arrays (``X[idx]``) and
        exposing ``.shape`` -- e.g. NumPy arrays. Train and test features.
    y : array-like indexable by integer index arrays. Training target.
    params : dict
        Parameters forwarded to ``lgb.train``.
    folds : object with a ``split(X, y)`` method yielding
        ``(train_index, valid_index)`` pairs.
    model_type : str
        Only ``'lgb'`` is implemented.
    plot_feature_importance : bool
        If True, draw a bar plot of the top 50 features and return the
        feature-importance frame instead of the fold scores (see Returns).
    averaging : {'usual', 'rank'}
        ``'usual'`` averages raw test predictions; ``'rank'`` averages their
        per-fold ranks.
    model : unused
        Kept for interface compatibility; it is overwritten by the trained
        booster of every fold.

    Returns
    -------
    (oof, prediction, scores) when ``plot_feature_importance`` is False, or
    (oof, prediction, feature_importance) when it is True. ``scores`` is the
    list of per-fold validation AUCs.

    Raises
    ------
    ValueError
        For an unsupported ``model_type`` or ``averaging``, or when ``folds``
        produces no splits.
    """
    # Fail fast: previously an unknown model_type crashed later with an
    # unbound-local error, and an unknown averaging mode silently returned
    # all-zero test predictions.
    if model_type != 'lgb':
        raise ValueError(f"Unsupported model_type {model_type!r}; only 'lgb' is implemented.")
    if averaging not in ('usual', 'rank'):
        raise ValueError(f"Unsupported averaging {averaging!r}; expected 'usual' or 'rank'.")

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    fold_importances = []
    # Synthetic names by column position of X (not the original frame names).
    feature_names = [f'col{ncol}' for ncol in range(X.shape[-1])]

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        train_data = lgb.Dataset(X_train, label=y_train, params={'verbose': -1})
        valid_data = lgb.Dataset(X_valid, label=y_valid, params={'verbose': -1})

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks instead -- confirm against the installed version.
        model = lgb.train(params,
                          train_data,
                          num_boost_round=20000,
                          valid_sets=[train_data, valid_data],
                          verbose_eval=False,
                          early_stopping_rounds=200)

        # Use the early-stopped iteration explicitly for both prediction sets.
        y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
        y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        else:  # 'rank'
            prediction += pd.Series(y_pred).rank().values

        fold_importances.append(pd.DataFrame({
            "feature": feature_names,
            "importance": model.feature_importance(),
            "fold": fold_n + 1,
        }))

    # Average over the folds that actually ran rather than a notebook-level
    # global `n_fold`, which may be undefined or out of sync with `folds`.
    n_folds_run = len(scores)
    if n_folds_run == 0:
        raise ValueError("`folds` produced no splits; nothing was trained.")
    prediction /= n_folds_run

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    # Build the importance frame once instead of re-concatenating every fold.
    feature_importance = pd.concat(fold_importances, axis=0)
    feature_importance["importance"] /= n_folds_run

    if plot_feature_importance:
        cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
            by="importance", ascending=False)[:50].index

        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12))
        sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LGB Features (avg over folds)')

        return oof, prediction, feature_importance

    return oof, prediction, scores

In [262]:
# Cross-validation splitter shared by the experiments in this notebook.
n_fold = 5
folds = KFold(n_fold, shuffle=True, random_state=42)
# Alternative splitters, left disabled:
# folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
# folds = TimeSeriesSplit(n_splits=n_fold)

In [263]:
# Baseline LightGBM settings: binary objective scored by AUC, unlimited depth.
params = dict(
    boost='gbdt',
    # feature_fraction=0.5,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=21,
    num_leaves=251,
    num_threads=-1,
    verbosity=-1,
    objective='binary',
)

In [268]:
# Previously noted result: CV mean score: 0.8263, std: 0.0107.

# With plot_feature_importance=False the third returned item is the list of
# per-fold scores.
oof_lgb, prediction_lgb, scores = train_model(
    X, X_test_np, y,
    params=params,
    folds=folds,
    model_type='lgb',
    plot_feature_importance=False,
)

Fold 0 started at Mon Aug  3 11:23:29 2020
Fold 1 started at Mon Aug  3 11:24:19 2020
Fold 2 started at Mon Aug  3 11:24:54 2020
Fold 3 started at Mon Aug  3 11:25:15 2020
Fold 4 started at Mon Aug  3 11:25:37 2020
CV mean score: 0.8355, std: 0.0115.


In [269]:
# AUC of the out-of-fold predictions scored together over all training rows.
roc_auc_score(y_true=y, y_score=oof_lgb)

0.8328676915546293

In [302]:
# Second LightGBM configuration: adds column subsampling and L2
# regularisation, with bounded depth and fewer leaves than the baseline.
params = dict(
    boost='gbdt',
    feature_fraction=0.75,
    lambda_l2=0.6,
    learning_rate=0.01,
    max_depth=15,
    metric='auc',
    min_data_in_leaf=31,
    num_leaves=131,
    num_threads=-1,
    objective='binary',
    verbosity=-1,
)

In [303]:
# Noted result: CV mean score: 0.8356, std: 0.0131.

# With plot_feature_importance=False the third returned item is the list of
# per-fold scores.
oof_lgb, prediction_lgb, scores = train_model(
    X, X_test_np, y,
    params=params,
    folds=folds,
    model_type='lgb',
    plot_feature_importance=False,
)

Fold 0 started at Mon Aug  3 11:54:39 2020
Fold 1 started at Mon Aug  3 11:54:54 2020
Fold 2 started at Mon Aug  3 11:55:10 2020
Fold 3 started at Mon Aug  3 11:55:19 2020
Fold 4 started at Mon Aug  3 11:55:37 2020
CV mean score: 0.8356, std: 0.0131.


In [304]:
# AUC of the out-of-fold predictions scored together over all training rows.
roc_auc_score(y_true=y, y_score=oof_lgb)

0.8322237845625459

# Hyperopt

In [254]:
from hyperopt import hp

In [251]:
# Design matrices for the hyper-parameter search: the remaining raw columns
# followed by the ICA components, as plain NumPy arrays.
train_parts = (X_train.to_numpy(), X_train_ica)
test_parts = (X_test.to_numpy(), X_test_ica)
X = np.concatenate(train_parts, axis=1)
X_test_np = np.concatenate(test_parts, axis=1)

# Target as a flat 1-D array.
y = y_train.to_numpy().flatten()

In [None]:
# Baseline LightGBM settings: binary objective scored by AUC, unlimited depth.
params = dict(
    boost='gbdt',
    # feature_fraction=0.5,
    learning_rate=0.01,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=21,
    num_leaves=251,
    num_threads=-1,
    verbosity=-1,
    objective='binary',
)

In [260]:
def to_max(hp_params):
    """Objective for hyperopt: the negated mean of the per-fold CV scores.

    The sign is flipped because the caller minimises this function, so a
    higher mean score gives a lower return value. Reads the notebook-level
    ``X``, ``X_test_np``, ``y`` and ``folds``.
    """
    _, _, fold_scores = train_model(
        X, X_test_np, y,
        params=hp_params,
        folds=folds,
        model_type='lgb',
        plot_feature_importance=False,
    )
    mean_score = np.mean(fold_scores)
    return -mean_score

# Hyperopt search space for LightGBM. Fixed entries are passed through as-is;
# hp.choice entries pick one value from a discrete grid.
#
# The float grids are rounded to the grid's precision: np.arange with a
# non-integer step accumulates rounding error, so raw values can come out as
# e.g. 0.15000000000000002 or drift slightly past the intended endpoint.
# .tolist() hands native Python numbers to hyperopt.
feature_fraction_grid = np.round(np.arange(0.05, 1.01, 0.05), 2).tolist()
lambda_l2_grid = np.round(np.arange(0, 3, 0.1), 1).tolist()
min_data_in_leaf_grid = np.arange(11, 151, 5).tolist()
num_leaves_grid = np.arange(51, 351, 10).tolist()

lgb_params = {'boost': 'gbdt',
          'feature_fraction': hp.choice('feature_fraction', feature_fraction_grid),
          'learning_rate': 0.01,
          'max_depth': hp.choice('max_depth', [-1, 7, 11, 15, 21]),
          'metric': 'auc',
          'min_data_in_leaf': hp.choice('min_data_in_leaf', min_data_in_leaf_grid),
          'num_leaves': hp.choice('num_leaves', num_leaves_grid),
          'num_threads': -1,
          'verbosity': -1,
          'lambda_l2': hp.choice('lambda_l2', lambda_l2_grid),
          'objective': 'binary',
         }

In [261]:
# Run the TPE search: 100 evaluations of the objective over the search space.
from hyperopt import fmin, tpe, space_eval
best = fmin(fn=to_max, space=lgb_params, algo=tpe.suggest, max_evals=100)

# `best` is the raw result returned by fmin; space_eval maps it back onto the
# search space to show the concrete parameter values.
print(best)
print(space_eval(lgb_params, best))

Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:16:06 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:16:15 2020                                                                                               
Fold                                    

1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:17:47 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:17:51 2020                                                                                               
Fold                                                                                                                   
3                                       

started at                                                                                                             
Mon Aug  3 10:19:05 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:19:08 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                              

Mon Aug  3 10:20:19 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:20:21 2020                                                                                               
CV mean score: 0.8293, std: 0.0123.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                              

CV mean score: 0.8164, std: 0.0119.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:21:48 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:21:58 2020                

Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:23:25 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:23:31 2020                                                                                               
Fold                                    

2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:24:53 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:24:57 2020                                                                                               
Fold                                                                                                                   
4                                       

started at                                                                                                             
Mon Aug  3 10:26:44 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:26:49 2020                                                                                               
CV mean score: 0.8347, std: 0.0127.                                                                                    
Fold                                                                                                                   
0                                       

Mon Aug  3 10:28:54 2020                                                                                               
CV mean score: 0.8327, std: 0.0120.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:29:00 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                              

Mon Aug  3 10:30:35 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:30:46 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:30:58 2020                

Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:33:03 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:33:10 2020                                                                                               
Fold                                    

3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:35:07 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:35:11 2020                                                                                               
CV mean score: 0.8343, std: 0.0127.                                                                                    
Fold                                    

started at                                                                                                             
Mon Aug  3 10:36:09 2020                                                                                               
CV mean score: 0.8290, std: 0.0119.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:36:14 2020                                                                                               
Fold                                                                                                                   
1                                       

started at                                                                                                             
Mon Aug  3 10:37:20 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:37:28 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                              

Mon Aug  3 10:39:52 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:40:02 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:40:10 2020                

Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:43:01 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:43:15 2020                                                                                               
CV mean score: 0.8341, std: 0.0122.     

4                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:45:27 2020                                                                                               
CV mean score: 0.8330, std: 0.0121.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:45:32 2020                                                                                               
Fold                                    

0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:47:10 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:47:24 2020                                                                                               
Fold                                                                                                                   
2                                       

started at                                                                                                             
Mon Aug  3 10:48:58 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:49:03 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                              

Mon Aug  3 10:50:19 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:50:26 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:50:39 2020                

Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:52:28 2020                                                                                               
CV mean score: 0.8255, std: 0.0119.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:52:33 2020                

Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:54:36 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:54:42 2020                                                                                               
Fold                                    

1                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:56:22 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:56:30 2020                                                                                               
Fold                                                                                                                   
3                                       

started at                                                                                                             
Mon Aug  3 10:58:29 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 10:58:33 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                              

Mon Aug  3 11:00:50 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:00:54 2020                                                                                               
CV mean score: 0.8353, std: 0.0120.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                              

CV mean score: 0.8353, std: 0.0120.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:02:53 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:02:59 2020                

Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:04:39 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:04:47 2020                                                                                               
Fold                                    

2                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:06:33 2020                                                                                               
Fold                                                                                                                   
3                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:06:38 2020                                                                                               
Fold                                                                                                                   
4                                       

started at                                                                                                             
Mon Aug  3 11:09:34 2020                                                                                               
Fold                                                                                                                   
4                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:09:42 2020                                                                                               
CV mean score: 0.8353, std: 0.0121.                                                                                    
Fold                                                                                                                   
0                                       

Mon Aug  3 11:11:40 2020                                                                                               
CV mean score: 0.8322, std: 0.0120.                                                                                    
Fold                                                                                                                   
0                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:11:45 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                              

Mon Aug  3 11:14:14 2020                                                                                               
Fold                                                                                                                   
1                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:14:17 2020                                                                                               
Fold                                                                                                                   
2                                                                                                                      
started at                                                                                                             
Mon Aug  3 11:14:20 2020                

In [272]:
# Print the parameter set obtained by evaluating the search space `lgb_params`
# at the point `best`. The recorded output below is a dict of concrete values.
# NOTE(review): `space_eval` and `best` are defined outside this section,
# presumably hyperopt's `space_eval` and the result of its `fmin` — confirm.
print(space_eval(lgb_params, best))

{'boost': 'gbdt', 'feature_fraction': 0.7500000000000001, 'lambda_l2': 0.6000000000000001, 'learning_rate': 0.01, 'max_depth': 15, 'metric': 'auc', 'min_data_in_leaf': 31, 'num_leaves': 131, 'num_threads': -1, 'objective': 'binary', 'verbosity': -1}


In [None]:
# Scratch lookup: the grid is 0.05, 0.10, ..., 1.00 and index 14 selects ~0.75.
# NOTE(review): presumably this decodes a search-space choice index into the
# `feature_fraction` value (the recorded parameter dict above reports
# 0.7500000000000001) — confirm against the search-space definition.
# This cell has no execution count, so it was not run in the saved session.
np.arange(0.05, 1.01, 0.05)[14]

# Saving

In [305]:
# Display the prediction array as a quick sanity check before it is written
# into the submission frame (NumPy abbreviates the middle of long arrays).
# NOTE(review): `prediction_lgb` is produced outside this section — presumably
# the test-set predictions of the LightGBM model; confirm.
prediction_lgb

array([0.13053734, 0.21377889, 0.17430052, ..., 0.11894545, 0.06074816,
       0.91159809])

In [306]:
# Overwrite the first column (by position) of the sample-submission frame with
# the predictions. This mutates `subm` in place, so the original sample values
# are gone after this cell runs.
# Assumes `prediction_lgb` holds exactly one value per row of `subm`.
subm.iloc[:, 0] = prediction_lgb

In [318]:
# Write the submission without a header row and without the index column.
# `header` and `index` are boolean flags on `DataFrame.to_csv`; pass False
# explicitly rather than relying on None being treated as falsy.
subm.to_csv('input/subm008.csv', header=False, index=False)

In [313]:
# Load an earlier submission file so it can be blended with the current one.
# `header=None` tells pandas the file has no header row, so the first line is
# read as data and columns get integer labels.
subm_last = pd.read_csv('input/subm007.csv', header=None)

In [319]:
# Blend the current and the previous submission with an equal-weight average
# and write the result without header row or index column.
# DataFrame addition aligns on row/column labels and fills non-matching
# positions with NaN, so a shape mismatch would silently corrupt the output;
# fail loudly instead.
assert subm.shape == subm_last.shape, (
    f"cannot average submissions of different shapes: "
    f"{subm.shape} vs {subm_last.shape}"
)
((subm + subm_last) / 2).to_csv('input/subm009.csv', header=False, index=False)