In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import gc 
import time
import optuna
import sklearn.metrics
from datetime import datetime 

# Use the ggplot theme for every figure drawn in this notebook.
plt.style.use('ggplot')

# Record when (month_day_hour_minute) and from which directory this run started.
n = datetime.now().strftime("%m_%d_%H_%M")
path = os.getcwd()
print(n, path, sep="\n")

02_15_19_24
/home/tomita/kaggle/kaggle_elo/Models


In [2]:
# Load the train / test tables from the input directory.
train = pd.read_csv('../input/train_1.csv')
test = pd.read_csv('../input/test_1.csv')

# Split the label off the training frame and keep a plain ndarray copy of it.
target = train.pop('target')
train_true = np.array(target)
print(train_true.shape)

# Remove 'outliers' so it can never end up in the model's feature list.
train = train.drop(columns=['outliers'])

# Model inputs: every remaining column except the id and the activation month.
features = [col for col in train.columns
            if col not in ('card_id', 'first_active_month')]
# Columns whose name contains 'feature_' are handed to LightGBM as categorical.
categorical_feats = [col for col in features if 'feature_' in col]

(201917,)


In [None]:
def kfold_lightgbm(trial):
    """Optuna objective: 5-fold cross-validated LightGBM (DART) regression.

    Draws one hyperparameter set from ``trial``, trains one model per fold on
    the notebook-level ``train`` / ``target`` data and scores the configuration
    by the RMSE of the out-of-fold (OOF) predictions, i.e. every training row
    is scored only by the fold model that did not see it during training.

    Reads the notebook globals ``train``, ``test``, ``target``, ``features``
    and ``categorical_feats``.

    Side effects (on every trial):
      * overwrites ``../output/feature_importance.csv`` with this trial's
        per-fold feature importances;
      * writes the fold-averaged test predictions to
        ``../output/submit_lgb<MM_DD_HH_MM_SS>_optuna.csv``.

    Args:
        trial: optuna trial object used to sample the hyperparameters.

    Returns:
        float: out-of-fold RMSE over the whole training set (lower is better).
    """
    seed = 20190208
    n_folds = 5
    num_round = 10000

    # Hyperparameters sampled by optuna. Each one must be registered under its
    # own unique name: reusing a name makes optuna hand back the value already
    # drawn for that name (or reject the call when the ranges differ), so the
    # second parameter would never be tuned independently.
    learning_rate_tuna = trial.suggest_uniform('learning_rate', 0, 1.0)
    toprate_tuna = trial.suggest_uniform('top_rate', 0, 1.0)
    min_child_weight_tuna = trial.suggest_int('min_child_weight', 5, 500)
    other_rate_tuna = trial.suggest_uniform('other_rate', 0.0, 1.0)
    num_leaves_tuna = trial.suggest_int('num_leaves', 5, 1000)
    min_gain_split_tuna = trial.suggest_uniform('min_gain_split', 5, 500)
    reg_lambda_tuna = trial.suggest_uniform('reg_lambda', 5, 500)
    subsample_tuna = trial.suggest_uniform('sub_sample', 0, 1.0)
    reg_alpha_tuna = trial.suggest_uniform('reg_alpha', 0, 20)
    colsample_bytree_tuna = trial.suggest_uniform('colsample_bytree_tuna', 0, 1.0)
    max_depth = trial.suggest_int('max_depth', 5, 100)

    # NOTE(review): per the LightGBM parameter docs, top_rate / other_rate are
    # GOSS settings and 'subsample' only takes effect together with a non-zero
    # bagging frequency -- confirm these three are really being tuned with
    # boosting='dart'.
    param = {'task': 'train',
             'boosting': 'dart',
             'objective': 'regression',
             'metric': 'rmse',
             'learning_rate': learning_rate_tuna,
             'subsample': subsample_tuna,
             'max_depth': max_depth,
             'top_rate': toprate_tuna,
             'num_leaves': num_leaves_tuna,
             'min_child_weight': min_child_weight_tuna,
             'other_rate': other_rate_tuna,
             'reg_alpha': reg_alpha_tuna,
             'colsample_bytree': colsample_bytree_tuna,
             'min_split_gain': min_gain_split_tuna,
             'reg_lambda': reg_lambda_tuna,
             'min_data_in_leaf': 21,
             'verbose': -1,
             'seed': seed,
             'bagging_seed': seed,
             'drop_seed': seed,
             'max_bin': 255,
             'device': 'gpu'
             }

    folds = KFold(n_splits=n_folds, shuffle=True, random_state=15)
    oof = np.zeros(len(train))            # out-of-fold predictions, one per train row
    predictions = np.zeros(len(test))     # fold-averaged test predictions
    fold_importances = []                 # one importance frame per fold

    # KFold only needs the number of rows, so the frame itself is enough.
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
        print("fold n°{}".format(fold_))
        trn_x = train.iloc[trn_idx][features]
        val_x = train.iloc[val_idx][features]
        trn_data = lgb.Dataset(trn_x, label=target.iloc[trn_idx],
                               categorical_feature=categorical_feats)
        val_data = lgb.Dataset(val_x, label=target.iloc[val_idx],
                               categorical_feature=categorical_feats)

        # NOTE(review): early stopping combined with boosting='dart' should be
        # checked against the installed LightGBM version -- confirm that
        # best_iteration is meaningful here.
        clf = lgb.train(param, trn_data, num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=100, early_stopping_rounds=200)
        oof[val_idx] = clf.predict(val_x, num_iteration=clf.best_iteration)

        fold_importances.append(pd.DataFrame({
            "feature": features,
            "importance": clf.feature_importance(),
            "fold": fold_ + 1,
        }))

        predictions += clf.predict(test[features],
                                   num_iteration=clf.best_iteration) / folds.n_splits

    # Concatenate once instead of growing a frame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)
    feature_importance_df.to_csv("../output/feature_importance.csv")

    timestamp = datetime.now().strftime("%m_%d_%H_%M_%S")
    sub_df = pd.read_csv("../input/sample_submission.csv", engine='python')
    sub_df["target"] = predictions
    sub_df.to_csv("../output/submit_lgb" + str(timestamp) + "_optuna.csv", index=False)

    # Score the trial on held-out data with the same metric the model is
    # trained on. Predictions on the full training set would be dominated by
    # rows each fold model had already seen, rewarding overfitting.
    return float(np.sqrt(mean_squared_error(target.values, oof)))


In [None]:
# Create a study with optuna's default settings, then evaluate the LightGBM
# cross-validation objective for 100 trials.
study = optuna.create_study()
study.optimize(func=kfold_lightgbm, n_trials=100)

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.69927	valid_1's rmse: 3.73597


In [None]:
# Reload the per-fold importances from disk (the file holds whatever was
# written to this path most recently).
feature_importance_df = pd.read_csv("../output/feature_importance.csv")

# Rank features by their mean importance and keep the 20 highest.
mean_importance = (feature_importance_df[["feature", "importance"]]
                   .groupby("feature")
                   .mean())
cols = mean_importance.sort_values(by="importance", ascending=False).head(20).index

# Keep every row that belongs to one of those top features.
is_top_feature = feature_importance_df["feature"].isin(cols)
best_features = feature_importance_df.loc[is_top_feature]

# Horizontal bar chart of the selected features, saved next to the notebook.
fig, ax = plt.subplots(figsize=(14, 25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance", ascending=False),
            ax=ax)
ax.set_title('LightGBM Features (avg over folds)')
fig.tight_layout()
fig.savefig('lgbm_importances.png')

In [None]:
# Display the hyperparameter values of the study's best trial.
study.best_params

In [None]:
# Write a submission file tagged with a manual run number.
# NOTE(review): in the code shown here `predictions` is only assigned inside
# kfold_lightgbm, so this cell relies on a notebook-level `predictions`
# existing in the kernel -- confirm where it is meant to come from.
n = 10
sub_df = pd.read_csv("../input/sample_submission.csv").assign(target=predictions)
sub_df.to_csv("../output/submit_lgb" + str(n) + "_optuna.csv", index=False)