In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression

  from pandas import MultiIndex, Int64Index


In [2]:
# Let's explore the data we have here

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
stores = pd.read_csv("./stores.csv")
transactions = pd.read_csv("./transactions.csv")
h_days = pd.read_csv("./holidays_events.csv")
oil = pd.read_csv("./oil.csv")

In [3]:
train.head(5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [4]:
def merge(df, m_df, on = 'date'):
 df = pd.merge(how = 'left', on = on, left = df, right = m_df)

 return df

train = merge(train, oil, "date")
train = merge(train, h_days)
train = merge(train, stores, "store_nbr")
train = merge(train, transactions, ["store_nbr", "date"])

test = merge(test, oil, "date")
test = merge(test, h_days)
test = merge(test, stores, "store_nbr")
test = merge(test, transactions, ["store_nbr", "date"])

train = train.drop(['store_nbr'], axis = 1)
test = test.drop(['store_nbr'], axis = 1)

In [5]:

'2012-03-02' in h_days['date'].unique()

True

In [6]:
check = pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
               utc=True)

def split_date(df):
 all_hdays = h_days['date'].unique()

 df['is_holiday'] = df['date'].apply(lambda x: 1 if x in all_hdays else 0)
 
 df['date'] = pd.to_datetime(df['date'])

 df['day'] = df['date'].apply(lambda x: x.day)
 df['month'] = df['date'].apply(lambda x: x.month)
 df['year'] = df['date'].apply(lambda x: x.year)
 df['quarter'] = df['date'].apply(lambda x: x.quarter)

 df = df.drop(['date'], axis = 1)

 return df

train = split_date(train)
test = split_date(test)

In [7]:
def encode(df):
 le = LabelEncoder()
 cols_to_encode = ["family", "state", "type_holiday", "type_store", "locale", "locale_name", "transferred", "city"]

 for c in cols_to_encode:
  df[c] = le.fit_transform(df[c])
 
 return df

train = encode(train)
test = encode(test)

In [8]:
train["is_holiday"].unique()

array([1, 0], dtype=int64)

In [9]:
train = train.drop(["type_store"], axis = 1)
test = test.drop(["type_store"], axis = 1)

In [10]:
# Fill the missing data:

train.isnull().sum()

id                    0
family                0
sales                 0
onpromotion           0
dcoilwtico       955152
type_holiday          0
locale                0
locale_name           0
description     2551824
transferred           0
city                  0
state                 0
cluster               0
transactions     249117
is_holiday            0
day                   0
month                 0
year                  0
quarter               0
dtype: int64

In [11]:
# Let's predict the missing values (for the oil column)

def predict_oil(df):

 first_copy = df.copy()
 first_copy = first_copy.drop(['description', "transactions"], axis = 1)

 oil_test_data = first_copy[first_copy["dcoilwtico"].isnull()]
 second_copy = first_copy.copy()
 second_copy.dropna(inplace=True)
 
 oil_y_train = second_copy["dcoilwtico"]
 oil_x_train = second_copy.drop(['dcoilwtico'], axis = 1)
 oil_x_test = oil_test_data.drop(['dcoilwtico'], axis = 1)

 print(oil_y_train.shape)
 print(oil_x_train.shape)
 model = LinearRegression()
 model.fit(oil_x_train, oil_y_train)

 oil_y_test = model.predict(oil_x_test)

 oil_y_test = np.round(oil_y_test, decimals = 0)

 print(oil_y_test)

 indices = df[df["dcoilwtico"].isnull()].index

 for fill_index, dataframe_index in enumerate(indices):
  df.loc[dataframe_index, "dcoilwtico"] = oil_y_test[fill_index]

 return df

train = predict_oil(train)
test = predict_oil(test)

(2099196,)
(2099196, 16)
[102. 102. 102. ...  31.  31.  31.]
(21384,)
(21384, 15)
[48. 48. 48. ... 47. 47. 47.]


In [12]:
train.isna().sum()

id                    0
family                0
sales                 0
onpromotion           0
dcoilwtico            0
type_holiday          0
locale                0
locale_name           0
description     2551824
transferred           0
city                  0
state                 0
cluster               0
transactions     249117
is_holiday            0
day                   0
month                 0
year                  0
quarter               0
dtype: int64

In [13]:
train.isna().sum()

id                    0
family                0
sales                 0
onpromotion           0
dcoilwtico            0
type_holiday          0
locale                0
locale_name           0
description     2551824
transferred           0
city                  0
state                 0
cluster               0
transactions     249117
is_holiday            0
day                   0
month                 0
year                  0
quarter               0
dtype: int64

In [14]:
# Let's predict the missing values (for the oil column)

def predict_transactions(df):

 first_copy = df.copy()
 first_copy = first_copy.drop(['description', "dcoilwtico"], axis=1)

 oil_test_data = first_copy[first_copy["transactions"].isnull()]
 second_copy = first_copy.copy()
 second_copy.dropna(inplace=True)

 oil_y_train = second_copy["transactions"]
 oil_x_train = second_copy.drop(['transactions'], axis=1)
 oil_x_test = oil_test_data.drop(['transactions'], axis=1)

 print(oil_y_train.shape)
 print(oil_x_train.shape)
 model = LinearRegression()
 model.fit(oil_x_train, oil_y_train)

 oil_y_test = model.predict(oil_x_test)

 oil_y_test = np.round(oil_y_test, decimals=0)

 print(oil_y_test)

 indices = df[df["transactions"].isnull()].index

 for fill_index, dataframe_index in enumerate(indices):
  df.loc[dataframe_index, "transactions"] = oil_y_test[fill_index]

 return df


train = predict_transactions(train)

(2805231,)
(2805231, 16)
[2140. 2142. 2145. ... 1736. 1738. 1740.]


In [15]:
train = train.drop(['description'], axis = 1)
test = test.drop(['description'], axis = 1)

In [16]:
# Feature engineering

def f_eng(df):
 oil_mean_price = df['dcoilwtico'].describe()["mean"]
 df['after_paycheck'] = df['day'].apply(lambda x: 1 if (15 <= x <= 18) or (29 <= x <= 31) or (0 <= x <= 2) else 0)

 return df

train = f_eng(train)
test = f_eng(test)

In [17]:
train.corr().style.background_gradient("YlOrBr")


Unnamed: 0,id,family,sales,onpromotion,dcoilwtico,type_holiday,locale,locale_name,transferred,city,state,cluster,transactions,is_holiday,day,month,year,quarter,after_paycheck
id,1.0,1.1e-05,0.086102,0.206032,-0.880647,-0.056797,-0.049972,-0.053151,-0.047792,2.8e-05,7.8e-05,-4.4e-05,-0.023349,0.052877,0.004563,0.066683,0.977741,0.065949,-0.001084
family,1.1e-05,1.0,-0.113986,-0.047216,-0.000237,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.001815,0.0,0.0,0.0,0.0,0.0,0.0
sales,0.086102,-0.113986,1.0,0.428241,-0.083383,-0.01787,-0.01164,-0.013896,-0.01443,0.04951,0.068988,0.038537,0.2135,0.013964,-0.011972,0.02025,0.08132,0.018752,0.00686
onpromotion,0.206032,-0.047216,0.428241,1.0,-0.167526,-0.02036,-0.017057,-0.019368,-0.017828,0.00439,0.013109,0.005702,0.026224,0.018989,0.001107,0.0256,0.198913,0.02486,0.002311
dcoilwtico,-0.880647,-0.000237,-0.083383,-0.167526,1.0,0.060471,0.036532,0.057774,0.044715,-0.00019,-0.000259,-0.000141,0.016473,-0.047209,0.002754,0.005756,-0.874675,0.007028,-0.002356
type_holiday,-0.056797,-0.0,-0.01787,-0.02036,0.060471,1.0,0.897414,0.900716,0.944148,-0.0,0.0,-0.0,-0.071515,-0.945771,0.019774,-0.158295,-0.023633,-0.156238,0.07827
locale,-0.049972,-0.0,-0.01164,-0.017057,0.036532,0.897414,1.0,0.865778,0.960054,-0.0,0.0,-0.0,-0.038598,-0.963601,0.048625,-0.128111,-0.023697,-0.13465,0.097988
locale_name,-0.053151,-0.0,-0.013896,-0.019368,0.057774,0.900716,0.865778,1.0,0.922572,-0.0,0.0,0.0,-0.050014,-0.930441,0.034632,-0.088518,-0.034878,-0.09837,0.061258
transferred,-0.047792,-0.0,-0.01443,-0.017828,0.044715,0.944148,0.960054,0.922572,1.0,-0.0,0.0,0.0,-0.048957,-0.99526,0.058192,-0.126011,-0.022134,-0.131039,0.079031
city,2.8e-05,0.0,0.04951,0.00439,-0.00019,-0.0,-0.0,-0.0,-0.0,1.0,0.511076,0.093806,0.258081,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Before training model let's optimize it's memory usage

def reduce_mem_usage(df):
  numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
  start_mem = df.memory_usage().sum() / 1024**2

  for col in df.columns:
    col_type = df[col].dtypes

    if col_type in numerics:
      c_min = df[col].min()
      c_max = df[col].max()

      if str(col_type)[:3] == "int":
        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
         df[col] = df[col].astype(np.int8)
    
        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
         df[col] = df[col].astype(np.int16)
    
        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
         df[col] = df[col].astype(np.int32)
    
        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
         df[col] = df[col].astype(np.int64) 

      else:
        if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
          df[col] = df[col].astype(np.float16)
    
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
          df[col] = df[col].astype(np.float32)
    
        elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
          df[col] = df[col].axtype(np.float64)
 
  end_mem = df.memory_usage().sum() / 1024**2
 
  print("Memory usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(end_mem, 100 * (start_mem - end_mem) / start_mem))

  return df

train = reduce_mem_usage(train)

Memory usage decreased to 107.78 Mb (72.0% reduction)


In [22]:
# Prepare datasets for the model training

y = train['sales'].round()
X = train.drop(['sales'], axis = 1)

In [23]:
y.shape

(3054348,)

In [24]:
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6, n_estimators=10000, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters

    def lgb_eval(learning_rate, num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf, min_sum_hessian_in_leaf, subsample):
        params = {'application': 'binary', 'metric': 'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)

        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed,
                           stratified=True, verbose_eval=200, metrics=['auc'])
        return max(cv_result['auc-mean'])

    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin': (20, 90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf': (0, 100),
                                            'subsample': (0.01, 1.0)}, random_state=200)

    # n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
    # init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.

    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    model_auc = []
    for model in range(len(lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])

    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'], lgbBO.res[pd.Series(model_auc).idxmax()]['params']


opt_params = bayes_parameter_opt_lgb(
    X, y, init_round=5, opt_round=10, n_folds=3, random_seed=6, n_estimators=10000)


|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------




[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.687724 -> initscore=0.789501
[LightGBM] [I



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.687724 -> initscore=0.789501
[LightGBM] [I



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 1400366, number of negative: 635866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] Number of positive: 1400365, number of negative: 635867
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 2036232, number of used features: 18
[LightGBM] [Info] [binary:Bo

In [26]:
# Print out optimized parameters

opt_params[1]["num_leaves"] = int(round(opt_params[1]["num_leaves"]))
opt_params[1]["max_depth"] = int(round(opt_params[1]["max_depth"]))
opt_params[1]["min_data_in_leaf"] = int(round(opt_params[1]["min_data_in_leaf"]))
opt_params[1]["max_bin"] = int(round(opt_params[1]["max_bin"]))
opt_params[1]["objective"] = 'binary'
opt_params[1]["metruc"] = 'auc'
opt_params[1]['is_unbalance'] = True
opt_params[1]["boost_from_average"] = False
opt_params = opt_params[1]
opt_params

# b_frac: 0.820
# f_frac: 0.854
# rate: 0.83
# max_bin: 56
# max_depth 27
# min_data_in_leaf: 55
# min_sum_hessian_in_leaf: 45
# num_leaves: 62
# subsample: 0.425

{'bagging_fraction': 0.8192059420307759,
 'feature_fraction': 0.854792829599626,
 'learning_rate': 0.8277896457253541,
 'max_bin': 56,
 'max_depth': 27,
 'min_data_in_leaf': 55,
 'min_sum_hessian_in_leaf': 45.00845042447376,
 'num_leaves': 62,
 'subsample': 0.4252313651032907,
 'objective': 'binary',
 'metruc': 'auc',
 'is_unbalance': True,
 'boost_from_average': False}