In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import gc

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
print("scikit-learn version: {}". format(sklearn.__version__))

import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation, record_evaluation
print("LightGBM version:  {}".format(lgb.__version__))

from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import pickle

# Feature Selection with LightGBM
The Ubiquant competition's training data is huge. Besides using one of the reduced-size data sets, e.g. [this parquet version](http://www.kaggle.com/robikscube/ubiquant-parquet?select=train_low_mem.parquet) or [this pickle version](http://www.kaggle.com/lonnieqin/ubiquant-market-prediction-half-precision-pickle), feature reduction will be helpful in speeding up training. And faster training means faster iteration & more experiments you can run, right?

In this notebook I will show how to use LightGBM's built-in feature importance ranking to reduce the total number of features, and compare the training times before and after the reduction.

In [None]:
# Read the training data from the half-precision pickle data set (path is relative
# to the notebook's working directory).
# NOTE(review): pd.read_pickle unpickles the file, so only point it at a data set you trust.
df_train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')

In [None]:
# Quick look at the raw training frame: its dimensions, the column/dtype report
# and the first rows.
display(df_train.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only added a stray "None" line to the cell output.
df_train.info()
df_train.head()

In [None]:
# --- split configuration ---------------------------------------------------
first_time_id_to_use = 500               # rows with a smaller time_id are dropped
time_id_to_split_train_and_val = 1000    # time_id >= this value goes to validation
# use only the anonymised features (every column whose name starts with "f")
features_to_use = [name for name in df_train.columns if name.startswith("f")]

# --- drop the earliest time_ids --------------------------------------------
df_train = df_train.loc[df_train.time_id >= first_time_id_to_use]
print("df_train.shape: ",df_train.shape)

# --- time-based train / validation split -----------------------------------
train_rows = df_train.time_id < time_id_to_split_train_and_val
val_rows = df_train.time_id >= time_id_to_split_train_and_val

# labels first, then the feature matrices restricted to features_to_use
y_train = df_train.loc[train_rows, "target"]
y_val = df_train.loc[val_rows, "target"]
X_train = df_train.loc[train_rows, features_to_use]
X_val = df_train.loc[val_rows, features_to_use]

print("X_train.shape:  ", X_train.shape)
print("X_val.shape:    ", X_val.shape)
#print("Features used: ", list(X_train.columns))

In [None]:
# The full frame is not used again after the split above: drop the reference and
# run a garbage-collection pass so its memory can be reclaimed before training.
del df_train #free up memory
gc.collect()

In [None]:
# Wrap each split (feature matrix + label) in a LightGBM Dataset:
# dtrain is built from the training split, dval from the validation split.
dtrain, dval = (
    lgb.Dataset(split_features, label=split_labels)
    for split_features, split_labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# LightGBM training parameters, passed unchanged to lgb.train() below.
# These parameters are not tuned yet.
lgb_params = {
    # task and evaluation metric
    'objective': 'regression',
    'metric': 'MSE',
    'boosting_type': 'gbdt',
    # regularisation terms
    'lambda_l1': 2.3e-05,
    'lambda_l2': 0.1,
    # tree size
    'num_leaves': 4,
    # column / row subsampling
    'feature_fraction': 0.5,
    'bagging_fraction': 0.9,
    'bagging_freq': 7,
    'min_child_samples': 20,
    # upper bound on boosting rounds (early stopping is configured in the training call)
    'num_iterations': 1000,
}

In [None]:
# Train the baseline model on all selected features and measure how long it takes.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; a difference of time.time() values can be distorted when the system
# clock is adjusted while training runs.
ts = time.perf_counter()

metric_over_time = {}  # dict for logging the evaluation metrics (filled by record_evaluation)

model = lgb.train(
    lgb_params,
    dtrain,
    # evaluate on both splits so train and validation curves can be plotted later
    valid_sets=[dtrain, dval],
    valid_names=['train', 'val'],
    callbacks=[
        early_stopping(100),
        log_evaluation(100),
        record_evaluation(metric_over_time),
    ],
)

execution_time = time.perf_counter() - ts
print(f"\nTraining time: {round(execution_time, 3)}s")

In [None]:
# Predict on the validation split with the baseline model.
y_val_hat = model.predict(X_val)

# using MSE as a proxy for pearson correlation (https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302181)
# The `squared` keyword of mean_squared_error is deprecated in scikit-learn 1.4 and
# removed in 1.6. MSE is the function's default result and RMSE is its square root,
# so computing it this way works on old and new scikit-learn versions alike.
mse = mean_squared_error(y_val, y_val_hat)
print("MSE:  ", mse)
print("RMSE: ", np.sqrt(mse))

In [None]:
# check the competition metric
# NOTE(review): this is a single Pearson correlation over the whole validation split.
# If the official score is averaged per time_id (to be confirmed against the
# competition rules), this number is only an approximation of it.
corr, _ = pearsonr(y_val_hat, y_val)  # second return value (the p-value) is not used
print("Pearson Correlation Coeficient Validation Data: ", corr)

Here we can see how the metric improves as the training goes on. The mean squared error, also called square loss or l2, is decreasing steadily.

In [None]:
# Plot the per-iteration metric values that record_evaluation stored in
# metric_over_time during the training run above.
lgb.plot_metric(metric_over_time, figsize=(10,5))
plt.show()

LightGBM has a nice built-in function for plotting the feature importance. Feature importance can be displayed as "gain", showing the total gain of the splits which use the feature, or as "split", showing the number of times the feature is used in a model.

In the graph below, we can see that there are features that are of little importance to LightGBM. So I will retrain without them.

In [None]:
# let's look at which features lgbm deems important
# importance_type can be 'gain' or 'split'; 'gain' is used here.
# NOTE(review): the original remark "V7 has 'split'" presumably means that version 7
# of this notebook plotted 'split' importance instead -- confirm.
# max_num_features=300 limits the plot to at most 300 bars; the tall figsize keeps
# the feature labels readable.
lgb.plot_importance(model, figsize=(10,40), importance_type='gain', max_num_features=300)
plt.show()

In [None]:
# there are features with little importance, get rid of them

# Minimum total gain a feature must exceed to be kept. Named instead of hard-coded
# so the cut-off is easy to find and tune.
min_gain_to_keep = 100

# One row per feature, sorted by total gain (largest first).
imp = (
    pd.DataFrame({'Value': model.feature_importance(importance_type='gain'),
                  'Feature': X_train.columns})
    .sort_values(by="Value", ascending=False)
    .reset_index(drop=True)
)

#imp.Value.value_counts()
# keep only the features whose gain is strictly greater than the threshold
imp = imp[imp.Value > min_gain_to_keep]
new_feature_list = list(imp.Feature)

# Fail here with a clear message instead of later, when the reduced datasets are
# built and trained on an empty column list.
assert new_feature_list, (
    "No feature has a gain above min_gain_to_keep; lower the threshold before retraining."
)
print("Number of features, new: ", len(new_feature_list))

In [None]:
# save model to disk, it will take up approx. 263kB
filename = 'finalized_model.sav'
# The with-block closes (and thereby flushes) the file handle even if pickling
# raises; the bare open() call used before left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Drop the baseline model, its validation predictions, the full-feature Datasets and
# the importance table; model, y_val_hat, dtrain and dval are re-created below for
# the reduced feature set.
del model,y_val_hat,dtrain,dval,imp # free up memory

Now it's time to retrain with the reduced feature set.

In [None]:
# Rebuild the LightGBM datasets using only the selected columns.
dtrain = lgb.Dataset(X_train[new_feature_list], label=y_train)
dval = lgb.Dataset(X_val[new_feature_list], label=y_val)

# Separate history dict for this run: record_evaluation deletes any existing
# contents of the dict it is given, so passing metric_over_time again would erase
# the baseline curves recorded by the first training run.
metric_over_time_reduced = {}

# perf_counter() is monotonic, so the measured duration is not affected by system
# clock adjustments (unlike a difference of time.time() values).
ts = time.perf_counter()

model = lgb.train(
    lgb_params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'val'],
    callbacks=[
        early_stopping(100),
        log_evaluation(100),
        record_evaluation(metric_over_time_reduced),
    ],
)

execution_time = time.perf_counter() - ts
print(f"\nTraining time: {round(execution_time, 3)}s")

In [None]:
# before feature reduction (all features; value noted from an earlier run):
# Training time: 176.065s !!!!

In [None]:
# Predict on the validation split, restricted to the columns the reduced model was trained on.
y_val_hat = model.predict(X_val[new_feature_list])

# using MSE as a proxy for pearson correlation (https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302181)
# The `squared` keyword of mean_squared_error is deprecated in scikit-learn 1.4 and
# removed in 1.6. MSE is the function's default result and RMSE is its square root,
# so computing it this way works on old and new scikit-learn versions alike.
mse = mean_squared_error(y_val, y_val_hat)
print("MSE:  ", mse)
print("RMSE: ", np.sqrt(mse))

In [None]:
# before feature reduction (all features; values noted from an earlier run):
# MSE:   0.8055005324507704
# RMSE:  0.8974968147301529

In [None]:
# Pearson correlation between the reduced model's validation predictions and the targets.
# NOTE(review): computed over the whole validation split at once; if the official
# score is averaged per time_id (to be confirmed), this is only an approximation.
corr, _ = pearsonr(y_val_hat, y_val)  # second return value (the p-value) is not used
print("Pearson Correlation Coeficient Validation Data: ", corr)

In [None]:
# before feature reduction (all features; value noted from an earlier run):
# Pearson Correlation Coeficient Validation Data:  0.12600954147083407

So while MSE, RMSE and the Pearson correlation coefficient are comparable between LightGBM using all 300 features and LightGBM using only the more important features, the training time is greatly reduced! 🥳

In [None]:
# save model to disk, it will take up approx. 205kB
filename = 'finalized_model_reduced.sav'
# The with-block closes (and thereby flushes) the file handle even if pickling
# raises; the bare open() call used before left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Submit
I make the submissions using the second model.

For more details on the submission process, you can check my other notebook: [Understanding the submission API - for newbies](http://www.kaggle.com/melanie7744/understanding-the-submission-api-for-newbies).

In [None]:
def preprocess(df, features):
    """Return ``df`` restricted to the columns listed in ``features``.

    The selection is done with ``df[features]``: the result holds the requested
    columns in the order given by ``features`` and the frame passed in is not modified.
    """
    return df[features]
    
def make_predictions(model, df):
    """Return the result of ``model.predict(df)``.

    Using a function here really only makes sense if you use multiple models for
    prediction and average their results.
    """
    return model.predict(df)

In [None]:
# Submission loop: for every batch handed out by the competition API, keep only the
# selected feature columns, predict with the current `model` and register the result.
import ubiquant  # competition submission API module
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    # restrict the batch to the columns in new_feature_list (the reduced feature set)
    test_df = preprocess(test_df, new_feature_list) 
    sample_prediction_df['target'] = make_predictions(model, test_df)  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions
 