In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
print("scikit-learn version: {}". format(sklearn.__version__))

import os
# Print at most the first 15 files of every directory below the Kaggle input folder,
# so that large datasets do not flood the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames[:15]:
        print(os.path.join(dirname, filename))

# Submitting via API - what??? 🤔
This is my first competition using an API for submitting. I spent quite some time understanding the process. In this notebook I share my findings in order to save you time. If you like it, please upvote.


I want to show you how to:
- make real predictions 
- handle the case when you created or removed some features
- make predictions with a model trained locally or in another Kaggle notebook
- make weighted predictions with multiple models trained with different features

For speed reasons I use the parquet format and reduce the number of columns and rows. To use the parquet data, you will have to add this data source (on the right side: Data -> Add data -> Datasets, then search for Ubiquant). While you are there, don't forget to disable Internet.
Please check out this [great post](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/301724) from Rob Mulla for more information on the parquet format.

## Reading the training data

In [None]:
%%time
# Optional column subset for a faster, lighter read. Feature columns follow the 'f_<n>'
# naming used by the other entries, and 'target' has to be part of the subset because
# the training cell reads df_train.target.
col_subset = ['time_id','investment_id','target','f_1','f_2','f_23','f_145','f_153','f_225','f_231'] #reduce the number of columns to read
first_time_id_to_use = 1000 # reduce the number of rows

# The subset read stays disabled: the loaded models further down were trained with all
# anonymized features, so the full frame is required for them.
#df_train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet',columns=col_subset)
df_train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
# keep only the rows from first_time_id_to_use onwards
df_train = df_train.loc[df_train.time_id >= first_time_id_to_use]

In [None]:
# Show the (rows, columns) pair of the filtered training frame, then preview its first rows.
display((len(df_train.index), len(df_train.columns)))
df_train.head(n=5)

## Training or loading a model
Let's just train a simple model to get some predictions. No validation, nothing. It's supposed to be about the prediction process after all.

In [None]:
# Columns fed to the first model - deliberately only a handful of the available ones.
features_model_1 = [
    'time_id',
    'investment_id',
    'f_145',
    'f_153',
    'f_225',
    'f_231',
]

# Design matrix and regression target taken from the training frame.
X = df_train.loc[:, features_model_1]
print(X.columns)
y = df_train["target"]

# Plain least-squares fit; fit() returns the estimator itself, so the call can be chained.
model_1 = LinearRegression().fit(X, y)

If you have your model already trained locally or in another Kaggle notebook you can load it. If it was trained in another Kaggle notebook, just add the training notebook to your working notebook (on the right side: Data -> Add data -> Notebook Output Files). Now all output from the training notebook is available to be loaded in this notebook.

Adding a LightGBM:


In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code - only load model files that you created
# yourself or that come from a source you trust.
filename = '../input/k/melanie7744/using-lightgbm-for-feature-selection/finalized_model.sav'
with open(filename, 'rb') as model_file:  # the context manager closes the file handle again
    loaded_lgbm = pickle.load(model_file)

features_loaded_lgbm = [col for col in df_train.columns if col.startswith("f")] # this model was trained with all anonymized features
preds = loaded_lgbm.predict(df_train[features_loaded_lgbm]) # this is just a sanity check, the predictions have to be made via the API
print("df_train.shape: ", df_train.shape, "Number of predictions: ", len(preds))

Adding a neural network (tensorflow/keras):

In [None]:
from keras.models import load_model

# Restore the network that was saved as an HDF5 file.
filename = '../input/ubq-dnn/model.h5'
loaded_NN = load_model(filename)

# The network was trained with all anonymized features, i.e. every column whose name starts with "f".
features_loaded_NN = list(filter(lambda col: col.startswith("f"), df_train.columns))

# Sanity check only: real predictions have to be made via the API.
nn_input = df_train[features_loaded_NN]
preds = loaded_NN.predict(nn_input)
print("df_train.shape: ", df_train.shape, "Number of predictions: ", len(preds))

## The example test data
Now, let's look at the example test data frame. Pay attention to the columns. It's important to note that it has data for **four time_ids**.

In [None]:
# Load the example test frame and let the notebook render it as the cell output.
test_df = pd.read_parquet(path='../input/ubiquant-parquet/example_test.parquet')
test_df

## Submission via API

Finally it's time to have a closer look at the API that is used for submission.

Pay attention to the columns of test_df. If you created or removed features (I did both for this demo) you need to do the same on test_df **within the for loop** and then call your model to predict.

If you look at the output below, you can see that the test data is split by time_id and each time_id is given to your model separately. So your model will be called as many times as there are time_ids in the test data. This process makes sure that the model cannot use any data from future time_ids.

Edit: 
- for Version 7 of this notebook, I created functions to preprocess the test data and make the predictions. This makes the code below cleaner. 
- Version 8 uses a loaded model to predict
- in Version 9 multiple models, that have been trained with different features, are used. The prediction is averaged. For a simpler approach go back to version 7 or 8 of this notebook.
- Version 10 includes an updated function for weighted predictions, i.e. assigning different weights to the predictions from different models
- Version 11 has two loaded models

In [None]:
def preprocess(df, features):
    """Return the columns of ``df`` that one model expects, re-creating ``time_id`` on demand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from. When ``'time_id'`` is requested it must hold a string
        column ``row_id``; the part in front of the first underscore is parsed as the
        time_id (presumably row_id looks like ``<time_id>_<investment_id>`` - verify
        against the data delivered by the API).
    features : list of str
        Names of the columns to return, in the order the model was trained with.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to ``features``. The frame passed in is not modified.

    Raises
    ------
    KeyError
        If a requested column (or ``row_id`` when ``time_id`` is requested) is missing.
    """
    if 'time_id' in features:
        # Only derive time_id when a model actually asks for it. assign() returns a new
        # frame, so the caller's frame does not silently gain an extra column.
        time_id = df['row_id'].str.split("_", expand=True)[0].astype("int16")
        df = df.assign(time_id=time_id)
    return df[features]
    
    
def make_predictions(models, features, weights, df):
    """Blend the predictions of several models into a single weighted average.

    Parameters
    ----------
    models : list
        Fitted models; each one must offer ``predict``.
    features : list of list of str
        ``features[i]`` holds the column names that ``models[i]`` expects.
    weights : list of numbers
        ``weights[i]`` is the relative weight of ``models[i]``; the weights are
        normalised by their sum.
    df : pandas.DataFrame
        Data to predict on; it is run through ``preprocess`` once per model.

    Returns
    -------
    The sum over all models of ``prediction * weight / sum(weights)``.
    """
    weight_total = sum(weights)
    weighted_parts = []  # one weighted prediction per model

    for idx, estimator in enumerate(models):
        model_input = preprocess(df, features[idx])  # columns matching this model
        raw = estimator.predict(model_input)
        # Some models (e.g. a neural network) answer with an (n_rows, 1) column
        # instead of a flat vector; bring those into the flat layout.
        is_column_vector = raw.shape == (df.shape[0], 1)
        contribution = raw.flatten() if is_column_vector else raw
        weighted_parts.append(contribution * weights[idx] / weight_total)

    # add the weighted contributions of all models
    return np.sum(weighted_parts, axis=0)

In [None]:
# The three lists below are matched by position: model i is fed my_features[i]
# and its prediction is weighted with my_weights[i].
my_models = [model_1, loaded_lgbm, loaded_NN] # list the models to be used
my_features = [features_model_1, features_loaded_lgbm, features_loaded_NN]    # list the MATCHING features 
my_weights = [1, 2, 2] # how much weight is given to each models prediction

# some checks before calling the ubiquant API
assert len(my_models) == len(my_features), "The number of models must match the number of feature sets."
assert len(my_weights) == len(my_models), "For each model there need to be corresponding weights."
# make_predictions divides by the sum of the weights, so a zero sum would yield inf/NaN predictions
assert sum(my_weights) != 0, "The weights must not sum to zero."

In [None]:
# Submission loop: fetch the test data batch by batch from the competition API,
# predict with the weighted model blend and hand the predictions back to the API.
# NOTE(review): the prose of this notebook states that each batch holds one time_id - confirm against the API docs.
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
# NOTE: the loop variable test_df rebinds the example test frame loaded in an earlier cell.
for (test_df, sample_prediction_df) in iter_test:
    
    # show what the API delivered for this batch (first rows and shape)
    print("test_df as loaded by the API\n") 
    display(test_df.head(), test_df.shape)
    
    # fill the 'target' column of the sample submission with the blended predictions
    sample_prediction_df['target'] = make_predictions(my_models, my_features, my_weights, test_df) # using a custom function for preprocessing and predicting
    env.predict(sample_prediction_df)   # register your predictions
    
    # show the predictions that were just registered
    print("Predictions for this time_id\n")
    display(sample_prediction_df)
    print("-----------time_id finished-----------\n\n")

Now you can continue with the normal submission process.