# If you like my model, please upvote ⬆️⬆️⬆️

# Table of Contents
1. [Load UMP data](#Load-UMP-data)
2. [EDA](#EDA)
3. [Preprocess](#Preprocess)
    - [Drop Short Investments](#Drop-Short-Investments)
    - [Make TensorFlow Dataset](#Make-TensorFlow-Dataset)
4. [Model](#Model)
    - [Build the Model](#Build-the-Model)
    - [Train the Model](#Train-the-Model)
5. [Predict and Submit](#Predict-and-Submit)

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras import layers

# Load UMP data

The size of the original csv dataset is 18.55 GB.<br>
Thanks to [@Lonnie](https://www.kaggle.com/lonnieqin), we can load a much smaller [pickle of the dataset](https://www.kaggle.com/datasets/lonnieqin/ubiquant-market-prediction-half-precision-pickle) instead 📈 <br>

In [None]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
train.head()

# EDA

In [None]:
# Print a summary of the frame: column dtypes and memory usage.
train.info()

In [None]:
# Histogram of the target values over all rows.
target_values = train['target']
target_values.hist(bins=100, figsize=(20, 6));

In [None]:
# Histogram of how many rows (time_id entries) each investment_id has.
time_ids_per_investment = train.groupby(['investment_id'])['time_id'].count()
time_ids_per_investment.hist(bins=100, figsize=(16, 6));

In [None]:
# Histogram of how many rows (investment_id entries) each time_id has.
investments_per_time_id = train.groupby(['time_id'])['investment_id'].count()
investments_per_time_id.hist(bins=100, figsize=(20, 6));

# Preprocess

### Drop Short Investments
Let's remove some short investments, i.e. those with very few `time_id` rows. They appear to be less representative.<br>
Statistical methods like [IQR](https://towardsdatascience.com/why-1-5-in-iqr-method-of-outlier-detection-5d07fdc82097) don't catch all of the investments that may distort the results, so I have dropped the shortest 2% of them.

In [None]:
# Number of rows (time_id entries) recorded for each investment_id.
rows_per_investment = train.groupby(['investment_id'])['time_id'].count()

# Cut-off below which an investment counts as "short": the 2% quantile of the
# per-investment row counts. (Multiplying the *number of investments* by 0.02
# mixes units -- it compares a row count against a share of the investment
# count -- and does not select the shortest 2%.)
short_investments_count = rows_per_investment.quantile(0.02)
short_investment_ids = rows_per_investment[rows_per_investment < short_investments_count].index

# Row labels of every observation that belongs to a short investment.
is_short_row = train['investment_id'].isin(short_investment_ids)
short_investments = train.index[is_short_row]

# Actually remove those rows; without this step the selection has no effect.
# A boolean mask is used so the result is correct even if the index has
# duplicate labels.
train = train[~is_short_row]

### Make TensorFlow Dataset

In [None]:
# Detach the two id columns and the target from ``train``. ``pop`` removes each
# column from the frame in place, so ``train`` keeps only the remaining columns.
investment_id, time_id, y = (
    train.pop(column) for column in ('investment_id', 'time_id', 'target')
)

In [None]:
def make_dataset(investment_id, feature, time_id, y=None, batch_size=1024):
    """Build a batched, cached and prefetched ``tf.data.Dataset``.

    Args:
        investment_id: Per-row investment ids.
        feature: Per-row feature values.
        time_id: Per-row time ids.
        y: Optional per-row targets. When omitted, the dataset carries inputs
            only.
        batch_size: Number of rows per batch.

    Returns:
        A dataset whose elements are ``((investment_id, feature, time_id), y)``
        when ``y`` is given, and ``((investment_id, feature, time_id),)``
        otherwise. The input order matches the order in which the three
        arguments are passed.
    """
    inputs = (investment_id, feature, time_id)
    if y is not None:
        slices = (inputs, y)
    else:
        # ``((a, b, c))`` is just the 3-tuple ``(a, b, c)``, which Keras would
        # unpack as (x, y, sample_weight). Wrapping the inputs in a 1-tuple
        # keeps all three together as ``x``.
        slices = (inputs,)

    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
    return ds

In [None]:
# Training dataset: what is left of ``train`` is passed as the feature input,
# and ``y`` supplies the labels.
train_ds = make_dataset(investment_id, train, time_id, y=y)

# Model

### Build the Model

In [None]:
def get_model():
    """Create and compile the three-input regression network.

    Each input (investment id, the 300 features, time id) goes through its own
    ReLU ``Dense`` branch with L2 kernel regularisation. The branch outputs are
    concatenated and fed to a single linear ``Dense(1)`` unit.

    Returns:
        A ``tf.keras.Model`` named ``'model_with_time_id'``, compiled with the
        RMSprop optimizer and MSE loss, and reporting MSE, MAE and MAPE.
    """

    def relu_dense(units):
        # Hidden layer shared by all branches: ReLU + L2 kernel penalty.
        return layers.Dense(units, activation='relu', kernel_regularizer='l2')

    # Layers are created in the same order as before so that Keras assigns
    # the same automatic layer names. (Dropout after each branch is disabled.)

    # Investment-id branch.
    id_in = tf.keras.Input(shape=(1,), dtype=tf.uint16, name='investment_id')
    id_branch = relu_dense(64)(id_in)

    # Feature branch: two stacked hidden layers.
    feat_in = tf.keras.Input(shape=(300,), dtype=tf.float16, name='features')
    feat_branch = relu_dense(512)(feat_in)
    feat_branch = relu_dense(256)(feat_branch)

    # Time-id branch.
    time_in = tf.keras.Input(shape=(1,), dtype=tf.uint16, name='time_id')
    time_branch = relu_dense(64)(time_in)

    # Merge the branches and project to a single regression output.
    merged = layers.Concatenate(axis=-1)([id_branch, feat_branch, time_branch])
    prediction = layers.Dense(1, kernel_regularizer='l2')(merged)

    net = tf.keras.Model(
        inputs=[id_in, feat_in, time_in],
        outputs=prediction,
        name='model_with_time_id',
    )
    net.compile(optimizer='rmsprop', loss='mse', metrics=['mse', 'mae', 'mape'])
    return net

In [None]:
# Instantiate the compiled model and print its layer-by-layer summary.
model = get_model()
model.summary()

In [None]:
# Draw the model graph, including the shapes flowing through each layer.
tf.keras.utils.plot_model(model, show_shapes=True)

### Train the Model

In [None]:
# model.fit is called without validation data, so the callbacks' default
# monitor ('val_loss') is never reported. With that default, early stopping
# never triggers and save_best_only never writes a checkpoint. Monitor the
# training loss instead.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "model_ns.tf",
    monitor='loss',
    save_best_only=True,
    save_weights_only=True,
)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

In [None]:
# Train for up to 30 epochs with the early-stopping and checkpoint callbacks.
history = model.fit(
    train_ds,
    epochs=30,
    callbacks=[early_stop, checkpoint],
)

In [None]:
# Save the weights the model currently holds; the file name is derived from
# the model's name.
model.save_weights(f'ns_{model.name}.tf')

# Predict and Submit

In [None]:
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()

# Each iteration yields a batch of test rows plus the frame to fill in.
for test_df, sample_prediction_df in iter_test:
    # Take the part of row_id before the first '_' as the integer time id.
    # NOTE(review): assumes row_id is formatted '<time_id>_<...>' -- confirm.
    test_time_id = test_df['row_id'].str.split('_', expand=True)[0].astype(int)

    # Inputs are passed in the order of the model's input list. The previous
    # per-batch make_dataset(...) call built a dataset that was never used,
    # so it has been removed.
    model_inputs = [test_df['investment_id'], test_df[features], test_time_id]
    sample_prediction_df['target'] = model.predict(model_inputs)[:, 0]
    env.predict(sample_prediction_df)