## import library

In [1]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from scipy import stats

## Import dataset

In [3]:
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
train = pd.read_pickle(r"C:\Users\USER\Desktop\ubiquant-market-prediction\train.pkl")
train.head()

Unnamed: 0,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,f_6,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0.0,1.0,-0.300781,0.932573,0.113691,-0.402206,0.378386,-0.203938,-0.413469,0.965623,...,0.366028,-1.09562,0.200075,0.819155,0.941183,-0.086764,-1.087009,-1.044826,-0.287605,0.321566
1,0.0,2.0,-0.231079,0.810802,-0.514115,0.742368,-0.616673,-0.194255,1.77121,1.428127,...,-0.154193,0.912726,-0.734579,0.819155,0.941183,-0.387617,-1.087009,-0.929529,-0.97406,-0.343624
2,0.0,6.0,0.568848,0.393974,0.615937,0.567806,-0.607963,0.068883,-1.083155,0.979656,...,-0.13802,0.912726,-0.551904,-1.220772,-1.060166,-0.219097,-1.087009,-0.612428,-0.113944,0.243608
3,0.0,7.0,-1.064453,-2.343535,-0.01187,1.874606,-0.606346,-0.586827,-0.815737,0.778096,...,0.382201,0.912726,-0.266359,-1.220772,0.941183,-0.609113,0.104928,-0.783423,1.15173,-0.773309
4,0.0,8.0,-0.531738,0.842057,-0.262993,2.33003,-0.583422,-0.618392,-0.742814,-0.946789,...,-0.170365,0.912726,-0.741355,-1.220772,0.941183,-0.588445,0.104928,0.753279,1.345611,-0.737624


In [4]:
investment_id = train.pop("investment_id")
investment_id.head()

print(investment_id)

_ = train.pop("time_id")

y = train.pop("target")
y.head()

0             1.0
1             2.0
2             6.0
3             7.0
4             8.0
            ...  
3141405    3768.0
3141406    3768.0
3141407    3770.0
3141408    3772.0
3141409    3772.0
Name: investment_id, Length: 3141410, dtype: float16


0   -0.300781
1   -0.231079
2    0.568848
3   -1.064453
4   -0.531738
Name: target, dtype: float16

## custom loss function (ccc)

In [2]:
def corr_based_loss(pred_y, true_y):
    pred_mean = tf.math.reduced_mean(pred_y)
    true_mean = tf.math.reduced_mean(true_y)

    pred_std = tf.math.reduced_std(pred_y)
    true_std = tf.math.reduced_std(true_y)

    corr_ = tf.math.reduced_sum((pred_y-pred_mean)*(true_y-true_mean))/(len(pred_y)-1)
    ccc = (2*corr_*pred_std*true_std)/(pred_std**2+true_std**2+(pred_mean-true_mean)**2)
    return ccc
    

## Create a IntegerLookup layer for investment_id input

In [5]:
investment_ids = list(investment_id.unique())
investment_id_size = len(investment_ids) + 1
print(investment_id_size)
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
## integerlookup은 
investment_id_lookup_layer.adapt(pd.DataFrame({"investment_ids":investment_ids})) 
## 일부 전처리 레이어는 훈련데이터의 셈플을 기반으로 계산해야하는 내부 상태가 있음.
## 이러한 전처리 레이어는 '훈련불가능'하기 때문에 훈련중 설정이 되지않음. -> 훈련전에 설정해야함
## 이 단계를 적응(adaptation)이라고 함.


2789


In [7]:
import tensorflow as tf

def preprocess(X, y):
    return X, y
def make_dataset(feature, investment_id, y, batch_size=1024, mode="train"):
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        ds = ds.shuffle(4096) ## 완벽한 셔플링을 위해서는 셔플링의 크기와 버퍼의 크기가 같아야함
    ds = ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
    return ds

In [28]:
def get_model():
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((300, ), dtype=tf.float16)
    
    ## Embedding()을 사용하기 위해서는 입력 될 각 단어들은 모두 정수 인덱싱이 되어 있어야 합니다.
    # this code uses variables embedding method because one-hot encoding has many computational costs 

    investment_id_x = investment_id_lookup_layer(investment_id_inputs)
    investment_id_x = layers.Embedding(investment_id_size, 32, input_length=1)(investment_id_x)
    investment_id_x = layers.Reshape((-1, ))(investment_id_x)
    investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
    investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
    investment_id_x = layers.Dense(64, activation='swish')(investment_id_x)
    
    feature_x = layers.Dense(256, activation='swish')(features_inputs)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    feature_x = layers.Dense(256, activation='swish')(feature_x)
    
    x = layers.Concatenate(axis=1)([investment_id_x, feature_x])
    x = layers.Dense(512, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(128, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(32, activation='swish', kernel_regularizer="l2")(x)
    output = layers.Dense(1)(x)
    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    corr_loss = 
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(0.001), loss='mse', metrics=['mse', "mae", "mape", rmse])
    return model

In [24]:
model = get_model()
model.summary()
keras.utils.plot_model(model, show_shapes=True)

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_17 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 integer_lookup (IntegerLookup)  (None, 1)           0           ['input_17[0][0]']               
                                                                                                  
 embedding_7 (Embedding)        (None, 1, 32)        89248       ['integer_lookup[7][0]']         
                                                                                                  
 reshape_7 (Reshape)            (None, 32)           0           ['embedding_7[0][0]']            
                                                                                            

In [29]:
%%time
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(5, shuffle=True, random_state=42)
models = []
for index, (train_indices, valid_indices) in enumerate(kfold.split(train, investment_id)):
    ## 
    X_train, X_val = train.iloc[train_indices], train.iloc[valid_indices]
    print("X_train : \n {}".format(X_train))
    print("X_val : \n {}".format(X_val))
    print("investment_id : \n {}".format(investment_id))
    investment_id_train = investment_id[train_indices]
    print("investment_id_train : \n {}".format(investment_id_train))

    y_train, y_val = y.iloc[train_indices], y.iloc[valid_indices]
    print("y_train : \n {}".format(y_train))
    print("y_val : \n {}".format(y_val))
 
    investment_id_val = investment_id[valid_indices]
    print("investment_id_val : \n {}".format(investment_id_val))

    train_ds = make_dataset(X_train, investment_id_train, y_train)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, mode="valid")
    
    model = get_model()
    checkpoint = keras.callbacks.ModelCheckpoint(f"model_{index}", save_best_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    history = model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint, early_stop])
    
    model = keras.models.load_model(f"model_{index}") ## this notebook uses k-fold cross validation and the number of this model is 5
    models.append(model)
    
    pearson_score = stats.pearsonr(model.predict(valid_ds).ravel(), y_val.values)[0]
    print('Pearson:', pearson_score)
    pd.DataFrame(history.history, columns=["mse", "val_mse"]).plot()
    plt.title("MSE")
    plt.show()
    pd.DataFrame(history.history, columns=["mae", "val_mae"]).plot()
    plt.title("MAE")
    plt.show()
    pd.DataFrame(history.history, columns=["rmse", "val_rmse"]).plot()
    plt.title("RMSE")
    plt.show()
    del investment_id_train
    del investment_id_val
    del X_train
    del X_val
    del y_train
    del y_val
    del train_ds
    del valid_ds
    gc.collect()
    break



X_train : 	               f_0       f_1       f_2       f_3       f_4       f_5       f_6  \
0        0.932573  0.113691 -0.402206  0.378386 -0.203938 -0.413469  0.965623   
1        0.810802 -0.514115  0.742368 -0.616673 -0.194255  1.771210  1.428127   
2        0.393974  0.615937  0.567806 -0.607963  0.068883 -1.083155  0.979656   
3       -2.343535 -0.011870  1.874606 -0.606346 -0.586827 -0.815737  0.778096   
6       -1.863797  0.113691  1.573864 -0.598433 -0.569936  0.398784  0.054528   
...           ...       ...       ...       ...       ...       ...       ...   
3141404  0.892171 -1.760851  0.135189 -0.405799 -0.214687  0.142001  1.134768   
3141405  0.093530 -0.720275 -0.345497 -0.438781 -0.166972 -0.437182  1.475746   
3141406 -1.344935 -0.199987 -0.107702 -0.454677 -0.221914 -0.141174 -1.498235   
3141408 -2.565332  0.320301  0.076600  1.380182 -0.155366 -0.689000  0.381069   
3141409 -0.089557  0.190229 -0.548256  0.151205  0.079773  0.447962  1.014983   

              f

## submission

In [None]:
def preprocess_test(investment_id, feature):
    return (investment_id, feature), 0
def make_test_dataset(feature, investment_id, batch_size=1024):
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature)))
    ds = ds.map(preprocess_test)
    ds = ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
    return ds
def inference(models, ds):
    y_preds = []
    for model in models:
        y_pred = model.predict(ds)
        y_preds.append(y_pred)
    return np.mean(y_preds, axis=0)

: 

In [None]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 
for (test_df, sample_prediction_df) in iter_test:
    ds = make_test_dataset(test_df[features], test_df["investment_id"])
    sample_prediction_df['target'] = inference(models, ds)
    env.predict(sample_prediction_df)