# **✔ Data Loading & Import**

In [None]:
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from scipy import stats
from pathlib import Path
import seaborn as sns

**Reading as Parquet Low Memory (Fast & Low Mem Use)**
- https://www.kaggle.com/robikscube/fast-data-loading-and-low-mem-with-parquet-files

In [None]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None" in the output.
train.info()
display(train.head())

In [None]:
def _downcast_column(series):
    """Return `series` stored in a narrower dtype when its values allow it.

    Rules:
      * object columns are converted to 'category';
      * NumPy integer columns (signed or unsigned) are narrowed to the first of
        int8 / int16 / int32 that is strictly smaller than the current dtype and
        whose inclusive range contains [min, max];
      * NumPy float columns are narrowed the same way to float16 / float32;
      * every other dtype (bool, datetime, category, pandas extension dtypes)
        is returned unchanged.

    A column is never widened and integers are never turned into floats, so
    integer identifiers keep their exact values.
    NOTE(review): narrowing floats to float16 is lossy (about 3 significant
    decimal digits); it is kept because the original cell did the same.
    """
    dtype = series.dtype

    if dtype == object:
        return series.astype('category')

    # Extension dtypes are not NumPy dtypes; leaving them alone also makes the
    # cell safe to re-run after object columns have become 'category'.
    if not isinstance(dtype, np.dtype):
        return series

    if np.issubdtype(dtype, np.integer):
        candidates, limits = (np.int8, np.int16, np.int32), np.iinfo
    elif np.issubdtype(dtype, np.floating):
        candidates, limits = (np.float16, np.float32), np.finfo
    else:
        return series

    # min()/max() skip NaN; for an empty or all-NaN column they are NaN, every
    # comparison below is False and the column is returned unchanged.
    lowest, highest = series.min(), series.max()
    for candidate in candidates:
        # Candidates are ordered by size: stop once they are no longer smaller.
        if np.dtype(candidate).itemsize >= dtype.itemsize:
            break
        bounds = limits(candidate)
        # Inclusive bounds: a value equal to the dtype limit still fits.
        if bounds.min <= lowest and highest <= bounds.max:
            return series.astype(candidate)
    return series


start_mem = train.memory_usage().sum() / 1024**2

for col in train.columns:
    train[col] = _downcast_column(train[col])

end_mem = train.memory_usage().sum() / 1024**2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

# **🌠 Simple Insight**

Thanks to https://www.kaggle.com/lucamassaron/eda-target-analysis

In [None]:
# Number of non-null target rows available for each investment_id.
obs_by_asset = train.groupby('investment_id')['target'].count()

# Histogram of those per-asset counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
obs_by_asset.plot.hist(bins=60, ax=ax)
ax.set_title("target by asset distribution")
plt.show()

In [None]:
# Average target of each investment_id, and the average of those averages
# (mean_mean_target is reused by the target-over-time plot further down).
mean_target = train.groupby('investment_id')['target'].mean()
mean_mean_target = np.mean(mean_target)

# Histogram of the per-asset mean target.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
mean_target.plot.hist(bins=60, ax=ax)
ax.set_title("mean target distribution")
plt.show()

In [None]:
# Standard deviation of the target within each investment_id.
std_target = train.groupby('investment_id')['target'].std()
# Despite its name, this is the MEAN of the per-asset standard deviations.
std_std_target = np.mean(std_target)

# Histogram of the per-asset target standard deviation.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
std_target.plot.hist(bins=60, ax=ax)
ax.set_title("std target distribution")
plt.show()

In [None]:
# Mean and standard deviation of the target across all rows sharing a time_id.
target_by_time = train.groupby(['time_id'])['target']
time2target_mean = target_by_time.mean()
time2target_std = target_by_time.std()

_, axes = plt.subplots(1, 1, figsize=(24, 12))
# Shaded band: one standard deviation either side of the per-time mean.
axes.fill_between(
    time2target_mean.index,
    time2target_mean - time2target_std,
    time2target_mean + time2target_std,
    alpha=0.1,
    color="b",
    label="mean ± 1 std",
)
axes.plot(
    time2target_mean.index,
    time2target_mean,
    "o-",
    color="b",
    label="mean target per time_id",
)
# Reference line: mean of the per-investment mean targets computed earlier.
axes.axhline(
    y=mean_mean_target,
    color='r',
    linestyle='--',
    label="mean of per-investment means",
)
axes.set_ylabel("target")
axes.set_xlabel("time")
# The labels above are only shown once a legend is drawn.
axes.legend()
plt.show()

In [None]:
# Regression joint plot: observations per asset (x) against that asset's mean
# target (y). sns.jointplot returns a JointGrid, whose central axes is ax_joint.
joint_grid = sns.jointplot(
    x=obs_by_asset,
    y=mean_target,
    kind="reg",
    height=8,
    joint_kws=dict(line_kws=dict(color='red')),
)
central_axes = joint_grid.ax_joint
central_axes.set_xlabel('observations')
central_axes.set_ylabel('mean target')
plt.show()

Target values at a given time_id are highly volatile.  
The fewer observations an asset has, the more uncertain its mean target is.  
**Hypothesis: each time_id has its own rules or patterns.**



" **Strategy**: now your CV strategy should be clear: you have to do GroupKFold on the time_id,  
keeping all the assets relative to a time_id either in train or in validation "  - Luca Massaron (https://www.kaggle.com/lucamassaron)

# **👘 Pipeline & Modeling(LGBM)**

Thanks to https://www.kaggle.com/lonnieqin/ubiquant-market-prediction-with-dnn

In [None]:
# Remove the row identifier, then move the fold-grouping key (time_id) and the
# label (target) out of `train`; whatever columns remain are the model inputs.
# All three steps mutate `train` in place, so this cell can only be run once per load.
train.drop(columns=['row_id'], inplace=True)
time = train.pop('time_id')
y = train.pop('target')

for preview in (train.head(), y.head()):
    display(preview)

# **🏃‍♂️ Learning**

In [None]:
%%time
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
n_splits = 5
GKF = GroupKFold(n_splits = 3)
params = {'learning_rate': 0.01, 
          'max_depth': 5, 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 144}

models = []
for index, (train_indices, valid_indices) in enumerate(GKF.split(train, y, groups=time)):
    X_train, X_val = train.iloc[train_indices], train.iloc[valid_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[valid_indices]
    train_ds = lgb.Dataset(X_train, label = y_train)
    valid_ds = lgb.Dataset(X_val, label = y_val)
    checkpoint = keras.callbacks.ModelCheckpoint(f"model_{index}", save_best_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    print('complete 1')
    model = lgb.train(params, train_ds, 200, valid_ds)
    models.append(model)
    print('complete 2')
    pearson_score = stats.pearsonr(model.predict(X_val).ravel(), y_val.values)[0]
    print('Pearson:', pearson_score)
    del X_train
    del X_val
    del y_train
    del y_val
    del train_ds
    del valid_ds
    gc.collect()
    break

# **👋 Submission**

In [None]:
def inference(models, ds):
    """Average the predictions of several fitted models.

    Args:
        models: iterable of objects exposing ``predict(ds)``.
        ds: the input handed unchanged to every model's ``predict``.

    Returns:
        The element-wise mean (``np.mean(..., axis=0)``) of the per-model
        predictions.
    """
    per_model_predictions = [fitted.predict(ds) for fitted in models]
    return np.mean(per_model_predictions, axis=0)

In [None]:
import ubiquant

# Columns the models were fitted on, in training order. Selecting them
# explicitly keeps the prediction input aligned with training even if the
# served test frame carries extra columns (e.g. row_id) or a different order.
feature_cols = list(train.columns)

env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df['target'] = inference(models, test_df[feature_cols])
    env.predict(sample_prediction_df)