# **✔ Data Loading & Import**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns

This notebook uses a dataset in which train.csv has been converted to a pickle file, and includes the code used for that conversion.
Loading all the data takes less than a minute, and the data size is reduced from 18.5GB to 3.6GB.

dataset URL : [https://www.kaggle.com/columbia2131/ump-train-picklefile](https://www.kaggle.com/columbia2131/ump-train-picklefile)

In [None]:
def transform_csv2pickle(path, usecols, dtype):
    """Read selected CSV columns with explicit dtypes and save them as ``train.pkl``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.
    usecols : list of str
        Names of the columns to load.
    dtype : dict
        Mapping of column name -> dtype, forwarded to ``pd.read_csv``.

    Returns
    -------
    None
        The frame is written to ``train.pkl`` in the current working directory.
    """
    train = pd.read_csv(
        path,
        usecols=usecols,
        # Use the argument. The previous code read the module-level ``dtypes``
        # dict, which ignored the caller's mapping and raised NameError
        # whenever that global did not exist.
        dtype=dtype,
    )
    train.to_pickle('train.pkl')


# Location of the raw competition CSV.
path = '../input/ubiquant-market-prediction/train.csv'

# Identifier / label columns, followed by the feature columns f_0 ... f_299.
basecols = ['row_id', 'time_id', 'investment_id', 'target']
features = ['f_{}'.format(idx) for idx in range(300)]

# Compact dtypes for every column that gets loaded; all features are float32.
dtypes = {
    'row_id': 'str',
    'time_id': 'uint16',
    'investment_id': 'uint16',
    'target': 'float32',
    **{name: 'float32' for name in features},
}

# One-off conversion, left disabled: the resulting pickle is loaded from the
# published dataset instead.
# transform_csv2pickle(path, basecols+features, dtypes)

In [None]:
%%time
# Load the pre-converted training frame from the pickle dataset; %%time reports how long the load takes.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
train = pd.read_pickle('../input/ump-train-picklefile/train.pkl')

In [None]:
# Shrink the frame's memory footprint by downcasting each column to the
# smallest dtype whose range covers the column's values.
start_mem = train.memory_usage().sum() / 1024**2

for col in train.columns:
    col_type = train[col].dtype

    if pd.api.types.is_integer_dtype(col_type):
        # Covers signed AND unsigned integers. The previous check
        # (str(dtype)[:3] == 'int') did not match 'uint16', so time_id and
        # investment_id fell into the float branch and were cast to float16,
        # which cannot represent every integer above 2048.
        int_kind = 'unsigned' if train[col].min() >= 0 else 'integer'
        train[col] = pd.to_numeric(train[col], downcast=int_kind)
    elif pd.api.types.is_float_dtype(col_type):
        c_min = train[col].min()
        c_max = train[col].max()
        # Inclusive bounds: a value equal to the dtype's limit still fits.
        # NOTE(review): only the range is checked; float16 keeps roughly three
        # significant decimal digits, so this downcast is lossy.
        if np.finfo(np.float16).min <= c_min and c_max <= np.finfo(np.float16).max:
            train[col] = train[col].astype(np.float16)
        elif np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
            train[col] = train[col].astype(np.float32)
        # Otherwise (out-of-range or all-NaN column) keep the current dtype
        # rather than upcasting to float64.
    elif col_type == object:
        train[col] = train[col].astype('category')
    # Any other dtype (bool, datetime, an existing category, ...) is left
    # untouched, which also makes this cell safe to re-run: previously the
    # categorical row_id reached min()/max() on a second run.

end_mem = train.memory_usage().sum() / 1024**2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

In [None]:
# DATA_PATH = Path('../input/ump-train-picklefile')
# SAMPLE_TEST_PATH = Path('../input/ubiquant-market-prediction')
# !ls $SAMPLE_TEST_PATH

# **🙄 Simple EDA**

**About train...**

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
train.info()
display(train.head())

In [None]:
# Number of rows in the training frame.
train.shape[0]

In [None]:
# Columns expected in the training frame (they match the dtype map used for the CSV -> pickle conversion):
# row_id
# time_id
# investment_id
# target
# f_0 - f_299

# Display the actual column index for comparison.
train.columns

**Target Mean**

Let's see how target values change

In [None]:
# Mean target per time_id.
# Author's observation: the mean target is noticeably more volatile around
# time_id 380 - 550.
#
# A single groupby pass replaces the previous 1,220 full-frame boolean scans
# and no longer hard-codes the time_id range. The result is indexed by the
# time_id values that actually occur (numeric index instead of string keys).
# The target is cast to float32 first so the aggregation does not run on the
# float16 values the memory-reduction cell may have produced.
time_df = train['target'].astype('float32').groupby(train['time_id']).mean()

ax = time_df.plot()
ax.set(xlabel='time_id', ylabel='mean target', title='Mean target per time_id')
plt.show()

In [None]:
# Mean target per investment_id.
#
# One groupby pass replaces a full-frame boolean scan per distinct
# investment_id. The previous loop also iterated over a set, so the order of
# the plotted points depended on set iteration order; groupby returns the
# means sorted by investment_id (numeric index instead of string keys).
# The target is cast to float32 first so the aggregation does not run on the
# float16 values the memory-reduction cell may have produced.
train_invest_id_df = (
    train['target'].astype('float32').groupby(train['investment_id']).mean()
)

ax = train_invest_id_df.plot()
ax.set(xlabel='investment_id', ylabel='mean target',
       title='Mean target per investment_id')
plt.show()

# **🤞 Correlation**

**Step 1 : just Correlation & Target**

**Check the correlation between target and f_0 - f_299**

Hmm... no single feature stands out.
They all have only a low correlation with the target.

In [None]:
# Correlation between the target and each feature, printed one line per feature.
corr_by_feature = {}

for i in range(300):
    feature = f'f_{i}'
    # 2x2 correlation matrix of (target, feature); pick the off-diagonal entry.
    corr_f = train[['target', feature]].corr().loc['target', feature]
    print(f'target & f_{i} Correlation is {corr_f}')
    corr_by_feature[feature] = corr_f

# The collected values are not used after the printout above.
del corr_by_feature

**Step 2 : Correlation & Target -> Groupby investment_id**

Each investment_id has a different relationship between target and f_i.

In [None]:
# Correlation - investment_id = 0
# Heatmap of the ten features with the lowest (most negative) correlation to
# the target, using only the rows of this investment.
feature_cols = [f'f_{i}' for i in range(300)]

# Select the investment's rows once; the previous loop re-masked the full
# frame for each of the 300 features.
subset = train.loc[train['investment_id'] == 0, ['target'] + feature_cols]

corr_by_feature = pd.Series(
    {col: subset[['target', col]].corr().iloc[0, 1] for col in feature_cols}
)

# sort_values() places NaN correlations last, whereas sorted() on raw floats
# gives no reliable order when NaN is present. head(10) keeps the ten lowest
# values; the previous slice [1:10] silently skipped the very lowest one and
# the trailing .head(30) had no effect on nine rows.
lowest_corr = corr_by_feature.sort_values().head(10)

plt.figure(figsize=(5,8))
sns.heatmap(lowest_corr.to_frame('corr'), annot=True)

del subset
del corr_by_feature
del lowest_corr

In [None]:
# Correlation - investment_id = 1
# Heatmap of the ten features with the lowest (most negative) correlation to
# the target, using only the rows of this investment.
feature_cols = [f'f_{i}' for i in range(300)]

# Select the investment's rows once; the previous loop re-masked the full
# frame for each of the 300 features.
subset = train.loc[train['investment_id'] == 1, ['target'] + feature_cols]

corr_by_feature = pd.Series(
    {col: subset[['target', col]].corr().iloc[0, 1] for col in feature_cols}
)

# sort_values() places NaN correlations last, whereas sorted() on raw floats
# gives no reliable order when NaN is present. head(10) keeps the ten lowest
# values; the previous slice [1:10] silently skipped the very lowest one and
# the trailing .head(30) had no effect on nine rows.
lowest_corr = corr_by_feature.sort_values().head(10)

plt.figure(figsize=(5,8))
sns.heatmap(lowest_corr.to_frame('corr'), annot=True)

del subset
del corr_by_feature
del lowest_corr

In [None]:
# Correlation - investment_id = 2
# Heatmap of the ten features with the lowest (most negative) correlation to
# the target, using only the rows of this investment.
feature_cols = [f'f_{i}' for i in range(300)]

# Select the investment's rows once; the previous loop re-masked the full
# frame for each of the 300 features.
subset = train.loc[train['investment_id'] == 2, ['target'] + feature_cols]

corr_by_feature = pd.Series(
    {col: subset[['target', col]].corr().iloc[0, 1] for col in feature_cols}
)

# sort_values() places NaN correlations last, whereas sorted() on raw floats
# gives no reliable order when NaN is present. head(10) keeps the ten lowest
# values; the previous slice [1:10] silently skipped the very lowest one and
# the trailing .head(30) had no effect on nine rows.
lowest_corr = corr_by_feature.sort_values().head(10)

plt.figure(figsize=(5,8))
sns.heatmap(lowest_corr.to_frame('corr'), annot=True)

del subset
del corr_by_feature
del lowest_corr

**Step 3 : target's relationship with investment_id**

humm... what the fuxk. 😫😨🤯🥵😱😡👿

In [None]:
# Correlation matrix of target vs. investment_id, drawn as an annotated heatmap.
sns.set_theme()

pair_corr = train[['target', 'investment_id']].corr()
sns.heatmap(pair_corr, annot=True)

# Drop the temporary so it does not linger in the kernel namespace.
del pair_corr

# **Baseline (LGBM & xgboost)**

OK, let's build a baseline 🎈

In [None]:
import lightgbm
import xgboost
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the row identifier and the label.
# drop() already returns a new DataFrame, so the former trailing .copy() only
# duplicated the whole frame a second time.
x = train.drop(columns=['row_id', 'target'])
# Label vector.
y = train.target
display(x.head())
display(y.head())

split train/test -> train/val/test

In [None]:
# Positional split by row order: first tenth -> test, second tenth -> val,
# everything after that -> train. No shuffling is applied.
# NOTE(review): if the rows are ordered by time_id, the model is trained on
# later periods and evaluated on earlier ones - confirm this is intended.
line = len(train) // 10

x_test, y_test = x.iloc[:line], y.iloc[:line]
x_val, y_val = x.iloc[line:2 * line], y.iloc[line:2 * line]
x_train, y_train = x.iloc[2 * line:], y.iloc[2 * line:]

print(f'train : {len(x_train)} / val : {len(x_val)} / test : {len(x_test)}')

**LGBM model**

In [None]:
# Wrap the training and validation splits in LightGBM Dataset objects.
train_ds = lightgbm.Dataset(data=x_train, label=y_train)
val_ds = lightgbm.Dataset(data=x_val, label=y_val)

In [None]:
# LightGBM training parameters for the regression baseline.
params = {
    'learning_rate': 0.01,
    'max_depth': 5,
    'objective': 'regression',
    'metric': 'mse',
    'is_training_metric': True,
    # A binary tree limited to depth 5 has at most 2**5 = 32 leaves, so the
    # previous value of 144 could never be reached and only obscured which
    # setting actually bounds the tree size. Keep this <= 2**max_depth.
    'num_leaves': 32,
}

In [None]:
# Train for 100 boosting rounds, with val_ds as the validation set.
model = lightgbm.train(
    params,
    train_ds,
    num_boost_round=100,
    valid_sets=[val_ds],
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# Score the model on the held-out test slice using mean squared error.
prediction = model.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
print(f'model mse is {mse}')

In [None]:
# Submission loop: iterate over the batches served by the competition's
# ubiquant environment, predict each batch and hand the result back.
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Remove the identifier in place; the remaining columns are fed to the model.
    # NOTE(review): assumes the remaining test columns match the training
    # columns in name and order - confirm against the env's test frame.
    test_df.drop(['row_id'], axis=1, inplace=True)
    pred = model.predict(test_df)
    # Fill in the prediction template and pass it back to the environment.
    sample_prediction_df['target'] = pred
    env.predict(sample_prediction_df) 