In [None]:
import gc
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from lightgbm import LGBMRegressor
from scipy.stats import linregress
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [None]:
# Root directory of the competition files; reused below to build the CSV paths.
DATA_PATH = Path('/kaggle/input/ubiquant-market-prediction')
# Shell listing of that directory (long format with inode numbers, block sizes,
# and human-readable file sizes).
!ls -ilsh $DATA_PATH

# Overview of example_sample_submission.csv

An example submission file provided so the publicly accessible copy of the API provides the correct data shape and format.

In [None]:
# Preview the example submission file found under DATA_PATH.
sample_submission_path = DATA_PATH / "example_sample_submission.csv"
pd.read_csv(sample_submission_path)

# Overview of example_test.csv

Random data provided to demonstrate what shape and format of data the API will deliver to your notebook when you submit.

In [None]:
# Preview the example test file found under DATA_PATH.
example_test_path = DATA_PATH / "example_test.csv"
pd.read_csv(example_test_path)

# Overview of train.csv

Since train.csv is an 18 GB dataset, loading it naively is slow. Several techniques can be used to load it quickly.
Here are some:

## 1. Dask packages:
   Details can be found here: https://www.kaggle.com/edwardcrookenden/eda-and-lgbm-baseline
    
   More information about `dask` can be found here: https://www.kaggle.com/yuliagm/how-to-work-with-big-datasets-on-16g-ram-dask#TIP-#7-Using-Dask
   
## 2. Parquet dataset:
   Details can be found here: https://www.kaggle.com/robikscube/fast-data-loading-and-low-mem-with-parquet-files?kernelSessionId=85628453
    
   Here we simply use the processed dataset provided by [robikscube](https://www.kaggle.com/robikscube). Thank you!
   
   You can load this dataset directly in your notebook.
   
   Dataset Link here: https://www.kaggle.com/robikscube/ubiquant-parquet

## dataset info
```row_id``` - A unique identifier for the row.

```time_id``` - The ID code for the time the data was gathered. The time IDs are in order, but the real time between the time IDs is not constant and will likely be shorter for the final private test set than in the training set.

```investment_id``` - The ID code for an investment. Not all investments have data in all time IDs.

```target``` - The target.

```[f_0:f_299]``` - Anonymized features generated from market data.

In [None]:
%%time
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
train.sample(5)

In [None]:
# Basic size and coverage statistics of the training frame.
# Adapted from https://www.kaggle.com/edwardcrookenden/eda-and-lgbm-baseline
num_data_points = len(train)
print(f'We have {num_data_points} data points')

# Investment coverage. An id counts as "missing" when it lies in the contiguous
# range 0..max(investment_id) but never appears in the data. The range is derived
# from the data instead of being hard-coded.
investment_ids = train['investment_id']
num_investments = investment_ids.nunique()
missing_investments_ids = set(range(int(investment_ids.max()) + 1)) - set(investment_ids.unique())
print(f"We have {num_investments} unique investments, ",
      f"min investment_id: {investment_ids.min()}, "
      f"max investment_id: {investment_ids.max()} ")
print(f"Number of investment_id values missing from the range: {len(missing_investments_ids)}")

# Time coverage, computed the same way. Kept in its own variable so it no longer
# overwrites the missing investment ids computed above.
time_ids = train['time_id']
num_time_intervals = time_ids.nunique()
missing_time_ids = set(range(int(time_ids.max()) + 1)) - set(time_ids.unique())
print(f"We have {num_time_intervals} unique time_id, ",
      f"min time_id: {time_ids.min()}, "
      f"max time_id: {time_ids.max()} ")
print(f"Number of time_id values missing from the range: {len(missing_time_ids)}")

# Count the COLUMNS that contain at least one null, which is what the message
# says. The previous `.isna().sum().sum()` reported the total number of null cells.
num_null_columns = int(train.isna().any().sum())
print(f"Number of features with null values: {num_null_columns}")

## Overview of the non-feature columns
If you are not familiar with `dask`, you can convert a `dask.dataframe` to a `pandas.DataFrame`.

In [None]:
# Keep only the identifier and target columns for the overview analysis,
# then show ten randomly sampled rows.
overview_columns = ['row_id', 'time_id', 'investment_id', 'target']
df_ov = train[overview_columns]
df_ov.sample(10)

It seems that `row_id` is the combination of `time_id` and `investment_id`; let's check it.

In [None]:
# Rebuild the id as "<time_id>_<investment_id>" and count the rows where it
# differs from the stored row_id; a result of 0 confirms the hypothesis.
expected_row_id = df_ov['time_id'].astype(str) + "_" + df_ov['investment_id'].astype(str)
row_id_mismatch = df_ov['row_id'] != expected_row_id
row_id_mismatch.sum()

### visualize the distribution of `target`, `time_id` and `investment_id`

In [None]:
# Distribution of the target and of the two id columns, one panel per column.
#
# `sns.distplot` is deprecated and scheduled for removal from seaborn, so the
# documented replacement is used instead: `histplot` with a density-normalised
# histogram plus a KDE curve. `kde_kws=dict(cut=3)` restores distplot's KDE
# extension past the data range, and `bins=50` mirrors the cap distplot applied
# to its automatic bin count.
f, ax = plt.subplots(3, 1, figsize=(18, 6))
for axis, column in zip(ax, ['target', 'time_id', 'investment_id']):
    sns.histplot(df_ov[column], kde=True, stat="density", bins=50,
                 kde_kws=dict(cut=3), ax=axis)
    axis.set_title(f'The distribution of {column}')

plt.tight_layout()

1. Check whether the dataset contains duplicate (`investment_id`, `time_id`) rows

In [None]:
# Count rows per (investment_id, time_id) pair, then tabulate how often each
# count combination occurs; only counts of 1 means there are no duplicates.
rows_per_pair = df_ov.groupby(['investment_id', 'time_id']).agg("count")
rows_per_pair.value_counts()

2. Does one investment_id map to multiple time_ids?

### groupby investment_id and count by time_id
Conclusion:

* The sample counts show no ordering with respect to investment_id

* Maybe the investment_id is random?

* However, the second figure shows the cumulative number of samples ordered by investment_id. Based on the p_value and r_value, is it just a coincidence?

In [None]:
# Number of samples per investment and their running total in investment_id
# order, followed by a straight-line fit to the cumulative curve.

# Group once and reuse the result for both panels and for the regression.
tmp = df_ov.groupby(['investment_id'])['time_id'].agg('count')
x = tmp.index.values
y = tmp.cumsum().values

f, ax = plt.subplots(2,1,figsize=(20,6))
ax[0].plot(tmp)
ax[0].set_xlabel("investment_id")
ax[0].set_ylabel("samples num")
ax[0].set_title('number of samples per invesment')

ax[1].plot(x, y)
ax[1].set_xlabel("investment_id")
ax[1].set_ylabel("cumulative samples num")
ax[1].set_title("cumulative samples num ordered by investment_id")

plt.tight_layout()
plt.show()

# Ordinary least-squares fit of cumulative count against investment_id.
slope, intercept, r_value, p_value, std_err = linregress(x, y)
y_pred = intercept + slope * x

# The `squared` argument of mean_squared_error is deprecated and removed in newer
# scikit-learn releases, so compute the MSE once and take its square root.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
rmse = np.sqrt(mse)

print('scipy r_value: {:.20f}'.format(r_value))
print('scipy p_value: {:.20f}'.format(p_value))
print('scipy intercept: {:.6f}'.format(intercept))
print('scipy slope: {:.6f}'.format(slope))
print('scipy MSE: {:.6f}'.format(mse))
print('scipy RMSE: {:.6f}'.format(rmse))

In [None]:
# Number of investments observed per time_id and their running total in time_id
# order, followed by a straight-line fit to the cumulative curve.

# Group once and reuse the result for both panels and for the regression.
tmp = df_ov.groupby(['time_id'])['investment_id'].agg('count')
x = tmp.index.values
y = tmp.cumsum().values

f, ax = plt.subplots(2,1,figsize=(20,6))
ax[0].plot(tmp)
ax[0].set_xlabel("time_id")
ax[0].set_ylabel("investment_id count")
ax[0].set_title('groupby time_id and count by investment_id')

ax[1].plot(x, y)
ax[1].set_xlabel("time_id")
ax[1].set_ylabel("cumulative samples num")
# The x axis of this panel is time_id; the title previously said investment_id.
ax[1].set_title("cumulative samples num ordered by time_id")

plt.tight_layout()
plt.show()

# Ordinary least-squares fit of cumulative count against time_id.
slope, intercept, r_value, p_value, std_err = linregress(x, y)
y_pred = intercept + slope * x

# The `squared` argument of mean_squared_error is deprecated and removed in newer
# scikit-learn releases, so compute the MSE once and take its square root.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
rmse = np.sqrt(mse)

print('scipy r_value: {:.20f}'.format(r_value))
print('scipy p_value: {:.20f}'.format(p_value))
print('scipy intercept: {:.6f}'.format(intercept))
print('scipy slope: {:.6f}'.format(slope))
print('scipy MSE: {:.6f}'.format(mse))
print('scipy RMSE: {:.6f}'.format(rmse))

3. Is the `target` influenced by `time_id` or `investment_id`?

We can see that at some times the `target` has sharp fluctuations. These fluctuations seem to correspond
to the previous figure 👆, so we also draw it here.


It does! Maybe we should do a much more detailed analysis of these fluctuations.

In [None]:
# Overlay the mean target per time_id with the per-time_id row count; the count
# is divided by 3500 so both curves fit on the same axis.
grouped_by_time = df_ov.groupby(['time_id'])
mean_target_by_time = grouped_by_time['target'].mean()
scaled_count_by_time = grouped_by_time['investment_id'].agg('count') / 3500

f, ax = plt.subplots(1, 1, figsize=(22, 4))
ax.plot(mean_target_by_time, label='target mean')
ax.plot(
    scaled_count_by_time,
    label='groupby time_id and count by investment_id',
    linestyle='--',
)
ax.set(xlabel="time_id", ylabel="mean target", xlim=(-10, 1250))
ax.legend()
plt.show()

Can't see any difference.

In [None]:
# Overlay the mean target per investment_id with the per-investment row count
# (divided by 1000 and shifted up by 0.5), limited to the first 1000 entries.
grouped_by_investment = df_ov.groupby(['investment_id'])
mean_target_by_investment = grouped_by_investment['target'].mean()[:1000]
shifted_count_by_investment = (grouped_by_investment['time_id'].agg('count') / 1000 + 0.5)[:1000]

f, ax = plt.subplots(1, 1, figsize=(22, 4))
ax.plot(mean_target_by_investment, label='target mean')
ax.plot(shifted_count_by_investment, label='investment_id count')
ax.set(xlabel="investment_id", ylabel="target", xlim=(-10, 1000))
ax.legend()
plt.show()

# features

In [None]:
# Build the (target + 300 features) matrix and its correlation matrix.
# Column 0 of `data`, and therefore row/column 0 of `correlation`, is the target.
FEATURES = [f"f_{idx}" for idx in range(300)]
correlation_columns = ['target'] + FEATURES
data = train[correlation_columns].to_numpy()

# Drop the full frame once the array is extracted and collect garbage right away.
del train
gc.collect()

# rowvar=False treats each column as a variable, the same as corrcoef(data.T).
correlation = np.corrcoef(data, rowvar=False)

## target correlation

In [None]:
# Distribution of the correlation between each feature and the target.
#
# `data` was assembled as ['target'] + FEATURES, so the target is row/column 0 of
# `correlation`. The previous `correlation[:, -1]` selected the last column, which
# is feature f_299, not the target. Take the target row and skip its first entry,
# the target's correlation with itself (always 1.0).
target_correlation = correlation[0, 1:]

f, ax = plt.subplots(1,1,figsize=(24,3))
# `histplot` with a density histogram and KDE is the documented replacement for
# the deprecated `sns.distplot`; `cut=3` keeps distplot's KDE extent.
sns.histplot(target_correlation, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
plt.title("target correlation")
plt.show()

## all correlations

In [None]:
# Interactive heatmap of the full correlation matrix at a fixed 800x800 size.
layout_options = dict(
    title="Correlation matrix",
    width=800,
    height=800,
    autosize=False,
)
fig = px.imshow(correlation)
fig.update_layout(**layout_options)