In [None]:
import os
import sys
import glob
import random

import numpy as np
import pandas as pd

import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno

from random import choice, choices
from tqdm import tqdm
from itertools import cycle
from scipy.stats import skewnorm


import warnings

# Blanket-silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Never truncate columns when a frame is displayed.
pd.set_option("display.max_columns", None)

# Use the ggplot style and expose its colour palette two ways:
# a plain list (for random picks) and an endless iterator over the same colours.
plt.style.use("ggplot")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(color_pal)

In [None]:
def plot_dist_box(value, title='', bins=50):
    """Show a horizontal box plot stacked on top of a histogram + KDE.

    Both panels share the x-axis so the box plot lines up with the
    distribution drawn underneath it. The colour is picked at random from
    the module-level ``color_pal`` list on every call.

    Parameters
    ----------
    value : array-like
        One-dimensional numeric data to plot (e.g. a pandas Series).
    title : str, optional
        Title displayed at the top of the figure.
    bins : int or str, optional
        Bin specification forwarded to ``seaborn.histplot``. The default of
        50 is the upper bound the deprecated ``distplot`` applied to its
        automatic bin count.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    c = choice(color_pal)
    f, (ax_box, ax_hist) = plt.subplots(
        2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.2, 1)},
        figsize=(18, 9),
    )

    # Pass the data explicitly as ``x``: a positional argument is read as
    # ``data=`` by newer seaborn releases, which draws a vertical box and
    # breaks the x-axis shared with the histogram.
    sns.boxplot(x=value, ax=ax_box, color=c)
    # The x label is already shown under the histogram; do not repeat it.
    ax_box.set(xlabel='')

    # ``distplot`` is deprecated; histplot with a density-normalised histogram
    # and a KDE overlay is its documented replacement.
    sns.histplot(x=value, bins=bins, kde=True, stat="density", ax=ax_hist, color=c)

    # A figure-level title sits above both panels instead of between them.
    f.suptitle(title)
    plt.show()

## Problem Statement

In this competition, you’ll build a model that forecasts an `investment's return rate`.



## Data

Dataset Link here: https://www.kaggle.com/robikscube/ubiquant-parquet

Read about parquet files here: https://databricks.com/glossary/what-is-parquet

5.5GB in size.

Reading the parquet file is faster than reading the CSV and keeps the dtypes of the original dataset.

In [None]:
%%time
# Read the training set from the parquet copy of the competition data;
# %%time reports how long the load takes.
train_df = pd.read_parquet('../input/ubiquant-parquet/train.parquet')

**train.csv**

- `row_id` - A unique identifier for the row.
- `time_id` - The ID code for the time the data was gathered. The time IDs are in order, but the real time between the time IDs is not constant and will likely be shorter for the final private test set than in the training set.
- `investment_id` - The ID code for an investment. Not all investments have data in all time IDs.
- `target` - The target.
- `[f_0:f_299]` - Anonymized features generated from market data.


In [None]:
# Preview the first rows of the training frame.
train_df.head()

### time_id vs investment_id

- In the training data, `time_id` ranges from `0` to `1219`.
- We will try to understand time_id vs investment_id in next few cells.
- I am extracting some stats from those two features

In [None]:
# The count is taken from train_df, so the message must say "train", not "test".
print("No of unique investment_id in train : ", train_df.investment_id.nunique())

In [None]:
# One row per investment_id: how many rows it has, the first and last time_id
# it appears at, the std of its time_ids, and the distance between last and first.
time_stat_df = (
    train_df
    .groupby(['investment_id'])['time_id']
    .agg(['count', 'min', 'max', 'std'])
    .reset_index()
    .assign(diff=lambda stats: stats['max'] - stats['min'])
)
time_stat_df.head()

- The dataframe above summarises each unique `investment_id`:
- `count`: number of rows in training data for that investment_id
- `min`: time id when that investment_id started
- `max`: time id when that investment_id ended (1219 is max in training)
- `std`: spread (standard deviation) of the time_ids for that investment_id

In [None]:
# Distribution of the number of rows (time_ids) each investment_id has.
plot_dist_box(time_stat_df['count'], title="Number of time_id's per investment_id distribution")

In [None]:
# Distribution of the first time_id at which each investment_id appears.
plot_dist_box(time_stat_df['min'], title="Started time_id's per investment_id distribution")

- We can observe that most of the investments started when time=0, some investments started in between 200 to 400 and 600 to 1000

In [None]:
# Checking for missing time_ids.
# An investment present at every time_id between its first and last one has
# count == (max - min) + 1, so the row count must be compared with the
# inclusive span length `diff + 1`, not with `diff` itself.
# NOTE(review): assumes every integer time_id inside the span exists in the data - confirm.
n_incomplete = int((time_stat_df['count'] != time_stat_df['diff'] + 1).sum())
print(f"We have {n_incomplete} no of investment_ids missing at least one time_id out of {time_stat_df.shape[0]}")

In [None]:
# Number of time_ids absent inside each investment's [min, max] span.
# The span contains (max - min) + 1 time_ids, hence the +1 on top of `diff`;
# without it a gap-free investment would get a miss_count of -1.
# NOTE(review): assumes every integer time_id inside the span exists in the data - confirm.
time_stat_df['miss_count'] = time_stat_df['diff'] + 1 - time_stat_df['count']

plot_dist_box(time_stat_df['miss_count'], title="Missing time_id's per investment_id distribution")

- We can observe that more than 50% of investments have > 100 missing time_ids

In [None]:
# Number of investment_id rows recorded at each time_id, drawn as one wide line plot.
fig, ax = plt.subplots(figsize=(30, 5))
rows_per_time = train_df.groupby('time_id')['investment_id'].count()
rows_per_time.plot(ax=ax, color=choice(color_pal))
ax.set_title("unique investment_id's by time")
plt.show()

- We can observe that lots of time_ids are missing between time_id 300 and 550.
- Also, after time_id 600 the number of investment_ids increases a lot towards the end.

### Missing time_ids

In [None]:
# Build an investment_id x time_id grid: a cell holds 0 where the pair exists
# in the training data and NaN where it does not.
tmp_df = (
    train_df[['time_id', 'investment_id']]
    .assign(target=0)
    .pivot(index='investment_id', columns=['time_id'])
)
# Optional: order rows by how many time_ids they are missing.
#tmp_df = tmp_df.loc[tmp_df.isna().sum(axis=1).sort_values().index]

In [None]:
# Visualise the NaN pattern of the pivoted frame (rows: investment_id, columns: time_id).
msno.matrix(tmp_df)
plt.show()

- We can observe that lots of investment_id's have missing time_ids 

# Target

In [None]:
# Distribution of the target, followed by its headline statistics.
target = train_df['target']
plot_dist_box(target, 'Target Distribution')

target_mean = target.mean()
target_std = target.std()
target_median = target.median()
print(f"Target Mean :{target_mean} - Std :{target_std} - Median :{target_median}" )

In [None]:
# Target over time: group once by time_id and reuse the grouping for every curve.
target_by_time = train_df.groupby('time_id')['target']

# Panel 1: number of target rows available at each time_id.
fig, count_ax = plt.subplots(figsize=(30, 5))
target_by_time.count().plot(ax=count_ax)
count_ax.set_title("unique investment_ids over time")

# Panel 2: mean, std and median of the target at each time_id.
fig, ax = plt.subplots(figsize=(30, 10))
target_by_time.mean().plot(ax=ax)
target_by_time.std().plot(ax=ax)
target_by_time.median().plot(ax=ax)
ax.legend(['mean', 'std', 'median'])
ax.set_title("target mean vs std vs median over time")
plt.show()

- We can observe that when fewer investment_ids are present at a time_id, the mean, std, and median of the target fluctuate more.

In [None]:
# Target vs investment_ids.
# Per-investment target statistics, ordered from the least to the most
# frequent investment_id; after the final reset_index the x-axis of every
# plot below is that frequency rank.
tmp_df = (
    train_df
    .groupby('investment_id')['target']
    .agg(['count', 'min', 'max', 'std', 'mean', 'median'])
    .reset_index()
    .sort_values('count')
    .reset_index(drop=True)
)

# Panel 1: number of target rows per investment_id.
fig, ax = plt.subplots(figsize=(30, 5))
tmp_df['count'].plot(ax=ax)
ax.set_title("target count sort by investment_id frequency")

# Panel 2: mean and median target per investment_id.
fig, ax = plt.subplots(figsize=(30, 5))
tmp_df['mean'].plot(ax=ax)
tmp_df['median'].plot(ax=ax)
ax.legend(['mean', 'median'])
# Title typos fixed ("meam" / "investmet_id") since the text is rendered on the figure.
ax.set_title('mean vs median over investment_id')

# Panel 3: target standard deviation per investment_id.
fig, ax = plt.subplots(figsize=(30, 5))
tmp_df['std'].plot(ax=ax)
ax.set_title('std over investment_id')
plt.show()

- We can observe that the fewer time_ids an investment_id has, the more its target statistics fluctuate.

### Please upvote if you like it 🙂