In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import random
import gc
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,RocCurveDisplay,ConfusionMatrixDisplay,confusion_matrix,roc_auc_score,accuracy_score

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import preprocessing
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the competition files: asset metadata, the training data, and the
# example test / sample-submission frames.
asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
train = pd.read_csv('../input/g-research-crypto-forecasting/train.csv')
test = pd.read_csv('../input/g-research-crypto-forecasting/example_test.csv')
sub = pd.read_csv('../input/g-research-crypto-forecasting/example_sample_submission.csv')

# Some Exploratory Data Analysis

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the example test data.
test.head()

In [None]:
# Report how many rows the training set and the example test set contain.
print(f"We have {train.shape[0]} rows in the training set while we have {test.shape[0]} in the test set")

In [None]:
# Summarize the training frame: its columns, dtypes and memory usage.
train.info()

In [None]:
# Summarize the example test frame: its columns, dtypes and memory usage.
test.info()

In [None]:
# Count the missing values in each column of the training set.
train.isna().sum()

In [None]:
# Percentage of training rows whose Target is missing. The share is computed
# from the data itself instead of a hard-coded null count, so it stays correct
# if the training file changes.
train['Target'].isnull().mean() * 100

* We have 750,338 rows with a null `Target` value, which means that about 3% of the training set has no target.

In [None]:
# Count the missing values in each column of the example test set.
test.isna().sum()

In [None]:
# Descriptive statistics for the numeric columns of the training set.
train.describe()

In [None]:
# Descriptive statistics for the numeric columns of the example test set.
test.describe()

In [None]:
# Display the asset metadata table loaded from asset_details.csv.
asset_details

In [None]:
# Preview the first rows of the example sample submission.
sub.head()

In [None]:
# Rows stamped 1514764860 (Unix seconds, i.e. 2018-01-01 00:01:00 UTC).
# The comparison is against the raw integer timestamp, so this cell has to run
# before the timestamp column is converted to daily periods.
train.loc[train['timestamp'] == 1514764860]

In [None]:
# Rows stamped 60 seconds later: 1514764920 (2018-01-01 00:02:00 UTC).
# The comparison is against the raw integer timestamp, so this cell has to run
# before the timestamp column is converted to daily periods.
train.loc[train['timestamp'] == 1514764920]

In [None]:
# Target distribution: density histogram with a KDE overlay.
# sns.distplot has been deprecated since seaborn 0.11 and is scheduled for
# removal; histplot with stat="density" and kde=True is its documented
# replacement. bins is fixed at 50 (the cap distplot applied) so the bin count
# stays bounded on a column this large, and missing targets are dropped
# explicitly before plotting.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train['Target'].dropna(), bins=50, kde=True, stat="density", ax=ax)
ax.set_title("Target distribution");

In [None]:
# Collapse the Unix-second timestamps to calendar days (daily Period values).
# The guard makes the cell safe to re-run: once the column already holds
# periods, converting it again with unit='s' would raise.
if not isinstance(train['timestamp'].dtype, pd.PeriodDtype):
    train['timestamp'] = pd.to_datetime(train['timestamp'], unit='s').dt.to_period('D')

# For each raw column, attach the mean over all rows that share the same day
# as a new avg_<column>_per_Day column. A single groupby is reused for every
# column instead of regrouping the frame eight times.
rows_by_day = train.groupby('timestamp')
for column in ['Volume', 'Close', 'Low', 'High', 'Open', 'Count', 'VWAP', 'Target']:
    train[f'avg_{column}_per_Day'] = rows_by_day[column].transform('mean')
train.head(5)

In [None]:
# Keep one row per distinct timestamp value (the first one encountered).
data = train.drop_duplicates(subset='timestamp', keep='first')

In [None]:
# Display the de-duplicated frame (one row per distinct timestamp value).
data

* <h3>The data covers historical trades from 2018-01-01 to 2021-09-21 (1360 days).</h3>

In [None]:
# Use the timestamp column as the index so that plotting a column of `data`
# puts the timestamp on the x-axis.
# NOTE: this cell is not re-runnable as written; once "timestamp" has become
# the index it is no longer a column, so a second run raises a KeyError.
data = data.set_index("timestamp")

In [None]:
# Daily mean Open (grey dots) overlaid with its centered rolling mean (line).
# NOTE(review): with window=1 the rolling mean simply reproduces the series
# (up to floating-point rounding); a wider window is needed to show a trend.
column = 'avg_Open_per_Day'
moving_average = data[column].rolling(window=1, center=True).mean()

fig, ax = plt.subplots(figsize=(24, 5))
data[column].plot(ax=ax, style=".", color="0.5")
moving_average.plot(ax=ax, linewidth=3, title=column, legend=False);

In [None]:
# Daily mean High (grey dots) overlaid with its centered rolling mean (line).
# NOTE(review): with window=1 the rolling mean simply reproduces the series
# (up to floating-point rounding); a wider window is needed to show a trend.
column = 'avg_High_per_Day'
moving_average = data[column].rolling(window=1, center=True).mean()

fig, ax = plt.subplots(figsize=(24, 5))
data[column].plot(ax=ax, style=".", color="0.5")
moving_average.plot(ax=ax, linewidth=3, title=column, legend=False);

In [None]:
# Daily mean Low (grey dots) overlaid with its centered rolling mean (line).
# NOTE(review): with window=1 the rolling mean simply reproduces the series
# (up to floating-point rounding); a wider window is needed to show a trend.
column = 'avg_Low_per_Day'
moving_average = data[column].rolling(window=1, center=True).mean()

fig, ax = plt.subplots(figsize=(24, 5))
data[column].plot(ax=ax, style=".", color="0.5")
moving_average.plot(ax=ax, linewidth=3, title=column, legend=False);

In [None]:
# Daily mean Close (grey dots) overlaid with its centered rolling mean (line).
# The title is taken from the column name so that it matches the column and
# the sibling plots (it previously read "avg_close_per_Day", lower-case "c").
# NOTE(review): with window=1 the rolling mean simply reproduces the series
# (up to floating-point rounding); a wider window is needed to show a trend.
column = 'avg_Close_per_Day'
moving_average = data[column].rolling(window=1, center=True).mean()

fig, ax = plt.subplots(figsize=(24, 5))
data[column].plot(ax=ax, style=".", color="0.5")
moving_average.plot(ax=ax, linewidth=3, title=column, legend=False);

In [None]:
# Daily mean VWAP (grey dots) overlaid with its centered rolling mean (line).
# NOTE(review): with window=1 the rolling mean simply reproduces the series
# (up to floating-point rounding); a wider window is needed to show a trend.
column = 'avg_VWAP_per_Day'
moving_average = data[column].rolling(window=1, center=True).mean()

fig, ax = plt.subplots(figsize=(24, 5))
data[column].plot(ax=ax, style=".", color="0.5")
moving_average.plot(ax=ax, linewidth=3, title=column, legend=False);

In [None]:
# Daily mean Volume (grey dots) overlaid with its centered rolling mean (line).
# NOTE(review): with window=1 the rolling mean simply reproduces the series
# (up to floating-point rounding); a wider window is needed to show a trend.
column = 'avg_Volume_per_Day'
moving_average = data[column].rolling(window=1, center=True).mean()

fig, ax = plt.subplots(figsize=(24, 5))
data[column].plot(ax=ax, style=".", color="0.5")
moving_average.plot(ax=ax, linewidth=3, title=column, legend=False);

In [None]:
# Daily mean Target (grey dots) overlaid with its centered rolling mean (line).
# NOTE(review): with window=1 the rolling mean simply reproduces the series
# (up to floating-point rounding); a wider window is needed to show a trend.
column = 'avg_Target_per_Day'
moving_average = data[column].rolling(window=1, center=True).mean()

fig, ax = plt.subplots(figsize=(24, 5))
data[column].plot(ax=ax, style=".", color="0.5")
moving_average.plot(ax=ax, linewidth=3, title=column, legend=False);

In [None]:
# Pairwise correlation between the raw columns and the target, rendered as a
# colour-graded table with five decimals.
feature_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target']
corr = train[feature_columns].corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in pandas 2.0;
# Styler.format(precision=...) is its replacement.
corr.style.background_gradient(cmap='coolwarm').format(precision=5)

In [None]:
# Scatter of avg_Close_per_Day (x-axis) against avg_High_per_Day (y-axis).
data.plot(kind="scatter", x="avg_Close_per_Day", y="avg_High_per_Day");

In [None]:
# Scatter of avg_Close_per_Day (x-axis) against avg_Low_per_Day (y-axis).
data.plot(kind="scatter", x="avg_Close_per_Day", y="avg_Low_per_Day");

In [None]:
# Scatter of avg_Close_per_Day (x-axis) against avg_Open_per_Day (y-axis).
data.plot(kind="scatter", x="avg_Close_per_Day", y="avg_Open_per_Day");

In [None]:
# Scatter of avg_Close_per_Day (x-axis) against avg_VWAP_per_Day (y-axis).
data.plot(kind="scatter", x="avg_Close_per_Day", y="avg_VWAP_per_Day");

<b>* Close, High, Low, Volume, Open and VWAP are closely related, with a roughly linear relationship between them 😋</b>

In [None]:
# Scatter of avg_Close_per_Day (x-axis) against avg_Count_per_Day (y-axis).
data.plot(kind="scatter", x="avg_Close_per_Day", y="avg_Count_per_Day");

In [None]:
# Scatter of avg_Volume_per_Day (x-axis) against avg_Target_per_Day (y-axis).
data.plot(kind="scatter", x="avg_Volume_per_Day", y="avg_Target_per_Day");

In [None]:
# Discard every row that has a missing value in any column, then take the
# median Target of the remaining rows and display it.
train = train.dropna(axis=0, how='any')
train_median = train['Target'].median()
train_median

# Submission

In [None]:
# Make the submission: every prediction frame yielded by the competition
# environment gets the same constant value, train_median, in its Target column.
import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
# NOTE(review): presumably iter_test yields one (test batch, prediction frame)
# pair per step — confirm against the gresearch_crypto API.
# The loop variable `test` rebinds the `test` name used earlier in the notebook.
for (test, sample_prediction_df) in iter_test:
    sample_prediction_df['Target'] = train_median
    env.predict(sample_prediction_df)

# In Progress .... 🏂🏻