In [None]:
import sys
sys.path.insert(1, "../")

import utils
# Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import  MinMaxScaler

# Deep learning import
import tensorflow as tf

# Evaluation Imports
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Use matplotlib's stock style for every figure in this notebook.
plt.style.use('default')

# Relative path of the CSV that is loaded below.
DATA_FILE_NAME = '../Data/final_crypto_data.csv'

# Feature columns handed to utils.filterColumns, in this order.
SELECTED_FEATURES = [
    'reddit_compound_polarity',
    'volume_1min',
    'high_1min',
    'low_1min',
    'open_1min',
    'close_1min',
]

# Currently disabled candidates (add back to the list above to re-enable):
#   quote_volume_24h, volume_change_24h,
#   percent_change_1h, percent_change_24h, percent_change_7d,
#   high_24h, last_24h, bid_24h, vwap_24h, volume_24h,
#   low_24h, ask_24h, open_24h

### Update the default number of DataFrame columns displayed

In [None]:
# Report the current column-display limit before changing it.
print(f"Default number of DF columns displayed {pd.get_option('display.max_columns')}")

# None removes the limit, so wide frames are rendered with every column.
pd.options.display.max_columns = None

# Load Dataset
1. Load the dataset from CSV
2. Set the date-time as the index
3. Drop unnecessary features

In [None]:
# Read the CSV, then run it through the two project helpers in sequence:
# setDateTimeAsIndex first, filterColumns (with SELECTED_FEATURES) second.
df = (
    pd.read_csv(DATA_FILE_NAME)
    .pipe(utils.setDateTimeAsIndex)
    .pipe(utils.filterColumns, SELECTED_FEATURES)
)
df

# Check dataset for Null Values

In [None]:
# Count the missing entries in every column (summing down the rows).
print("NaN values per column count: \n")
df.isna().sum(axis="index")

In [None]:
# Show the column labels of df.
df.columns

In [None]:
# df.drop(['Unnamed: 0', '_id'], axis = 1, inplace = True)
# df

## Describe Dataset

In [None]:
# Summary statistics for the columns of df; kept in `describe` so the cells
# below can inspect its index and columns.
describe = df.describe()
describe

In [None]:
# Row labels of the describe() result (the names of the summary statistics).
describe.index

In [None]:
# Column labels of the describe() result (the columns that were summarised).
describe.columns

## Get Correlation Graph

Check the correlation between features

In [None]:
# NOTE(review): this import is kept in the cell so the cell stays runnable on
# its own; it would normally live with the other imports at the top.
import seaborn as sns

# Pairwise correlation between the columns of df.
corrMatrix = df.corr()
display(corrMatrix.head())

# clustermap is a figure-level seaborn function: it builds its own figure, so
# the size has to be passed through `figsize`. Calling plt.figure() beforehand
# only produced an extra empty figure and left the clustermap at default size.
sns.clustermap(corrMatrix, annot=True, fmt=".2f", figsize=(30, 20))
plt.show()

## Get dataset monthly summary
- Group the dataset by month
- Get the total number of price observations per month
- Get each month's OHLC values, mean volume, and mean polarity

In [None]:
# Number of non-null 'high_1min' observations in each month-end bucket,
# returned as a two-column frame: the former index plus 'monthly_count'.
count_df = (
    df.groupby(pd.Grouper(freq='M'))['high_1min']
    .count()
    .rename('monthly_count')
    .reset_index()
)
count_df

In [None]:
# Reducer applied to each column inside every monthly bucket.
monthly_aggregations = {
    'high_1min': 'max',
    'low_1min': 'min',
    'open_1min': 'first',
    'close_1min': 'last',
    'volume_1min': 'mean',
    'reddit_compound_polarity': 'mean',
}

# Grouping by frequency requires df to be indexed by its date-time values.
ochl_polarity_df = (
    df.groupby(pd.Grouper(freq='M'))
    .agg(monthly_aggregations)
    .reset_index()
)
ochl_polarity_df

In [None]:
# Join the monthly observation counts with the monthly aggregates on the
# shared 'datetime' column.
monthly_summary_df = count_df.merge(ochl_polarity_df, on='datetime')
monthly_summary_df

In [None]:
# Counter initialised to 1.
i = 1
# Contents of df as a NumPy array (one column per df column).
values = df.to_numpy()

In [None]:
# Plot each selected feature in its own row of a single stacked figure.
#
# The subplot position is computed from the loop index rather than from a
# counter initialised in another cell and incremented here. With the shared
# counter, re-running this cell continued from the previous run's final value
# and raised an out-of-range subplot index.
n_features = len(SELECTED_FEATURES)
plt.figure(figsize=(15,9))
for group in range(n_features):
    # matplotlib subplot positions are 1-based, hence group + 1.
    plt.subplot(n_features, 1, group + 1)
    plt.plot(values[:, group])
    plt.title(df.columns[group], y=0.5, loc='right')
plt.show()

# Split Dataset To Train, Validation and Test

In [None]:
# Step count forwarded to utils.split_sequence.
# NOTE(review): presumably the look-back window length per sample — confirm
# against utils.split_sequence.
n_steps = 15

closing_prices = df['close_1min']
X, y = utils.split_sequence(closing_prices, n_steps)
X

In [None]:
# Fractions handed to the project splitter as keyword arguments.
split_sizes = {"train_size": 0.8, "valid_size": 0.1}

(
    train_X, train_y,
    valid_X, valid_y,
    test_X, test_y,
) = utils.train_test_valid_split(X, y, **split_sizes)

# Print the shape of every partition as one framed report.
shape_report = [
    "----------------------------",
    f"-------- Train X: {train_X.shape}, Train y: {train_y.shape} --------",
    f"-------- Valid X: {valid_X.shape}, Valid y: {valid_y.shape} --------",
    f"-------- Test X: {test_X.shape}, Test y: {test_y.shape} --------",
    "----------------------------\n",
]
print("\n".join(shape_report))

In [None]:
# Positional cut points along the closing-price series, taken from the sizes
# of the train and validation targets.
# NOTE(review): X/y were built from windows of n_steps values, so these cut
# points may be shifted relative to the raw series — confirm against
# utils.split_sequence.
train_end = len(train_y)
valid_end = train_end + len(valid_y)
close_series = df['close_1min']

plt.figure(figsize=(10,6))
plt.grid(True)
plt.xlabel('Minute Observations')
plt.ylabel('Closing Price')

# (slice of the series, line colour, legend label) for each partition.
partitions = (
    (close_series.iloc[:train_end], 'green', 'Train data'),
    (close_series.iloc[train_end:valid_end], 'blue', 'Validation data'),
    (close_series.iloc[valid_end:], 'orange', 'Test data'),
)
for segment, colour, legend_label in partitions:
    plt.plot(segment, colour, label=legend_label)
plt.legend()