In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from helper_functions import add_technical_indicators

In [39]:
# Load the raw series from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open("data/1y_data.pickle", 'rb') as file:
    data_structure = pickle.load(file)

# Keep only the first five series. Indexing (rather than slicing) means a
# too-short input still raises instead of being silently accepted.
data_structure = [data_structure[i] for i in range(5)]

# Transpose so that each series becomes a column (one row per sample).
data_array = np.array(data_structure).T

# Assuming the structure is [timestamps, close, high, low, volume]
timestamps = pd.to_datetime(data_array[:, 0], unit='ms')
close_prices = data_array[:, 1]
high_prices = data_array[:, 2]
low_prices = data_array[:, 3]
volumes = data_array[:, 4]

# Combine all features into a DataFrame
data_df = pd.DataFrame({
    'timestamps': timestamps,
    'close': close_prices,
    'high': high_prices,
    'low': low_prices,
    'volume': volumes
})
data_df = add_technical_indicators(data_df)

# Calendar features derived from the timestamp column.
data_df['hour'] = data_df['timestamps'].dt.hour
data_df['day_of_week'] = data_df['timestamps'].dt.dayofweek

# Columns excluded from the final feature set. Apart from 'timestamps', they
# are not created in this cell, so they come from add_technical_indicators.
columns_to_drop = [
    'timestamps',
    'date',
    'open',
    'trade',
    'SMA_5',
    'SMA_15',
    'upper_Shadow',
    'lower_Shadow',
    'shadow1',
    'shadow3',
    'shadow5',
    'high_div_low',
]

# Remove rows containing NaN *before* dropping columns: a NaN in any column,
# including the ones discarded below, removes the row. Doing both steps in one
# non-mutating chain means a missing column raises KeyError without leaving
# data_df half-modified.
data_df = data_df.dropna().drop(columns=columns_to_drop)


In [35]:
# Display the prepared feature frame; the notebook renders long frames truncated.
data_df

Unnamed: 0,close,high,low,volume,EMA_5,EMA_15,RSI,MACD,Signal_Line,mean1,mean2,hour,day_of_week
14,17159.49,17163.26,17151.46,644.57409,17161.495947,17151.114156,56.347232,9.195872,6.384493,-0.005407,322.272485,1,0
15,17155.46,17160.70,17145.52,680.98496,17159.483964,17151.657387,53.335936,8.645122,6.836619,-0.005595,340.475740,1,0
16,17155.85,17172.01,17154.42,593.46650,17158.272643,17152.181463,46.881635,8.146214,7.098538,-0.004061,296.714369,1,0
17,17169.35,17173.08,17155.22,557.72010,17161.965095,17154.327530,63.035517,8.739420,7.426714,0.008024,278.852062,1,0
18,17153.02,17171.08,17148.58,560.13752,17158.983397,17154.164089,60.281283,7.801911,7.501754,-0.002446,280.046230,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105348,46722.84,46791.10,46688.19,109.51639,46793.514219,46787.815423,55.148784,30.374832,31.593747,-0.572015,53.716342,20,1
105349,46768.65,46776.03,46711.11,84.79254,46785.226146,46785.419745,51.392490,25.948925,30.464783,-0.692691,41.320763,20,1
105350,46701.23,46783.96,46671.10,135.79803,46757.227431,46774.896027,49.614068,16.807386,27.733303,-0.504241,66.979230,20,1
105351,46691.39,46719.95,46669.35,84.35015,46735.281621,46764.457773,45.133302,8.668717,23.920386,-1.239180,40.635955,20,1


In [40]:
# List the names of the columns that remain in the feature frame.
data_df.columns.tolist()

['close',
 'high',
 'low',
 'volume',
 'EMA_5',
 'EMA_15',
 'RSI',
 'MACD',
 'Signal_Line',
 'mean1',
 'mean2',
 'hour',
 'day_of_week']

In [41]:
# Pairwise correlation between every pair of columns in the feature frame.
correlation_matrix = data_df.corr()

# Take the 'close' column of that matrix and order it from the most
# positively to the most negatively correlated feature.
close_column = correlation_matrix.loc[:, 'close']
target_correlation = close_column.sort_values(ascending=False)

In [42]:
# Show each column's correlation with 'close', sorted from highest to lowest.
target_correlation

close          1.000000
high           0.999986
low            0.999986
EMA_5          0.999982
EMA_15         0.999931
Signal_Line    0.040385
MACD           0.038146
RSI            0.019174
day_of_week    0.004509
hour           0.001319
mean1         -0.284084
mean2         -0.340356
volume        -0.340566
Name: close, dtype: float64