## Mount your Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
directory = '/content/gdrive/MyDrive/EC503/project_forex_timeseries/'

Mounted at /content/gdrive


## Add libraries

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import dask.dataframe as dd
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

## Load and pre-process your data

In [None]:
# Load your CSV file

df = pd.read_csv(directory+"EURUSD_H1.csv")

# Drop unnecessary columns
df = df.drop(["Date", "Time", "Open"], axis=1)

# Normalize the "Volume" column
#scaler_volume = MinMaxScaler()
#df["Volume"] = scaler_volume.fit_transform(df["Volume"].values.reshape(-1, 1))

# Function to scale "High", "Low", and "Close" for each sequence
def scale_sequence(sequence):
    scaler = MinMaxScaler(feature_range=(0, 1))
    sequence[["Volume"]] = scaler.fit_transform(sequence[["Volume"]])
    maxVal = sequence["High"].max()
    minVal = sequence["Low"].min()
    sequence[["High", "Low", "Close"]] = (sequence[["High", "Low", "Close"]] - minVal) / (maxVal - minVal) * (0.9 - 0.1) + 0.1
    return maxVal, minVal, sequence

# Create lists for features and targets
features = []
targets = []

window_size = 504
future_steps = 12
MaxMin = []
for i in range(len(df) - (window_size + future_steps)):
    sequence = df.iloc[i:i + window_size].copy()

    # Scale features
    maxVal, minVal, sequence[["High", "Low", "Close", "Volume"]] = scale_sequence(sequence)
    features.append(sequence[["High", "Low", "Close", "Volume"]])
    # Extract target values and scale them
    target_max_high = df.iloc[i+window_size:i+window_size + future_steps]["High"].max()
    target_min_low = df.iloc[i+window_size:i+window_size + future_steps]["Low"].min()
    '''
    close           = df.iloc[i+window_size-1]["Close"].mean()
    label = 0
    if target_min_low >= close:
      label = 1
    elif target_max_high <= close:
      label = -1
    elif abs((target_max_high-close)/(target_min_low-close))>2:
      label = 1
    elif abs((target_max_high-close)/(target_min_low-close))<1/2:
      label = -1
    targets.append(label)
    '''
    MaxMin.append([maxVal, minVal])
    temp_target = ([target_max_high, target_min_low]- minVal) / (maxVal - minVal) * (0.9 - 0.1) + 0.1
    target_max_high_scaled, target_min_low_scaled = 1/(1+np.exp(-5.4931*temp_target+2.7465))
    targets.append([target_max_high_scaled, target_min_low_scaled])


# Convert lists to NumPy arrays
features_array = np.array(features)
targets_array = np.array(targets)
MaxMin_array  = np.array(MaxMin)
# Print the shapes of the arrays
print("Features shape:", features_array.shape)
print("Targets shape:", targets_array.shape)
print("MaxMin shape:", MaxMin_array.shape)

In [None]:
np.save(directory+"features_array.npy", features_array)
np.save(directory+"targets_array.npy", targets_array)
np.save(directory+"MaxMin_array.npy", MaxMin_array)

In [3]:
features_array = np.load(directory+"features_array.npy")
targets_array = np.load(directory+"targets_array.npy")
MaxMin_array = np.load(directory+"MaxMin_array.npy")

In [4]:
test_size = 0.2  # Adjust as needed
random_state = 42  # Set a random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    features_array, targets_array, test_size=test_size, shuffle=False, random_state=random_state
)
np.random.seed(random_state)  # Set the seed for reproducibility
shuffle_indices = np.arange(X_train.shape[0])
np.random.shuffle(shuffle_indices)

X_train = X_train[shuffle_indices]
y_train = y_train[shuffle_indices]

MaxMin_train = MaxMin_array[:len(X_train),:]
MaxMin_train = MaxMin_train[shuffle_indices]
MaxMin_test  = MaxMin_array[len(X_train):,:]

In [6]:
!pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [10]:
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error

def insert_and_delete(arr, new_value, other_array1, arr_el1):
    # Step 1: Insertion
    inserted = False
    for i in range(len(arr)):
        if new_value <= arr[i]:
            arr.insert(i, new_value)
            inserted = True
            arr.pop()
            break

    # Step 3: Update Other Arrays
    if inserted:
      index_of_inserted_value = arr.index(new_value)
      other_array1.insert(index_of_inserted_value, arr_el1)  # Update other_array1
      other_array1.pop()

num_ran = 1000
y_predict = []
num_columns = X_train.shape[1]
selected_columns = np.zeros(num_ran)

num_best = 5

model_0 = auto_arima(X_train[1,:,0], exogenous = X_train[1,:,1:], trace=True, error_action="ignore", suppress_warnings=True)
model_0s = [model_0 for x in range(num_best)]
best_mses0 = [1000 for x in range(num_best)]

model_1 = auto_arima(X_train[1,:,1], exogenous = X_train[1,:,[0,2,3]], trace=True, error_action="ignore", suppress_warnings=True)
model_1s = [model_1 for x in range(num_best)]
best_mses1 = [1000 for x in range(num_best)]


for i in range(num_ran):
  random_index = np.random.choice(num_columns)
  selected_columns[i] = random_index

  model_0 = auto_arima(X_train[random_index,:,0], exogenous = X_train[random_index,:,1:], trace=True, error_action="ignore", suppress_warnings=True)
  model_0.fit(X_train[random_index,:,0], exogenous = X_train[random_index,:,0] )

  model_1 = auto_arima(X_train[random_index,:,1], exogenous = X_train[random_index,:,[0,2,3]], trace=True, error_action="ignore", suppress_warnings=True)
  model_1.fit(X_train[random_index,:,1], exogenous = X_train[random_index,:,[0,2,3]] )

  yhat_0 = model_0.predict(n_periods=12, exogenous=X_train[random_index,:,1:])
  yhat_1 = model_1.predict(n_periods=12, exogenous=X_train[random_index,:,[0,2,3]])

  curr_mse0 = mean_squared_error([np.max(yhat_0)],[y_train[random_index, 0]])
  curr_mse1 = mean_squared_error([np.min(yhat_1)],[y_train[random_index, 1]])

  insert_and_delete(best_mses0, curr_mse0, model_0s, model_0)
  insert_and_delete(best_mses1, curr_mse1, model_1s, model_1)


  y_predict.append([np.max(yhat_0),np.min(yhat_1)])




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Best model:  ARIMA(0,1,1)(0,0,0)[0]          
Total fit time: 4.011 seconds
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=-2690.640, Time=0.72 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-2681.003, Time=0.09 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-2686.711, Time=0.06 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-2686.291, Time=0.22 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-2680.779, Time=0.10 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=-2683.065, Time=0.24 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=-2682.877, Time=0.98 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=-2681.309, Time=1.41 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=-2688.883, Time=2.48 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-2684.753, Time=0.48 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=-2683.371, Time=1.10 sec
 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=-2683.173, Time=0.60 sec
 ARIMA(3,1,3

KeyboardInterrupt: ignored

In [None]:
y_predict_test = []
for i in range(len(X_test)):#len(X_test)):
  temp = np.zeros((num_best,2))
  for j in range(num_best):
    yhat_0 = model_0.predict(n_periods=12, exogenous=X_test[random_index,:,1:])
    yhat_1 = model_1.predict(n_periods=12, exogenous=X_test[random_index,:,[0,2,3]])

    temp[j,:] = [np.max(yhat_0),np.min(yhat_1)]

  y_predict_test.append(np.mean(temp,axis=0))

In [None]:
mean_squared_error(y_predict_test,y_test)

In [None]:
y_predict_test.shape

In [None]:
y_train[random_index, 0]

0.176183728206255

0.18351204632722368

5.3704246482117925e-05

0.008016803638571443