Import necessary libraries

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

# Data Processing
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pi
import plotly.express as px

import math



In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive (the dataset CSV is read from there) become accessible.
# NOTE(review): force_remount=True presumably remounts even when the drive
# is already mounted - confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


# Data Collection

Load the dataset

In [None]:
# Step 1: Data Collection
# Read the EURUSD candle data from the mounted Google Drive and report
# its (rows, columns) shape.
csv_path = '/content/drive/MyDrive/Classroom/EGCO623_T1 2023 Master/Project/EURUSD.csv'
df = pd.read_csv(csv_path)
print(df.shape)

Data Description

In [None]:
# Preview the first five rows of the raw dataset.
df.head(5)

In [None]:
# Preview the last five rows of the raw dataset.
df.tail(5)

In [None]:
# Column names, dtypes and non-null counts of the raw dataset.
df.info()

Data cleaning

In [None]:
# Count the missing (null) values in each column of the dataset.
df.isnull().sum()

In [None]:
# Count the fully duplicated rows in the dataset.
df.duplicated().sum()

0

Data transformation

In [None]:
# Parse the candle start ('From') and end ('To') columns as datetimes.
for time_col in ("From", "To"):
    df[time_col] = pd.to_datetime(df[time_col])

# Verify the resulting dtypes.
df.info()

In [None]:
df.describe().round(6).T # round(6): keep 6 decimal places; .T shows one described column per row

Some observations from the data statistics:
1. There are 110,000 observations in the dataset.
2. The minimum and maximum values for Open, High, Low, Close, and Adj Close prices are significantly different, indicating a large range of values for these variables.
3. The volume values in the dataset also have a large range, with a mean value of 995.95.

# EDA

In [None]:
# Interactive line chart of the closing price against the candle start time.
fig = px.line(df, x='From', y='Close', title='Closing Prices Over Time - EURUSD')
(
    fig.update_xaxes(title='Date')
    .update_yaxes(title='Closing Price - EURUSD')
    .update_layout(template='plotly_dark')
    .show()
)

Visualize the Correlation Matrix

In [None]:
# Visualise the correlation matrix of the numeric columns.
# 'From' and 'To' were converted to datetimes earlier in the notebook;
# restricting the frame to numeric dtypes makes the cell independent of how the
# installed pandas version treats non-numeric columns in DataFrame.corr()
# (pandas 2.x no longer drops them silently).
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(5,3))
sns.heatmap(numeric_df.corr(), fmt=".2f", annot=True, cmap='Greens')
plt.title('Correlation Matrix - EURUSD')
plt.show()

Heatmap of correlation between features:
* Shows the correlation between all the features in the dataset
* Indicates that the Close Price has a strong positive correlation with Open, High, and Low, but a weak negative correlation with Volume
* Also indicates that the Open, High, and Low are highly correlated with each other

In [None]:
# Pairplot of features
# sns.pairplot returns a PairGrid made of many axes. plt.title() would only
# title the most recently active subplot, so the title is set on the figure
# itself; y > 1 lifts it above the top row of panels.
pair_grid = sns.pairplot(data=df, vars=['Open', 'High', 'Low', 'Close', 'Volume'])
pair_grid.figure.suptitle('Pairplot of Features - EURUSD', y=1.02)
plt.show()

Pairplot of features:
* Shows the pairwise relationships between all the features in the dataset

* Indicates that the Open, High, and Low are strongly positively correlated with each other, with a linear relationship

* Also indicates that the Volume feature is not strongly correlated with any of the other features

In [None]:
# Draw one interactive line chart per group of price columns, each with a
# range slider and quick-zoom buttons on the time axis.

list1=["Open","High","Low","Close"]
list2=["High", "Low"]
list3=["Open","Close"]
list4=[list1, list2, list3]

# Quick-zoom buttons shared by every chart: last month, last 6 months,
# year to date, and the full range.
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="1Y", step="year", stepmode="todate"),
    dict(step="all"),
]

for y_columns in list4:
    fig = px.line(df, x="From", y=y_columns, title="Distribution over Different Time Window - EURUSD")

    # The range slider lets the reader zoom in on a specific date range;
    # the range selector exposes the preset buttons defined above.
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(buttons=range_buttons),
    )

    # Display the figure
    fig.show()

In [None]:
# Empty frame that the next cell fills with the timestamp column and the
# log-transformed price/volume columns.
df_transformed = pd.DataFrame()

In [None]:
# Keep the candle start time as-is and store log(1 + x) of every price/volume
# column. The column order set here is the order later seen by scale_data.
log_columns = ('Open', 'Close', 'Low', 'High', 'Volume')

df_transformed['From'] = df['From']
for column_name in log_columns:
    df_transformed[column_name] = np.log1p(df[column_name])

In [None]:
# (rows, columns) of the log-transformed frame.
df_transformed.shape

# Data Preprocessing

In [None]:
# Normalize
# Two separate min-max scalers: one for the input features and one for the
# target, so that predictions of the target can be inverse-transformed on
# their own later in the notebook.
scaler_x = MinMaxScaler() # input features (x)
scaler_y = MinMaxScaler() # target 'Close' (y)

Data Normalization

In [None]:
# scaling
def scale_data(data):
    """Min-max scale the target and feature columns of ``data``.

    The module-level ``scaler_y`` is fit on the 'Close' column and the
    module-level ``scaler_x`` on every other column except 'From'. Both
    scalers are fit on all rows of ``data`` and keep that fit afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing 'From', 'Close' and the feature columns.

    Returns
    -------
    numpy.ndarray
        2-D array whose column 0 is the scaled 'Close' and whose remaining
        columns are the scaled features in their original column order.

    Notes
    -----
    NOTE(review): because the fit uses every row passed in, calling this on
    the full dataset before the train/validation/test split lets the scaling
    statistics include the later splits.
    """
    feature_frame = data.drop(columns=['From', 'Close'])
    target_column = data['Close'].values.reshape(-1, 1)

    scaled_features = scaler_x.fit_transform(feature_frame.values)
    scaled_target = scaler_y.fit_transform(target_column)

    # Target first, so downstream code can read 'Close' from column 0.
    return np.hstack((scaled_target, scaled_features))

In [None]:
# Scale the log-transformed frame; column 0 of the result is the target 'Close'.
arr_transformed = scale_data(df_transformed)

In [None]:
# Inspect the scaled array.
arr_transformed

In [None]:
# (rows, columns) of the scaled array.
arr_transformed.shape

In [None]:
# function that make sequence data
def create_sequences(data, seq_length):
    """Build sliding-window samples for next-step prediction.

    Parameters
    ----------
    data : array-like, indexable as ``data[i:j]`` and ``data[i][0]``
        Rows are consecutive candles; column 0 holds the target ('Close').
    seq_length : int
        Number of past candles in each input window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` stacks the windows ``data[i:i + seq_length]`` and ``y`` holds
        column 0 of the row that follows each window, i.e.
        ``y[i] == data[i + seq_length][0]``. Both are empty when ``data`` has
        no more than ``seq_length`` rows.
    """
    window_starts = range(len(data) - seq_length)

    # Window of past candles for every start position.
    windows = [data[start:start + seq_length] for start in window_starts]
    # 'Close' of the candle immediately after each window.
    next_close = [data[start + seq_length][0] for start in window_starts]

    return np.array(windows), np.array(next_close)

In [None]:
seq_length = 10  # the number of past candles to be used for predictions
# X: windows of seq_length consecutive rows; y: scaled 'Close' (column 0) of the row after each window.
X, y = create_sequences(arr_transformed, seq_length)

In [None]:
# Shape of the stacked input windows.
X.shape

In [None]:
# Shape of the target vector (one value per window).
y.shape

Data Splitting

In [None]:
# Chronological 70% / 10% / 20% split (no shuffling): the earliest windows are
# used for training, the next ones for validation and the remainder for testing.
n_samples = len(X)
train_size = int(n_samples * 0.7)
val_size = int(n_samples * 0.1)
test_size = n_samples - train_size - val_size

val_end = train_size + val_size  # first index of the test split

X_train, X_val, X_test = X[:train_size], X[train_size:val_end], X[val_end:]
y_train, y_val, y_test = y[:train_size], y[train_size:val_end], y[val_end:]


In [None]:
# Report the input/target shapes of each split.
for split_name, split_X, split_y in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"X_{split_name}: {split_X.shape}, y_{split_name}:{split_y.shape}")

# Model Training

# LSTM

In [None]:
# LSTM
# Two stacked LSTM layers followed by a single-unit dense output that predicts
# the scaled 'Close' of the next candle.
# The input shape (window length, number of features) is taken from the
# training data instead of being hard-coded, so the model keeps matching the
# data if the set of feature columns or seq_length changes.
lstm_input_shape = X_train.shape[1:]

model_LSTM = Sequential([
    # return_sequences=True passes the full sequence on to the second LSTM layer.
    LSTM(units = 50, activation = 'relu', return_sequences = True, input_shape = lstm_input_shape),
    LSTM(units = 50, activation = 'relu', return_sequences = False),
    Dense(units = 1)
])

model_LSTM.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
# Print the layer-by-layer summary of the model.
model_LSTM.summary()

In [None]:
# Train on the training split and evaluate on the validation split after
# every epoch; the returned history feeds the loss-curve plot.
fit_options = dict(
    epochs = 30,
    batch_size = 128,
    validation_data = (X_val, y_val),
)
model_LSTM_history = model_LSTM.fit(X_train, y_train, **fit_options)


In [None]:
# Save the trained model to 'EURUSD_LSTM.h5' in the current working directory.
model_LSTM.save('EURUSD_LSTM.h5')

In [None]:
# Loss curve: training vs. validation loss per epoch.
loss_fig, loss_ax = plt.subplots(figsize = (5, 3))

training_history = model_LSTM_history.history
loss_ax.plot(training_history['loss'], color = "blue")
loss_ax.plot(training_history['val_loss'], color = "orange")

loss_ax.set_title('Loss curve - EURUSD')
loss_ax.set_xlabel("epoch")
loss_ax.set_ylabel("loss")
loss_ax.legend(['loss','val_loss'])

plt.show()

# Model Evaluation

# LSTM

In [None]:
# Predict every split (outputs are still in the scaled log space).
train_predict, val_predict, test_predict = (
    model_LSTM.predict(split_inputs) for split_inputs in (X_train, X_val, X_test)
)

# Stitch the splits back together in chronological order, then undo the
# min-max scaling and the log1p transform to get prices.
y_pred = np.concatenate([train_predict, val_predict, test_predict])
y_pred_rescaled = np.expm1(scaler_y.inverse_transform(y_pred))

In [None]:
# Actual targets back on the price scale: undo the min-max scaling first,
# then the log1p transform.
y_log = scaler_y.inverse_transform(y.reshape(-1, 1))
y_rescaled = np.expm1(y_log)

In [None]:
# Calculate the performance indicators on the test split.
# Both y_test and test_predict are still in the scaled log space here, so the
# indicators describe errors in that space, not in price units.
#
# The model output has shape (n, 1) while y_test has shape (n,). Subtracting
# them directly broadcasts to an (n, n) matrix, which needs memory quadratic
# in the number of test samples. Flatten the predictions first so every
# operation below is element-wise.
test_predict_flat = np.ravel(test_predict)

mae = mean_absolute_error(y_test, test_predict_flat)
mse= mean_squared_error(y_test, test_predict_flat)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, test_predict_flat)
mape = mean_absolute_percentage_error(y_test, test_predict_flat)
# Mean forecast error, defined as prediction minus actual.
mfe = np.mean(test_predict_flat - y_test)
# actual_diff = np.diff(y_test)
# mase = np.mean(np.abs(y_pred - y_test)) / np.mean(np.abs(actual_diff))

In [None]:
# Print the performance indicators
# Mean Absolute Error (MAE): the average absolute error between actual and predicted values. It gives an idea of the magnitude of the errors.
print('MAE:', mae)
# Mean Squared Error (MSE): the average of the squared errors. It penalizes larger errors more than smaller ones.
print('MSE:', mse)
# Root Mean Squared Error (RMSE): the square root of the MSE, expressed on the same scale as the target.
print('RMSE:', rmse)
# R-squared (R2): the proportion of the variance in the dependent variable that is predictable from the independent variables.
# It provides an indication of the goodness of fit of the model.
print('R-squared:', r2)
# Mean Absolute Percentage Error (MAPE): the average of the absolute percentage differences between actual and predicted values.
# It is useful for understanding the accuracy of the model's predictions.
print('MAPE:', mape)
# Mean Forecast Error (MFE): the average of the forecast errors, computed in this notebook as prediction minus actual.
# With that sign convention a positive MFE means the forecasts are, on average, too high, while a negative MFE means they are, on average, too low.
print('MFE:', mfe)
# print('MASE:', mase)

In [None]:
# Per-split predictions on the price scale.
#
# y_pred_rescaled (built in the prediction cell) already holds the train,
# validation and test predictions, in that order, after undoing the min-max
# scaling and the log1p transform. Slicing it gives the same values as
# transforming each split separately, and, unlike overwriting the scaled
# arrays with their own inverse transform, it gives the same result however
# many times this cell is executed.
split_ends = np.cumsum([len(X_train), len(X_val)])
train_predict, val_predict, test_predict = np.split(y_pred_rescaled, split_ends)

In [None]:
# Actual close price against the model's predictions over all splits.
overview_fig, overview_ax = plt.subplots(figsize=(15, 6))
overview_ax.plot(y_rescaled, color='black', label="Actual Close price")
overview_ax.plot(y_pred_rescaled, color='blue', label="Predicted Close price", linestyle = "--")

overview_ax.set_title("Close Price - EURUSD")
overview_ax.set_xlabel("Date")
overview_ax.set_ylabel("Close Price")
overview_ax.legend()
plt.show()

In [None]:
# Interactive version of the actual-vs-predicted plot.
# subplot_titles must be a sequence of strings: ('Close Predictions') is just
# a parenthesised string, so the trailing comma is needed to make a tuple.
fig = make_subplots(rows=1, cols=1, subplot_titles=('Close Predictions',))

# Despite their names, both series cover all splits (train + val + test).
train_close_pred = y_pred_rescaled[:, 0]
train_close_actual = y_rescaled[:, 0]

fig.add_trace(go.Scatter(x=np.arange(len(train_close_actual)), y=train_close_actual, mode='lines', name='Actual', opacity=0.9))
fig.add_trace(go.Scatter(x=np.arange(len(train_close_pred)), y=train_close_pred, mode='lines', name='Predicted', opacity=0.6))

fig.update_layout(title='Close Predictions - EURUSD', template='plotly_dark')
fig.show()

In [None]:
# Actual close price with the predictions of each split drawn in its own
# colour. NaN padding shifts the validation and test curves to their
# position on the shared sample axis.
pad_train = np.full_like(train_predict, np.nan)
pad_val = np.full_like(val_predict, np.nan)

split_fig, split_ax = plt.subplots(figsize=(15, 6))
split_ax.plot(y_rescaled, color='black', label="Actual Close price")
split_ax.plot(train_predict, color='blue', label="Predicted Close price(train set)", linestyle = "--")
split_ax.plot(np.concatenate([pad_train, val_predict]), color='red', label="Predicted Close price(validat set)", linestyle = "--")
split_ax.plot(np.concatenate([pad_train, pad_val, test_predict]), color='green', label="Predicted Close price(test set)", linestyle = "--")

split_ax.set_title("Close Price - EURUSD")
split_ax.set_xlabel("Date")
split_ax.set_ylabel("Close Price")
split_ax.legend()
plt.show()

In [None]:
# Interactive per-split plot against the real candle timestamps.
# subplot_titles must be a sequence of strings, hence the trailing comma.
fig = make_subplots(rows=1, cols=1, subplot_titles=('Close Predictions - EURUSD',))

train_close_actual = y_rescaled[:, 0]
train_pred = train_predict[:, 0]
val_pred = val_predict[:, 0]
test_pred = test_predict[:, 0]

# create_sequences pairs window i with the candle at row i + seq_length, so
# the first target belongs to row seq_length of df, not row 0. Dropping the
# first seq_length timestamps aligns every value with its own candle and gives
# the date axis the same length as the target series.
target_dates = df['From'].iloc[seq_length:].reset_index(drop=True)

n_train, n_val = len(train_pred), len(val_pred)
train_x = target_dates.iloc[:n_train]
val_x = target_dates.iloc[n_train:n_train + n_val]
test_x = target_dates.iloc[n_train + n_val:]

fig.add_trace(go.Scatter(x=target_dates, y=train_close_actual, mode='lines', name='Actual', opacity=0.9, line=dict(color='black')))
fig.add_trace(go.Scatter(x=train_x, y=train_pred, mode='lines', name='Train Predicted', opacity=0.6, line=dict(dash='dash',color='blue')))
fig.add_trace(go.Scatter(x=val_x, y=val_pred, mode='lines', name='Val Predicted', opacity=0.6, line=dict(dash='dash',color='red')))
fig.add_trace(go.Scatter(x=test_x, y=test_pred, mode='lines', name='Test Predicted', opacity=0.6, line=dict(dash='dash',color='green')))

fig.update_layout(title='Close Predictions - EURUSD', template='plotly_white')
fig.show()

In [None]:
# Zoom on the held-out part: actual close price and predictions for the
# validation and test splits only. The training range is padded with NaN so
# the sample axis matches the full-range plots.
hidden_train = np.full_like(train_predict, np.nan)
hidden_val = np.full_like(val_predict, np.nan)
actual_after_train = y_rescaled[len(train_predict):]

holdout_fig, holdout_ax = plt.subplots(figsize=(15, 6))
holdout_ax.plot(np.concatenate([hidden_train, actual_after_train]), color='black', label="Actual Close price")
holdout_ax.plot(np.concatenate([hidden_train, val_predict]), color='red', label="Predicted Close price(validat set)", linestyle = "--")
holdout_ax.plot(np.concatenate([hidden_train, hidden_val, test_predict]), color='green', label="Predicted Close price(test set)", linestyle = "--")

holdout_ax.set_title("Close Price - EURUSD")
holdout_ax.set_xlabel("Date")
holdout_ax.set_ylabel("Close Price")
holdout_ax.legend()
plt.show()

In [None]:
# Frame for residual analysis: actual vs. predicted close price per target candle.
df_LSTM_diff = pd.concat([pd.DataFrame(y_rescaled, columns = ['Close_actual']),
                               pd.DataFrame(y_pred_rescaled, columns = ['Close_pred'])],
                              axis = 1)

# create_sequences pairs window i with the candle at row i + seq_length, so
# the timestamps must skip the first seq_length rows of df. Assigning
# df['From'] directly would align on the index and label every row with the
# timestamp of a candle seq_length steps too early. Passing a plain array
# also makes a length mismatch raise instead of being silently aligned.
df_LSTM_diff['From'] = df['From'].iloc[seq_length:].to_numpy()

# Residual = prediction minus actual.
df_LSTM_diff['resid'] = df_LSTM_diff['Close_pred'] - df_LSTM_diff['Close_actual']

# check
df_LSTM_diff.head(10)