# Data import and verification


In [1]:
import os
import pandas as pd
import numpy as np
# Set your working directory
# os.chdir('/Euro Dollar')


In [2]:
def reformat_date(df):
    for idx, date in enumerate(df['Date']):
        # check if the date does not contain a "/"
        if "/" not in str(date):            # reformat the date to "mm/dd/yyyy"

            # convert the string to a datetime object
            date_obj = pd.to_datetime(date, format='%y%m%d')
# .strftime('%m/%d/%Y')
        else:
            date_obj = pd.to_datetime(date, format='%m/%d/%Y')
            # reformat the date to "mm/dd/yyyy"
        df.loc[idx, 'Date'] = date_obj.strftime('%m/%d/%Y')
        
    return df

In [3]:
# Create a dictionary to store your data
data = {}

# Go through each file in the directory
for file in os.listdir('Euro Dollar/'):
    # Check if the file is a text file
    if file.endswith('.txt'):
        # Load the text file into a DataFrame
        df = pd.read_csv('Euro Dollar/'+file, sep=",", names =["Date","Open","High","Low","Close","Volume","OpenInt"])
        df = reformat_date(df)
        # Use the filename (without the .txt extension) as the key
        key = file[:-4]
        # Store the DataFrame in the dictionary
        data[key] = df


In [4]:
first_contract = list(data.keys())[0]
print(data[first_contract].dtypes)


Date        object
Open       float64
High       float64
Low        float64
Close      float64
Volume       int64
OpenInt      int64
dtype: object


In [5]:
data[list(data.keys())[15]]

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,02/11/1982,84.35,84.50,84.35,84.35,21,21
1,02/12/1982,84.35,84.35,84.35,84.35,0,21
2,02/16/1982,84.20,84.45,84.18,84.45,20,38
3,02/17/1982,84.59,84.64,84.45,84.45,8,41
4,02/18/1982,84.70,84.70,84.70,84.70,0,41
...,...,...,...,...,...,...,...
207,12/07/1982,90.62,90.65,90.58,90.58,616,3207
208,12/08/1982,90.56,90.58,90.46,90.48,594,3052
209,12/09/1982,90.48,90.51,90.34,90.50,871,2530
210,12/10/1982,90.37,90.37,90.18,90.21,858,2038


# Handle missing values

In [6]:
for contract, df in data.items():
    if df.isna().any().any():
        print(f"Missing values in {contract}")

# Date feature engineering

In [7]:
for contract, df in data.items():
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=False)

In [8]:
for contract, df in data.items():
    df['Year'] = df['Date'].dt.year
    df['Quarter'] = df['Date'].dt.quarter
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek


In [9]:
for contract, df in data.items():
    df['Month_sin'] = np.sin(2 * np.pi * df['Month']/12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month']/12)
    df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['DayOfWeek']/7)
    df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['DayOfWeek']/7)
    # And so on for other cyclical features


In [10]:
first_contract = list(data.keys())[7]
print(data[first_contract].head())


        Date   Open   High    Low  Close  Volume  OpenInt  Year  Quarter  \
0 1993-12-06  92.58  92.58  92.58  92.58       0       10  1993        4   
1 1993-12-08  92.59  92.59  92.59  92.59       0       10  1993        4   
2 1993-12-09  92.63  92.63  92.63  92.63       0       10  1993        4   
3 1993-12-10  92.61  92.61  92.61  92.61       0       10  1993        4   
4 1993-12-13  92.57  92.57  92.57  92.57       0       10  1993        4   

   Month  Week  Day  DayOfWeek     Month_sin  Month_cos  DayOfWeek_sin  \
0     12    49    6          0 -2.449294e-16        1.0       0.000000   
1     12    49    8          2 -2.449294e-16        1.0       0.974928   
2     12    49    9          3 -2.449294e-16        1.0       0.433884   
3     12    49   10          4 -2.449294e-16        1.0      -0.433884   
4     12    50   13          0 -2.449294e-16        1.0       0.000000   

   DayOfWeek_cos  
0       1.000000  
1      -0.222521  
2      -0.900969  
3      -0.900969  
4  

# Lag Features

# Split data into test and train sets

In [11]:
# Import the required library
from sklearn.model_selection import train_test_split

# Define a dictionary to hold your training and testing data
train_data = {}
test_data = {}

# Split each contract's data into a training and testing set
for contract, df in data.items():
    train, test = train_test_split(df, test_size=0.2, shuffle=False)
    train_data[contract] = train
    test_data[contract] = test


In [12]:
first_contract = list(train_data.keys())[0]
print(f"Training data for {first_contract}: {train_data[first_contract].shape}")
print(f"Testing data for {first_contract}: {test_data[first_contract].shape}")


Training data for ED00H: (440, 17)
Testing data for ED00H: (110, 17)


In [13]:
from sklearn.preprocessing import MinMaxScaler

# Define the columns to be scaled
cols_to_scale = ['Open', 'High', 'Low', 'Close', 'Volume', 'OpenInt']

# Initialize a scaler with a feature range from 0 to 1
scaler = MinMaxScaler(feature_range=(0, 1))

# for contract, df in data.items():
#     df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
# Assume train and test are your training and testing data
scaler = MinMaxScaler(feature_range=(0, 1))

train[cols_to_scale] = scaler.fit_transform(train[cols_to_scale])
test[cols_to_scale] = scaler.transform(test[cols_to_scale])
# Define the number of lags
n_lags = 3

# Define the columns for which you want to create lag features
cols_to_lag = ['Open', 'High', 'Low', 'Close', 'Volume', 'OpenInt']

for contract, df in data.items():
    for col in cols_to_lag:
        for lag in range(1, n_lags + 1):
            train[f'{col}_lag{lag}'] = train[col].shift(lag)
            test[f'{col}_lag{lag}'] = test[col].shift(lag)
test.dropna(inplace=True)
train.dropna(inplace=True)

for contract, df in data.items():
    df.dropna(inplace=True)


first_contract = list(data.keys())[4]
print(data[first_contract].head())


        Date   Open   High    Low  Close  Volume  OpenInt  Year  Quarter  \
0 1993-11-16  92.92  92.92  92.92  92.92       1       20  1993        4   
1 1993-11-17  92.95  92.95  92.95  92.95       0       20  1993        4   
2 1993-11-18  92.93  92.94  92.91  92.91       8       20  1993        4   
3 1993-11-19  92.81  92.81  92.81  92.81       0       20  1993        4   
4 1993-11-22  92.74  92.74  92.74  92.74       0       20  1993        4   

   Month  Week  Day  DayOfWeek  Month_sin  Month_cos  DayOfWeek_sin  \
0     11    46   16          1       -0.5   0.866025       0.781831   
1     11    46   17          2       -0.5   0.866025       0.974928   
2     11    46   18          3       -0.5   0.866025       0.433884   
3     11    46   19          4       -0.5   0.866025      -0.433884   
4     11    47   22          0       -0.5   0.866025       0.000000   

   DayOfWeek_cos  
0       0.623490  
1      -0.222521  
2      -0.900969  
3      -0.900969  
4       1.000000  


In [14]:
first_contract = list(data.keys())[0]
print(data[first_contract].head())

        Date   Open   High    Low  Close  Volume  OpenInt  Year  Quarter  \
0 1998-01-02  93.96  94.04  93.96  94.03    4766    77270  1998        1   
1 1998-01-05  94.12  94.21  94.11  94.20    6069    77700  1998        1   
2 1998-01-06  94.21  94.25  94.18  94.24    7574    77637  1998        1   
3 1998-01-07  94.22  94.27  94.22  94.22    6172    77888  1998        1   
4 1998-01-08  94.28  94.35  94.28  94.33    9414    79446  1998        1   

   Month  Week  Day  DayOfWeek  Month_sin  Month_cos  DayOfWeek_sin  \
0      1     1    2          4        0.5   0.866025      -0.433884   
1      1     2    5          0        0.5   0.866025       0.000000   
2      1     2    6          1        0.5   0.866025       0.781831   
3      1     2    7          2        0.5   0.866025       0.974928   
4      1     2    8          3        0.5   0.866025       0.433884   

   DayOfWeek_cos  
0      -0.900969  
1       1.000000  
2       0.623490  
3      -0.222521  
4      -0.900969  


In [15]:
train

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,Year,Quarter,Month,...,Low_lag3,Close_lag1,Close_lag2,Close_lag3,Volume_lag1,Volume_lag2,Volume_lag3,OpenInt_lag1,OpenInt_lag2,OpenInt_lag3
3,1993-09-17,0.646575,0.636856,0.643741,0.637954,0.000423,0.000522,1993,3,9,...,0.660248,0.640646,0.648721,0.646030,0.000846,0.001016,0.001862,0.000426,0.000232,0.000000
4,1993-09-20,0.632877,0.623306,0.640990,0.627187,0.000254,0.000580,1993,3,9,...,0.632737,0.637954,0.640646,0.648721,0.000423,0.000846,0.001016,0.000522,0.000426,0.000232
5,1993-09-21,0.619178,0.617886,0.627235,0.616420,0.002539,0.001161,1993,3,9,...,0.654746,0.627187,0.637954,0.640646,0.000254,0.000423,0.000846,0.000580,0.000522,0.000426
6,1993-09-22,0.610959,0.601626,0.613480,0.602961,0.001354,0.001412,1993,3,9,...,0.643741,0.616420,0.627187,0.637954,0.002539,0.000254,0.000423,0.001161,0.000580,0.000522
7,1993-09-23,0.608219,0.604336,0.613480,0.600269,0.001168,0.001509,1993,3,9,...,0.640990,0.602961,0.616420,0.627187,0.001354,0.002539,0.000254,0.001412,0.001161,0.000580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,1998-09-02,0.928767,0.926829,0.935351,0.927322,0.403213,0.939989,1998,3,9,...,0.921596,0.932705,0.928668,0.916555,0.593013,0.303108,0.665307,0.954046,0.945371,0.939776
1256,1998-09-03,0.949315,0.941734,0.954608,0.936743,0.499154,0.958829,1998,3,9,...,0.925722,0.927322,0.932705,0.928668,0.403213,0.593013,0.303108,0.939989,0.954046,0.945371
1257,1998-09-04,0.938356,0.941734,0.943604,0.942127,0.380565,0.947039,1998,3,9,...,0.944979,0.936743,0.927322,0.932705,0.499154,0.403213,0.593013,0.958829,0.939989,0.954046
1258,1998-09-08,0.958904,0.966125,0.965612,0.963661,0.531636,0.981523,1998,3,9,...,0.935351,0.942127,0.936743,0.927322,0.380565,0.499154,0.403213,0.947039,0.958829,0.939989


In [16]:
def create_sequences(data, seq_length):
    xs = []
    ys = []
    dates = []
    
    for i in range(len(data) - seq_length - 1):
        x = data.iloc[i:(i + seq_length)].drop('Date', axis=1).values
        y = data.iloc[i + seq_length + 1]['Volume']
        date = data.iloc[i + seq_length + 1]['Date']
        xs.append(x)
        ys.append(y)
        dates.append(date)
    
    return np.array(xs), np.array(ys), np.array(dates)

seq_length = 10
train_sequences = {}
test_sequences = {}

for contract in train_data.keys():
    X_train, y_train, dates_train = create_sequences(train_data[contract], seq_length)
    X_test, y_test, dates_test = create_sequences(test_data[contract], seq_length)

    
    train_sequences[contract] = (X_train, y_train)
    test_sequences[contract] = (X_test, y_test)



In [17]:
first_contract = list(train_sequences.keys())[0]
print(f"Training sequences for {first_contract}: {train_sequences[first_contract][0].shape}")
print(f"Testing sequences for {first_contract}: {test_sequences[first_contract][0].shape}")


Training sequences for ED00H: (429, 10, 16)
Testing sequences for ED00H: (99, 10, 16)


In [18]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define your model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, X_train.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')


In [19]:
# Train the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

predictions = model.predict(X_test)
result = pd.DataFrame({
    'Date': dates_test,
    'Prediction': predictions.flatten(),  # assuming predictions is 2D
    'Actual': y_test
})


# Define your roll over threshold
rollover_threshold = 0.8  # adjust this based on your data and task

# Check whether the predicted volume is below the threshold
rollover_prediction = predictions / X_test[:, -1, 4] < rollover_threshold

print(rollover_prediction)
true_predictions = result[result['Prediction'] < rollover_threshold]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]]


In [20]:
print(X_train.dtype)
print(y_train.dtype)
print(X_test.dtype)
print(y_test.dtype)


float64
float64
float64
float64


In [21]:
true_predictions


Unnamed: 0,Date,Prediction,Actual
0,1998-09-30,-0.734494,1.057237
1,1998-10-01,-1.014554,1.066895
2,1998-10-02,-1.144956,1.082102
3,1998-10-05,-2.904843,1.078848
4,1998-10-06,-4.171597,1.068276
...,...,...,...
297,1999-12-07,-4.219480,1.599524
298,1999-12-08,-3.188779,1.535705
299,1999-12-09,-3.275266,1.514079
300,1999-12-10,-2.037351,1.493021
