#### You can find the dataset I started with, and finished with here:

Start dataset: https://drive.google.com/file/d/1HAWgJpsows16hIDv3vg1szH9XQ7CGR6C/view?usp=drive_link
New dataset: https://drive.google.com/file/d/1HAWgJpsows16hIDv3vg1szH9XQ7CGR6C/view?usp=drive_link

### Script to add SMA_50 and EMA_50 to the dataset and remove the 'TICKER' and 'PER' column

In [76]:
# Script to add SMA_50 and EMA_50 to the dataset and remove the 'TICKER' column

import pandas as pd
from stockstats import StockDataFrame

# Define the column names
columns = ['TICKER', 'PER', 'DATE', 'TIME', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL']

# Read the CSV file, skip the first row (if it is a header), and set the column names
# Change 'skiprows=1' to 'skiprows=0' if the first row contains data
df = pd.read_csv('/Users/ronangrant/Downloads/start_raw_ dataset_EURUSD.csv', sep=';', header=None, skiprows=1, names=columns)

# Remove the 'TICKER' column
df.drop(['TICKER','PER'], axis=1, inplace=True)

# Convert 'DATE' and 'TIME' into a single datetime column with specified format
df['datetime'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'], format='%m/%d/%y %H:%M:%S')
df = df.set_index('datetime')

# Drop the original 'DATE' and 'TIME' columns as they are no longer needed
df.drop(['DATE', 'TIME'], axis=1, inplace=True)

# Convert the DataFrame to a StockDataFrame for calculation
stock_df = StockDataFrame.retype(df)

# Calculate SMA and EMA for 50 periods
df['SMA_50'] = stock_df['close_50_sma'].round(5)
df['EMA_50'] = stock_df['close_50_ema'].round(5)


# Display the first few rows of the updated DataFrame
print(df.head())


                        open     high      low    close   vol   SMA_50  \
datetime                                                                 
2022-12-01 00:00:00  1.04072  1.04088  1.04015  1.04052  7317  1.04052   
2022-12-01 00:01:00  1.04050  1.04090  1.03965  1.03999  7020  1.04026   
2022-12-01 00:02:00  1.03998  1.04055  1.03980  1.04039  5505  1.04030   
2022-12-01 00:03:00  1.04036  1.04062  1.04015  1.04040  2764  1.04032   
2022-12-01 00:04:00  1.04049  1.04058  1.04020  1.04054  1564  1.04037   

                      EMA_50  
datetime                      
2022-12-01 00:00:00  1.04052  
2022-12-01 00:01:00  1.04025  
2022-12-01 00:02:00  1.04030  
2022-12-01 00:03:00  1.04033  
2022-12-01 00:04:00  1.04037  


In [24]:
df

Unnamed: 0_level_0,open,high,low,close,vol,SMA_50,EMA_50
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-12-01 00:00:00,1.04072,1.04088,1.04015,1.04052,7317,1.04052,1.04052
2022-12-01 00:01:00,1.04050,1.04090,1.03965,1.03999,7020,1.04026,1.04025
2022-12-01 00:02:00,1.03998,1.04055,1.03980,1.04039,5505,1.04030,1.04030
2022-12-01 00:03:00,1.04036,1.04062,1.04015,1.04040,2764,1.04032,1.04033
2022-12-01 00:04:00,1.04049,1.04058,1.04020,1.04054,1564,1.04037,1.04037
...,...,...,...,...,...,...,...
2023-12-25 23:55:00,1.10070,1.10070,1.10070,1.10070,2,1.10085,1.10082
2023-12-25 23:56:00,1.10070,1.10070,1.10070,1.10070,2,1.10085,1.10081
2023-12-25 23:57:00,1.10070,1.10070,1.10070,1.10070,2,1.10084,1.10081
2023-12-25 23:58:00,1.10070,1.10070,1.10070,1.10070,2,1.10084,1.10080


### Script to convert time zone of the DataFrame 'df' from MSK to ET

In [77]:
# Script to convert time zone of the DataFrame 'df' from MSK to ET

import pytz

# Assuming the current time zone is Moscow Time (MSK)
moscow_tz = pytz.timezone('Europe/Moscow')
df.index = df.index.tz_localize(moscow_tz)

# Convert to Eastern Time (ET)
et_tz = pytz.timezone('America/New_York')
df.index = df.index.tz_convert(et_tz)

# Display the first few rows of the updated DataFrame
print(df.head())


                              open     high      low    close   vol   SMA_50  \
datetime                                                                       
2022-11-30 16:00:00-05:00  1.04072  1.04088  1.04015  1.04052  7317  1.04052   
2022-11-30 16:01:00-05:00  1.04050  1.04090  1.03965  1.03999  7020  1.04026   
2022-11-30 16:02:00-05:00  1.03998  1.04055  1.03980  1.04039  5505  1.04030   
2022-11-30 16:03:00-05:00  1.04036  1.04062  1.04015  1.04040  2764  1.04032   
2022-11-30 16:04:00-05:00  1.04049  1.04058  1.04020  1.04054  1564  1.04037   

                            EMA_50  
datetime                            
2022-11-30 16:00:00-05:00  1.04052  
2022-11-30 16:01:00-05:00  1.04025  
2022-11-30 16:02:00-05:00  1.04030  
2022-11-30 16:03:00-05:00  1.04033  
2022-11-30 16:04:00-05:00  1.04037  


In [49]:
df

Unnamed: 0_level_0,open,high,low,close,vol,SMA_50,EMA_50
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-11-30 16:00:00-05:00,1.04072,1.04088,1.04015,1.04052,7317,1.04052,1.04052
2022-11-30 16:01:00-05:00,1.04050,1.04090,1.03965,1.03999,7020,1.04026,1.04025
2022-11-30 16:02:00-05:00,1.03998,1.04055,1.03980,1.04039,5505,1.04030,1.04030
2022-11-30 16:03:00-05:00,1.04036,1.04062,1.04015,1.04040,2764,1.04032,1.04033
2022-11-30 16:04:00-05:00,1.04049,1.04058,1.04020,1.04054,1564,1.04037,1.04037
...,...,...,...,...,...,...,...
2023-12-25 15:55:00-05:00,1.10070,1.10070,1.10070,1.10070,2,1.10085,1.10082
2023-12-25 15:56:00-05:00,1.10070,1.10070,1.10070,1.10070,2,1.10085,1.10081
2023-12-25 15:57:00-05:00,1.10070,1.10070,1.10070,1.10070,2,1.10084,1.10081
2023-12-25 15:58:00-05:00,1.10070,1.10070,1.10070,1.10070,2,1.10084,1.10080


### # Script to add RSI_14, MACD, MACD_Signal, SMA_20, Bollinger Bands, Day of Week, %K, %D, and ATR to the dataset

In [78]:
# Script to add RSI_14, MACD, MACD_Signal, SMA_20, Bollinger Bands, Day of Week, %K, %D, and ATR to the dataset

import pandas as pd
from stockstats import StockDataFrame

# Convert the DataFrame to a StockDataFrame for calculation
stock_df = StockDataFrame.retype(df)

# Calculate RSI for 14 periods
df['RSI_14'] = stock_df['rsi_14']

# Calculate MACD and MACD Signal
df['MACD'] = stock_df['macd']
df['MACD_Signal'] = stock_df['macds']

# Calculate SMA for 20 periods
df['SMA_20'] = stock_df['close_20_sma']

# Calculate Bollinger Bands
df['Bollinger_Upper'] = stock_df['boll_ub']
df['Bollinger_Lower'] = stock_df['boll_lb']

# Extract day of the week from the datetime index
df['day_of_week'] = df.index.dayofweek

# Calculate %K and %D for Stochastic Oscillator
df['%K'] = stock_df['kdjk']
df['%D'] = stock_df['kdjd']

# Calculate Average True Range (ATR)
df['ATR'] = stock_df['atr_14']

# Display the first few rows of the updated DataFrame
print(df.head())


                              open     high      low    close   vol   sma_50  \
datetime                                                                       
2022-11-30 16:00:00-05:00  1.04072  1.04088  1.04015  1.04052  7317  1.04052   
2022-11-30 16:01:00-05:00  1.04050  1.04090  1.03965  1.03999  7020  1.04026   
2022-11-30 16:02:00-05:00  1.03998  1.04055  1.03980  1.04039  5505  1.04030   
2022-11-30 16:03:00-05:00  1.04036  1.04062  1.04015  1.04040  2764  1.04032   
2022-11-30 16:04:00-05:00  1.04049  1.04058  1.04020  1.04054  1564  1.04037   

                            ema_50     RSI_14      MACD  MACD_Signal  \
datetime                                                               
2022-11-30 16:00:00-05:00  1.04052        NaN  0.000000     0.000000   
2022-11-30 16:01:00-05:00  1.04025   0.000000 -0.000012    -0.000007   
2022-11-30 16:02:00-05:00  1.04030  44.835869 -0.000003    -0.000005   
2022-11-30 16:03:00-05:00  1.04033  45.493823  0.000002    -0.000003   
2022-11

### # Script to round selected columns to different decimal places

In [79]:
# Script to round selected columns to different decimal places

# Rounding RSI_14, %K, %D to 2 decimal places
df[['RSI_14', '%K', '%D']] = df[['RSI_14', '%K', '%D']].round(3)

# Rounding MACD, MACD_Signal, ATR to 6 decimal places
df[['MACD', 'MACD_Signal', 'ATR']] = df[['MACD', 'MACD_Signal', 'ATR']].round(7)

# Rounding SMA_20, Bollinger_Upper, Bollinger_Lower to 5 decimal places
df[['SMA_20', 'Bollinger_Upper', 'Bollinger_Lower']] = df[['SMA_20', 'Bollinger_Upper', 'Bollinger_Lower']].round(5)

# Display the first few rows of the updated DataFrame
print(df.head())


                              open     high      low    close   vol   sma_50  \
datetime                                                                       
2022-11-30 16:00:00-05:00  1.04072  1.04088  1.04015  1.04052  7317  1.04052   
2022-11-30 16:01:00-05:00  1.04050  1.04090  1.03965  1.03999  7020  1.04026   
2022-11-30 16:02:00-05:00  1.03998  1.04055  1.03980  1.04039  5505  1.04030   
2022-11-30 16:03:00-05:00  1.04036  1.04062  1.04015  1.04040  2764  1.04032   
2022-11-30 16:04:00-05:00  1.04049  1.04058  1.04020  1.04054  1564  1.04037   

                            ema_50  RSI_14      MACD  MACD_Signal   SMA_20  \
datetime                                                                     
2022-11-30 16:00:00-05:00  1.04052     NaN  0.000000     0.000000  1.04052   
2022-11-30 16:01:00-05:00  1.04025   0.000 -0.000012    -0.000007  1.04026   
2022-11-30 16:02:00-05:00  1.04030  44.836 -0.000003    -0.000005  1.04030   
2022-11-30 16:03:00-05:00  1.04033  45.494  0.000

### Script to clean the dataset by removing weekends, specific Fridays, NaN values, and specific dates


In [80]:
# Script to clean the dataset by removing weekends, specific Fridays, NaN values, and specific dates

# Import necessary libraries
import pandas as pd

# Convert index to datetime if it's not already
df.index = pd.to_datetime(df.index)

# Remove Saturdays and Sundays
df = df[df.index.dayofweek < 5]

# Remove Fridays after 11 am
df = df[~((df.index.dayofweek == 4) & (df.index.hour >= 11))]

# Remove rows with any NaN or missing values
df = df.dropna()

# List of specific dates to remove
dates_to_remove = ["2023-12-25", "2023-11-16", "2023-10-31", "2022-12-08", 
                   "2023-03-29", "2023-05-17", "2023-07-11", "2023-08-02", 
                   "2023-01-16", "2023-02-20", "2023-04-07", "2023-05-29", 
                   "2023-06-19", "2023-07-04"]

# Convert the list to datetime for comparison
dates_to_remove = pd.to_datetime(dates_to_remove)

# Remove the specific dates
df = df[~df.index.normalize().isin(dates_to_remove)]

# Display the first few rows of the cleaned DataFrame
print(df.head())


                              open     high      low    close   vol   sma_50  \
datetime                                                                       
2022-11-30 16:01:00-05:00  1.04050  1.04090  1.03965  1.03999  7020  1.04026   
2022-11-30 16:02:00-05:00  1.03998  1.04055  1.03980  1.04039  5505  1.04030   
2022-11-30 16:03:00-05:00  1.04036  1.04062  1.04015  1.04040  2764  1.04032   
2022-11-30 16:04:00-05:00  1.04049  1.04058  1.04020  1.04054  1564  1.04037   
2022-11-30 16:05:00-05:00  1.04053  1.04070  1.04030  1.04052  1333  1.04039   

                            ema_50  RSI_14      MACD  MACD_Signal   SMA_20  \
datetime                                                                     
2022-11-30 16:01:00-05:00  1.04025   0.000 -0.000012    -0.000007  1.04026   
2022-11-30 16:02:00-05:00  1.04030  44.836 -0.000003    -0.000005  1.04030   
2022-11-30 16:03:00-05:00  1.04033  45.494  0.000002    -0.000003  1.04032   
2022-11-30 16:04:00-05:00  1.04037  53.802  0.000

### Script to create a target column for binary options prediction


In [81]:
# Script to create a target column for binary options prediction

# Define the function to create the target variable
def create_target_variable(df, future_minutes):
    """
    Creates a target variable indicating if the price goes up or down in the specified future timeframe.
    1 indicates an increase, and 0 indicates a decrease or no change.
    """
    # Shift the closing price by the specified future minutes
    df['Future_Close'] = df['close'].shift(-future_minutes)

    # Determine if the future closing price is higher than the current closing price
    df['Target'] = (df['Future_Close'] > df['close']).astype(int)

    # Drop the last 'future_minutes' rows as they won't have a future price
    df.dropna(subset=['Future_Close'], inplace=True)

    # Optionally, drop the 'Future_Close' column if it's no longer needed
    df.drop(columns=['Future_Close'], inplace=True)

    return df

# Specify the future time frame for prediction (60 minutes)
future_minutes = 60

# Create the target column
df = create_target_variable(df, future_minutes)

# Display the first few rows of the updated DataFrame
print(df.head())


                              open     high      low    close   vol   sma_50  \
datetime                                                                       
2022-11-30 16:01:00-05:00  1.04050  1.04090  1.03965  1.03999  7020  1.04026   
2022-11-30 16:02:00-05:00  1.03998  1.04055  1.03980  1.04039  5505  1.04030   
2022-11-30 16:03:00-05:00  1.04036  1.04062  1.04015  1.04040  2764  1.04032   
2022-11-30 16:04:00-05:00  1.04049  1.04058  1.04020  1.04054  1564  1.04037   
2022-11-30 16:05:00-05:00  1.04053  1.04070  1.04030  1.04052  1333  1.04039   

                            ema_50  RSI_14      MACD  MACD_Signal   SMA_20  \
datetime                                                                     
2022-11-30 16:01:00-05:00  1.04025   0.000 -0.000012    -0.000007  1.04026   
2022-11-30 16:02:00-05:00  1.04030  44.836 -0.000003    -0.000005  1.04030   
2022-11-30 16:03:00-05:00  1.04033  45.494  0.000002    -0.000003  1.04032   
2022-11-30 16:04:00-05:00  1.04037  53.802  0.000

### Remove possible off days or outliers

In [82]:
# Script to remove rows where RSI_14 is 0

# Filter out rows where RSI_14 is 0
df = df[df['RSI_14'] != 0]

# Display the first few rows of the updated DataFrame
print(df.head())


                              open     high      low    close   vol   sma_50  \
datetime                                                                       
2022-11-30 16:02:00-05:00  1.03998  1.04055  1.03980  1.04039  5505  1.04030   
2022-11-30 16:03:00-05:00  1.04036  1.04062  1.04015  1.04040  2764  1.04032   
2022-11-30 16:04:00-05:00  1.04049  1.04058  1.04020  1.04054  1564  1.04037   
2022-11-30 16:05:00-05:00  1.04053  1.04070  1.04030  1.04052  1333  1.04039   
2022-11-30 16:06:00-05:00  1.04057  1.04087  1.04034  1.04061  2002  1.04042   

                            ema_50  RSI_14      MACD  MACD_Signal   SMA_20  \
datetime                                                                     
2022-11-30 16:02:00-05:00  1.04030  44.836 -0.000003    -0.000005  1.04030   
2022-11-30 16:03:00-05:00  1.04033  45.494  0.000002    -0.000003  1.04032   
2022-11-30 16:04:00-05:00  1.04037  53.802  0.000010     0.000001  1.04037   
2022-11-30 16:05:00-05:00  1.04040  52.569  0.000

### Script to save the csv file

In [83]:
# Script to save the DataFrame 'df' to a CSV file, including the datetime index

# Save the DataFrame as a CSV file
csv_file_path = '/Users/ronangrant/Downloads/dataset_stockstat.csv'
df.to_csv(csv_file_path, index=True)  

### Random forest with time columns

In [84]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def train_and_evaluate_model(file_path):
    print(f"Starting processing on {file_path}")

    # Load the dataset
    df = pd.read_csv(file_path)

    # Convert 'datetime' from index to hour and minute, then drop the original 'datetime' column
    df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
    df['hour'] = df['datetime'].dt.hour
    df['minute'] = df['datetime'].dt.minute
    df.drop('datetime', axis=1, inplace=True)

    # Define features and target
    X = df.drop('Target', axis=1)
    y = df['Target']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Random Forest classifier
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

# File path for the dataset
dataset_path = '/Users/ronangrant/Downloads/dataset_stockstat.csv'  # Replace with your CSV file path

# Train the model and evaluate its performance
train_and_evaluate_model(dataset_path)


Starting processing on /Users/ronangrant/Downloads/dataset_stockstat.csv
Accuracy: 0.9259897702546133


### Without time column, as these values may be off.

In [87]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def train_and_evaluate_model(file_path):
    print(f"Starting processing on {file_path}")

    # Load the dataset
    df = pd.read_csv(file_path)

    # Drop columns that are not numeric (like datetime strings)
    columns_to_drop = ['datetime']
    df.drop(columns=columns_to_drop, axis=1, inplace=True, errors='ignore')

    # Define features and target
    # Ensure 'Target' is your actual target column name
    X = df.drop('Target', axis=1)
    y = df['Target']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Random Forest classifier
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    y_pred = rf_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

# File path for the dataset
dataset_path = '/Users/ronangrant/Downloads/dataset_stockstat.csv'  # Replace with your CSV file path

# Train the model and evaluate its performance
train_and_evaluate_model(dataset_path)


Starting processing on /Users/ronangrant/Downloads/dataset_stockstat.csv
           open     high      low    close   vol   sma_50   ema_50  RSI_14  \
0       1.03998  1.04055  1.03980  1.04039  5505  1.04030  1.04030  44.836   
1       1.04036  1.04062  1.04015  1.04040  2764  1.04032  1.04033  45.494   
2       1.04049  1.04058  1.04020  1.04054  1564  1.04037  1.04037  53.802   
3       1.04053  1.04070  1.04030  1.04052  1333  1.04039  1.04040  52.569   
4       1.04057  1.04087  1.04034  1.04061  2002  1.04042  1.04043  57.309   
...         ...      ...      ...      ...   ...      ...      ...     ...   
353861  1.10080  1.10090  1.10080  1.10090     2  1.10085  1.10083  76.472   
353862  1.10090  1.10090  1.10090  1.10090     2  1.10084  1.10083  76.472   
353863  1.10090  1.10090  1.10090  1.10090     2  1.10083  1.10083  76.472   
353864  1.10090  1.10090  1.10090  1.10090     2  1.10083  1.10083  76.472   
353865  1.10090  1.10090  1.10090  1.10090     2  1.10082  1.10084  7