In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import os
import pandas as pd
import numpy as np

Building a function to assign a label to each row. The labels ('decrease', 'increase', 'minimal') indicate whether the day's change following some premarket price change was negative, positive, or exactly zero, respectively.

This function is called later, inside a lambda passed to `DataFrame.apply`, to append a new `label` column to the data.

In [2]:
# function to determine label to apply to data pair
def get_class(row, threshold: float = 0.0) -> str:
    """Classify one row by the direction of its ``day_change`` value.

    Parameters
    ----------
    row
        Any object indexable by ``'day_change'`` (for example the Series
        that ``DataFrame.apply(..., axis=1)`` passes for each row).
    threshold
        Non-negative half-width of the band around zero that counts as
        'minimal'. The default of 0.0 is a pure sign test, so only a
        change of exactly zero is labelled 'minimal'.

    Returns
    -------
    str
        'decrease' if the change is below ``-threshold``, 'increase' if it
        is above ``threshold``, otherwise 'minimal'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative (the two bands would overlap).
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")

    change = row['day_change']
    if change < -threshold:
        return 'decrease'
    elif change > threshold:
        return 'increase'
    else:
        # Also reached for NaN, because both comparisons above are False.
        return 'minimal'

Iterating through every cleaned data file, without regard to the underlying market or sector, and concatenating all of the data into a single DataFrame.

In [None]:
# Collect all cleaned data and concat into a single DataFrame for fitting the model

directory = 'clean_stock_data'

# os.listdir() returns entries in arbitrary order. The row order of full_df feeds
# the seeded train_test_split later in the notebook, so sort the listing to keep
# the split reproducible across machines and file systems.
filenames = sorted(os.listdir(directory))
total_files = len(filenames)

df_tracker = []

for n, filename in enumerate(filenames, start=1):

    # progress counter; the total comes from the actual listing rather than a fixed number
    print(f"parsing data from {filename} to features and labels... {n}/{total_files}")

    try:
        data = pd.read_csv(os.path.join(directory, filename))
    except Exception as err:
        # Best-effort load: skip unreadable entries but report why they failed.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"Failed to copy data from {filename} to full_df... ({err!r})")
    else:
        df_tracker.append(data)

# pd.concat raises a terse ValueError on an empty list; fail with a clearer message instead
if not df_tracker:
    raise ValueError(f"No readable data files found in '{directory}'")

full_df = pd.concat(df_tracker, ignore_index=True)

Applying the get_class() function to assign labels to each row.

In [4]:
# Attach a 'label' column: get_class is evaluated once per row (axis='columns' is the same as axis=1)
full_df['label'] = full_df.apply(
    func=lambda record: get_class(record),
    axis='columns',
)

Parsing features and labels and calling train_test_split() to separate training data and validation data.

In [5]:
# Column names for the single predictor and the target
feature = 'premarket_change'
label = 'label'

# The target stays a pandas Series; the lone feature column is reshaped into a
# 2-D array with one column (one row per observation)
y = full_df[label]
X = full_df[feature].to_numpy().reshape(-1, 1)

# Hold back an out-of-sample portion for validation; the fixed random_state
# makes the shuffle repeatable
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

Fitting the model to the training data with default parameters.

In [None]:
# Random forest with library defaults; only the seed is pinned
model = RandomForestClassifier(random_state=1)

# Train on the in-sample split (left as the last expression so the cell echoes the estimator)
model.fit(X=train_X, y=train_y)

Assessing the accuracy of the model's predictions on the validation data.

In [7]:
# Predict labels for the held-out rows, then report the fraction that match the true labels
predictions = model.predict(val_X)
val_accuracy = accuracy_score(y_true=val_y, y_pred=predictions)

print(val_accuracy)

0.4942883537118026


Resulting validation accuracy: approximately 49.43% (0.4942883537118026).