In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer

In [111]:
df = pd.read_csv('../raw_data/sorted_feature_matrix.csv').drop(columns='Unnamed: 0')

In [112]:
df.isnull().sum()

Date                  0
Dividend              0
Volume                0
stock_price           0
fed_funds_rate        0
GDP                   0
Tickers               0
debt_to_equity        0
EPS                   0
return_on_equity      0
quick ratio           0
operating_ratio       0
inventory_turnover    0
pos_ma                0
neu_ma                0
neg_ma                0
dtype: int64

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32914 entries, 0 to 32913
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                32914 non-null  object 
 1   Dividend            32914 non-null  float64
 2   Volume              32914 non-null  float64
 3   stock_price         32914 non-null  float64
 4   fed_funds_rate      32914 non-null  float64
 5   GDP                 32914 non-null  float64
 6   Tickers             32914 non-null  object 
 7   debt_to_equity      32914 non-null  float64
 8   EPS                 32914 non-null  float64
 9   return_on_equity    32914 non-null  float64
 10  quick ratio         32914 non-null  float64
 11  operating_ratio     32914 non-null  float64
 12  inventory_turnover  32914 non-null  float64
 13  pos_ma              32914 non-null  float64
 14  neu_ma              32914 non-null  float64
 15  neg_ma              32914 non-null  float64
dtypes: f

## Todo
1. train-test-split
2. Standard scale float,
3. Create seq
4.  Model, embedding layer w mse and rmse


In [114]:

df['Date'] = pd.to_datetime(df['Date'])
data = df.copy()

data


Unnamed: 0,Date,Dividend,Volume,stock_price,fed_funds_rate,GDP,Tickers,debt_to_equity,EPS,return_on_equity,quick ratio,operating_ratio,inventory_turnover,pos_ma,neu_ma,neg_ma
0,2018-01-31,0.00,5.315047e+06,51.919825,1.41,4500.182000,AAL,-134.253165,0.807713,-0.941772,0.484585,0.476713,1.930838,0.285655,5.198588e-01,1.944860e-01
1,2018-01-31,0.00,1.795523e+08,40.275178,1.41,4500.182000,AAPL,2.413301,2.970566,0.131828,1.098802,0.743752,9.811931,0.222114,6.974815e-01,8.040471e-02
2,2018-01-31,0.71,7.794467e+06,87.747239,1.41,4500.182000,ABBV,-23.651147,1.818001,-0.940431,1.083967,0.657965,1.027436,0.351660,3.990092e-01,2.493303e-01
3,2018-01-31,0.28,7.860137e+06,54.655791,1.41,4500.182000,ABT,1.333073,0.319962,0.018336,1.128287,0.829424,0.837345,0.323488,5.800269e-01,9.648529e-02
4,2018-01-31,0.00,1.614810e+06,29.889667,1.41,4500.182000,ACGL,2.331811,0.564373,0.024308,0.679544,0.278000,0.000000,0.000003,1.541489e-05,9.999820e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32909,2023-09-30,0.00,2.114983e+06,133.590376,5.08,5105.025000,YUM,-1.693220,1.487544,-0.049550,0.721587,0.427386,0.938914,0.008204,4.912204e-01,5.005759e-01
32910,2023-09-30,0.00,1.614090e+06,135.349948,5.08,5105.025000,ZBH,0.720673,1.004794,0.016942,0.945561,0.834571,0.230908,0.499998,5.000007e-01,1.404991e-06
32911,2023-09-30,0.00,3.709700e+05,274.414666,5.08,5053.623333,ZBRA,1.491979,2.802807,0.048128,0.502260,0.854987,0.732639,0.999969,2.603792e-05,5.176677e-06
32912,2023-09-30,0.00,5.042810e+06,28.125514,5.08,5105.025000,ZION,15.511452,1.181283,0.033125,0.458553,0.388889,-17.000000,1.000000,2.596820e-07,6.590589e-08


In [115]:
data['Tickers'] = data['Tickers'].astype('string')

data.dtypes

Date                  datetime64[ns]
Dividend                     float64
Volume                       float64
stock_price                  float64
fed_funds_rate               float64
GDP                          float64
Tickers               string[python]
debt_to_equity               float64
EPS                          float64
return_on_equity             float64
quick ratio                  float64
operating_ratio              float64
inventory_turnover           float64
pos_ma                       float64
neu_ma                       float64
neg_ma                       float64
dtype: object

In [116]:
data[data.select_dtypes(include=['float64']).columns] = data.select_dtypes(include = ['float64']).astype('float32')

In [117]:
data.dtypes

Date                  datetime64[ns]
Dividend                     float32
Volume                       float32
stock_price                  float32
fed_funds_rate               float32
GDP                          float32
Tickers               string[python]
debt_to_equity               float32
EPS                          float32
return_on_equity             float32
quick ratio                  float32
operating_ratio              float32
inventory_turnover           float32
pos_ma                       float32
neu_ma                       float32
neg_ma                       float32
dtype: object

## Train- Test- Split

In [118]:
# Split point based on the time period for testing
test_period = pd.DateOffset(months=12)

train_data = []
test_data = []

ticker_groups = data.groupby('Tickers')

for ticker, group in ticker_groups:
    # Sort the group data by date in ascending order
    group.sort_values(by='Date', inplace=True)
    
    # Calculate the split date for this ticker
    split_date = group['Date'].max() - test_period
    
    # Split the data for this ticker into training and testing sets
    train_group = group[group['Date'] < split_date]
    test_group = group[group['Date'] >= split_date]
    
    # Append the split data to the respective lists
    train_data.append(train_group)
    test_data.append(test_group)
    
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

In [119]:
X_train = train_df.drop(columns=['Dividend', 'Date'])
y_train = train_df['Dividend'].values

X_test = test_df.drop(columns=['Dividend', 'Date'])
y_test = test_df['Dividend'].values

In [121]:
scaler = MinMaxScaler()
# Fit the scaler to your training data and transform it
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include='float'))

# Transform your test data using the same scaler
X_test_scaled = scaler.transform(X_test.select_dtypes(include='float'))

## Model

In [130]:
max_features = 5  # Maximum number of words to get
max_len = 13  # Maximum sequence length
embedding_dim = 50  # Dimensionality of word embeddings

In [None]:
# Pad sequences to a fixed length
# x_train = sequence.pad_sequences(X_train, maxlen=float(max_len))
# x_test = sequence.pad_sequences(X_test, maxlen=float(max_len))

In [131]:
model = Sequential()

model.add(Embedding(max_features,embedding_dim, input_length= max_len))

model.add(GRU(64))

model.add(Dense(32,activation='relu'))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss= 'binary_crossentropy', optimizer= 'adam', metrics=['accuracy'], run_eagerly=True)


2023-09-26 21:52:55.873125: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-09-26 21:52:55.877654: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-09-26 21:52:55.880486: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [132]:
batch_size = 32
epochs = 100
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(X_train_scaled, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.History at 0x7f001baf5270>

In [133]:
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Test loss: 0.3937
Test accuracy: 0.7812


## Model Predicting

In [17]:
def plot_predictions(test,predicted):
    plt.plot(test, color='red',label='Real')
    plt.plot(predicted, color='blue',label='Predicted')
    plt.title('Stock Price Prediction')
    plt.xlabel('Time')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.show()

def return_rmse(test,predicted):
    rmse = math.sqrt(mean_squared_error(test, predicted))
    print("The root mean squared error is {}.".format(rmse))

In [18]:
y = data['stock_price']
X = data.drop(columns='stock_price')

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
X_train

Unnamed: 0,date,symbol,Dividend,volume,fed_funds_rate,gdp,debt_to_equity,eps,return_on_equity,quick_ratio,operating_ratio,inventory_turnover,sa_neu,sa_pos,sa_neg
2423,2019-07-01,WBA,0.00,5.415303e+06,2.40,4796.221000,1.863836,0.878151,0.033433,0.363991,0.683468,1.781430,0.293993,0.499660,0.206347
7090,2021-06-01,CDW,0.00,7.714133e+05,0.08,4907.879667,7.572559,1.981923,0.261346,1.202329,0.934571,4.797569,,,
12674,2023-02-01,XOM,0.91,1.661462e+07,4.57,5002.221667,0.831351,2.926529,0.060109,1.088859,0.723615,2.587264,0.338450,0.000039,0.661511
8483,2021-12-01,CPT,0.83,6.803367e+05,0.08,4940.888333,0.883988,2.060968,0.050720,1.727582,0.337403,14.382010,,,
5909,2020-11-01,NUE,0.00,1.913240e+06,0.09,4792.981333,0.834049,1.088756,0.031089,2.299682,0.909128,1.299067,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,2023-01-01,A,0.00,1.235647e+06,4.33,4950.820000,0.946693,1.189189,0.062756,1.532541,0.785304,0.709271,0.001703,0.993029,0.005268
5191,2020-08-01,JKHY,0.00,5.807800e+05,0.10,4748.625333,0.530804,1.061612,0.052570,1.663751,0.762497,-2.164578,,,
5390,2020-09-01,ATVI,0.00,6.269807e+06,0.09,4796.142667,0.500278,0.782383,0.041997,4.101509,0.628692,14.656250,,,
860,2018-12-01,VLO,0.00,4.786477e+06,2.27,4647.263000,1.314811,2.266667,0.043938,1.039071,1.001914,4.368953,0.693603,0.005750,0.300647
