In [1]:
import pandas as pd
import numpy as np
import joblib

import glob
from datetime import date
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [2]:
articles = pd.read_csv('../data/final/articles_with_llm_outputs_and_impact_score.csv')
articles = articles[['CID', 'Date', 'sentiment', 'stock_movement']]
articles['Date'] = pd.to_datetime(articles['Date'])
articles.loc[articles['stock_movement'] == 'Up', 'stock_movement'] = 1
articles.loc[articles['stock_movement'] == 'Neutral', 'stock_movement'] = 0
articles.loc[articles['stock_movement'] == 'Down', 'stock_movement'] = -1

predictions = [pd.read_csv(x) for x in glob.glob('../outputs/lstm_predictions/*.csv')]
lstm_preds = pd.concat(predictions)
lstm_preds['Date'] = pd.to_datetime(lstm_preds['Date'])

stock_df = pd.read_json('../data/raw/StockSeries.json', orient='records')
stock_df = stock_df[['CID', 'Date', 'Close']]

In [4]:
cids = articles['CID'].unique()
lstm_pred_lag = []
stock_pred_lag = []

for cid in cids:
    filter_df = lstm_preds[lstm_preds['CID'] == cid].sort_values(by='Date', ascending=False)
    filter_df = filter_df[filter_df['lstm_predictions'] != -1]

    preds = filter_df['lstm_predictions'].values[:-1]
    filter_df = filter_df.iloc[1:, :]
    filter_df['lstm_predictions'] = preds

    lstm_pred_lag.append(filter_df)    


    filter_df = stock_df[stock_df['CID'] == cid].sort_values(by='Date', ascending=False)
    close_val = filter_df['Close'].values[:-1]
    filter_df = filter_df.iloc[1:, :]
    filter_df['Close'] = close_val

    stock_pred_lag.append(filter_df)

lstm_preds = pd.concat(lstm_pred_lag)
stock_df = pd.concat(stock_pred_lag)

In [5]:
df_x = articles.merge(lstm_preds, on=['CID', 'Date']).merge(stock_df, on=['CID', 'Date'])
df_x = df_x.dropna(subset='stock_movement')

In [6]:
filter_df = df_x[df_x['CID'] == cid]
train = filter_df[filter_df['Date'].dt.date < date(2023, 8, 1)]
test = filter_df[filter_df['Date'].dt.date >= date(2023, 8, 1)]

scaler = MinMaxScaler(feature_range=(0, 1))

x_train = train.drop(columns=['Close', 'Date', 'CID']).values
x_test = test.drop(columns=['Close', 'Date', 'CID']).values

y_train = train['Close'].values
y_test = test['Close'].values

lstm_train = train['lstm_predictions'].values
lstm_test = test['lstm_predictions'].values

y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))

lstm_train = np.reshape(lstm_train, (-1, 1))
lstm_test = np.reshape(lstm_test, (-1, 1))

y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)

In [8]:
x_train

array([[-0.5, -1, 12.730692863464355],
       [0.7, -1, 12.973259925842283],
       [0.8, 1, 12.973259925842283],
       [-0.8, -1, 12.421753883361816],
       [0.8, 1, 17.23299789428711],
       [0.8, 1, 17.21965217590332],
       [0.8, 1, 17.21965217590332],
       [0.8, 1, 11.805337905883787],
       [0.8, 1, 12.225615501403809],
       [-0.8, -1, 14.177929878234863],
       [-0.8, -1, 11.283461570739746],
       [-0.8, -1, 16.181095123291016],
       [0.8, 1, 12.34268569946289],
       [-0.5, -1, 12.14716625213623],
       [-0.5, -1, 11.62037754058838],
       [-0.8, -1, 14.904192924499512],
       [0.5, 1, 12.123376846313477],
       [-0.8, -1, 12.123376846313477],
       [-0.7, -1, 11.904984474182127],
       [-0.8, -1, 11.904984474182127],
       [-0.8, -1, 11.973865509033203],
       [-0.5, -1, 11.422527313232422],
       [-0.5, -1, 11.484522819519045],
       [0.8, 1, 11.484522819519045],
       [0.8, 1, 11.559318542480469],
       [0.5, 1, 11.559318542480469],
       [-0.5, -

In [5]:
metrics = {
    'CID': [],
    'train_mae': [],
    'test_mae': [],
    'train_mse': [],
    'test_mse': [],
    'train_mape': [],
    'test_mape': [],

    'lstm_train_mae': [],
    'lstm_test_mae': [],
    'lstm_train_mse': [],
    'lstm_test_mse': [],
    'lstm_train_mape': [],
    'lstm_test_mape': [],
}
for cid in cids:
    filter_df = df_x[df_x['CID'] == cid]
    train = filter_df[filter_df['Date'].dt.date < date(2023, 8, 1)]
    test = filter_df[filter_df['Date'].dt.date >= date(2023, 8, 1)]

    scaler = MinMaxScaler(feature_range=(0, 1))

    x_train = train.drop(columns=['Close', 'Date', 'CID']).values
    x_test = test.drop(columns=['Close', 'Date', 'CID']).values

    y_train = train['Close'].values
    y_test = test['Close'].values

    lstm_train = train['lstm_predictions'].values
    lstm_test = test['lstm_predictions'].values

    y_train = np.reshape(y_train, (-1, 1))
    y_test = np.reshape(y_test, (-1, 1))

    lstm_train = np.reshape(lstm_train, (-1, 1))
    lstm_test = np.reshape(lstm_test, (-1, 1))

    y_train = scaler.fit_transform(y_train)
    y_test = scaler.transform(y_test)

    joblib.dump(scaler, f'../models/Ensemble/scalers/{cid}.pkl')    

    model = LinearRegression()
    model.fit(x_train, y_train)

    joblib.dump(model, f'../models/Ensemble/lr_models/{cid}.pkl')    

    # Making predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    y_train_pred = scaler.inverse_transform(y_train_pred)
    y_test_pred = scaler.inverse_transform(y_test_pred)

    test_mae = mean_absolute_error(scaler.inverse_transform(y_test), y_test_pred)
    test_mse = mean_squared_error(scaler.inverse_transform(y_test), y_test_pred, squared=False)
    train_mae = mean_absolute_error(scaler.inverse_transform(y_train), y_train_pred)
    train_mse = mean_squared_error(scaler.inverse_transform(y_train), y_train_pred, squared=False)
    print('Train Mean Absolute Error:', train_mae)
    print('Train Root Mean Squared Error:', train_mse)
    print('Test Mean Absolute Error:', test_mae)
    print('Test Root Mean Squared Error:', test_mse)

    train_mape = mean_absolute_percentage_error(scaler.inverse_transform(y_train), y_train_pred)
    test_mape = mean_absolute_percentage_error(scaler.inverse_transform(y_test), y_test_pred)
    print("Train MAPE: {}, Test MAPE: {}".format(train_mape, test_mape))

    metrics['CID'].append(cid)
    metrics['train_mae'].append(train_mae)
    metrics['test_mae'].append(test_mae)
    metrics['train_mse'].append(train_mse)
    metrics['test_mse'].append(test_mse)
    metrics['train_mape'].append(train_mape)
    metrics['test_mape'].append(test_mape)


    # LSTM PREDS
    print('\n\nLSTM PREDS')
    lstm_test_mae = mean_absolute_error(scaler.inverse_transform(y_test), lstm_test)
    lstm_test_mse = mean_squared_error(scaler.inverse_transform(y_test), lstm_test, squared=False)
    lstm_train_mae = mean_absolute_error(scaler.inverse_transform(y_train), lstm_train)
    lstm_train_mse = mean_squared_error(scaler.inverse_transform(y_train), lstm_train, squared=False)
    print('Train Mean Absolute Error:', train_mae)
    print('Train Root Mean Squared Error:', train_mse)
    print('Test Mean Absolute Error:', test_mae)
    print('Test Root Mean Squared Error:', test_mse)

    lstm_train_mape = mean_absolute_percentage_error(scaler.inverse_transform(y_train), lstm_train)
    lstm_test_mape = mean_absolute_percentage_error(scaler.inverse_transform(y_test), lstm_test)
    print("Train MAPE: {}, Test MAPE: {}".format(train_mape, test_mape))

    metrics['lstm_train_mae'].append(lstm_train_mae)
    metrics['lstm_test_mae'].append(lstm_test_mae)
    metrics['lstm_train_mse'].append(lstm_train_mse)
    metrics['lstm_test_mse'].append(lstm_test_mse)
    metrics['lstm_train_mape'].append(lstm_train_mape)
    metrics['lstm_test_mape'].append(lstm_test_mape)
pd.DataFrame(data=metrics).to_csv('../outputs/metrics/ensemble.csv', index=False)

Train Mean Absolute Error: 2.2019109740205134
Train Root Mean Squared Error: 2.849553478941283
Test Mean Absolute Error: 2.0232073104688517
Test Root Mean Squared Error: 2.3390159325302835
Train MAPE: 0.017121880045615396, Test MAPE: 0.01331597132732499


LSTM PREDS
Train Mean Absolute Error: 2.2019109740205134
Train Root Mean Squared Error: 2.849553478941283
Test Mean Absolute Error: 2.0232073104688517
Test Root Mean Squared Error: 2.3390159325302835
Train MAPE: 0.017121880045615396, Test MAPE: 0.01331597132732499
Train Mean Absolute Error: 1.2465707525504723
Train Root Mean Squared Error: 1.455285029570487
Test Mean Absolute Error: 1.7009416508492654
Test Root Mean Squared Error: 2.102733809733912
Train MAPE: 0.014541276223509288, Test MAPE: 0.021887628006766382


LSTM PREDS
Train Mean Absolute Error: 1.2465707525504723
Train Root Mean Squared Error: 1.455285029570487
Test Mean Absolute Error: 1.7009416508492654
Test Root Mean Squared Error: 2.102733809733912
Train MAPE: 0.0145412762

#### Analysis

In [6]:
res = pd.read_csv('../outputs/metrics/ensemble.csv')