In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl as px
from sklearn.model_selection import train_test_split

AAPL = pd.read_excel("./data/AAPL_EQUITY.xlsx")
SPESG = pd.read_excel(('./data/SPESG_Index_5Y.xlsx')).dropna()

# Take SPESG and AAPL and combine them into one data frame: df['Date', 'SPESG', 'AAPL_Px', 'AAPL_Volume', 'AAPL_SMAVG(15)']
df = pd.merge(AAPL, SPESG, on='Date', how='inner')
new_columns = {'Last Px_x' : 'AAPL_Px', 'Volume': 'AAPL_Volume(M)', 'SMAVG(15)' : 'AAPL_SMAVG15(M)', 'Last Px_y': 'SPESG'}
df.rename(columns=new_columns, inplace=True)

# Strip M and convert these columns from object into float64
df['AAPL_Volume(M)'] = df['AAPL_Volume(M)'].str.strip('M').astype('float64').shift(-1)
df['AAPL_SMAVG15(M)'] = df['AAPL_SMAVG15(M)'].str.strip('M').astype('float64').shift(-1)
df['SPESG-1'] = df['SPESG'].shift(-1)
df['AAPL_Px_Shift-1'] = df['AAPL_Px'].shift(-1)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Quarter'] = df['Date'].dt.quarter
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.strftime('%A')

df['AAPL_ret'] = (df['AAPL_Px'] - df['AAPL_Px_Shift-1']) / df['AAPL_Px_Shift-1']
df['AAPL_ret_perc'] = (df['AAPL_ret'] * 100).round(3)

weekday_dummies = pd.get_dummies(df['Day']).astype(int)
df = pd.concat([df, weekday_dummies], axis=1)
df = df.dropna()
df.sort_values(by='Date', ascending=True, inplace=True)

# features = ['AAPL_Volume(M)', 'AAPL_SMAVG15(M)', 'SPESG', 'Month', 'Year', 'Day']
X = df.drop(['Date', 'AAPL_ret','AAPL_ret_perc', 'Day', 'SPESG', 'Year'], axis=1)
y = df['AAPL_ret_perc']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
X_train.shape

(629, 12)

In [32]:
print(y_train.shape, y_test.shape)

(629,) (630,)


In [33]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

regressor = linear_model.LinearRegression()
regressor.fit(X_train, y_train)

# Training set
y_pred_train = regressor.predict(X_train)
r2_train = r2_score(y_train, y_pred_train)

# Test set
y_pred_test = regressor.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)

print(f"r2_train: {r2_train}")
print(f"r2_test: {r2_test}")

def adjusted_r2_score(r_squared, n, k):
    return 1 - ((1 - r_squared) * (n - 1) / (n - k - 1))

n_samples = X_train.shape[0]
n_features = X_train.shape[1]

adj_r2_train = adjusted_r2_score(r2_train, n_samples, n_features)
adj_r2 = adjusted_r2_score(r2_test, n_samples, n_features)
print("Adjusted R-squared on Training Data:", adj_r2_train)
print("Adjusted R-squared on Test Data:", adj_r2)

r2_train: 0.8932114084227638
r2_test: 0.8669541871405926
Adjusted R-squared on Training Data: 0.8911311111842463
Adjusted R-squared on Test Data: 0.8643623855913833


In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Assuming y_true_train, y_pred_train, y_true_test, y_pred_test are your actual and predicted values
# for both the training and test sets

# Mean Absolute Error (MAE)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)

# Mean Squared Error (MSE)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

# Root Mean Squared Error (RMSE)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

# R2 Score
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("MAE - Train: {:.4f}, Test: {:.4f}".format(mae_train, mae_test))
print("MSE - Train: {:.4f}, Test: {:.4f}".format(mse_train, mse_test))
print("RMSE - Train: {:.4f}, Test: {:.4f}".format(rmse_train, rmse_test))
print("R2 Score - Train: {:.4f}, Test: {:.4f}".format(r2_train, r2_test))

MAE - Train: 0.3485, Test: 0.4202
MSE - Train: 0.3460, Test: 0.6378
RMSE - Train: 0.5883, Test: 0.7986
R2 Score - Train: 0.8932, Test: 0.8670


In [70]:
X_test.head()

Unnamed: 0,AAPL_Volume(M),AAPL_SMAVG15(M),SPESG-1,AAPL_Px_Shift-1,Year,Quarter,Month,Friday,Monday,Thursday,Tuesday,Wednesday
1182,146.19,141.621,237.25,50.688,2019,3,8,0,0,1,0,0
232,50.747,57.523,367.36,171.56,2023,2,5,0,0,0,0,1
1215,83.599,96.118,243.01,49.935,2019,2,6,1,0,0,0,0
592,131.063,120.621,410.26,179.3,2021,4,12,0,0,1,0,0
729,59.279,78.056,358.08,125.06,2021,2,6,0,0,1,0,0


In [71]:
MSFT = pd.read_excel('data/MSFT_EQUITY.xlsx')
MSFT['Volume'] = MSFT['Volume'].str.strip('M').astype('float64').shift(-1)
MSFT['SMAVG(15)'] = MSFT['SMAVG(15)'].str.strip('M').astype('float64').shift(-1)
MSFT['Date'] = pd.to_datetime(MSFT['Date'])
MSFT['Year'] = MSFT['Date'].dt.year
MSFT['Quarter'] = MSFT['Date'].dt.quarter
MSFT['Month'] = MSFT['Date'].dt.month
MSFT['Day'] = MSFT['Date'].dt.strftime("%A")
df = pd.merge(MSFT, SPESG, on='Date', how='inner')
new_columns = {'Last Px_x': 'AAPL_Px', 'Last Px_y': 'SPESG', 'SMAVG(15)':'AAPL_SMAVG15(M)', 'Volume':'AAPL_Volume(M)'}
df.columns = df.columns.str.strip()
df.rename(columns=new_columns, inplace=True)

df['AAPL_Px_Shift-1'] = df['AAPL_Px'].shift(-1)
df['SPESG-1'] = df['SPESG'].shift(-1)

weekday_dummies = pd.get_dummies(df['Day']).astype(int)
df = pd.concat([df, weekday_dummies], axis=1)
df = df.dropna()
df.sort_values(by='Date', ascending=True, inplace=True)
X = df.drop(['Date', 'AAPL_Px', 'Day', 'SPESG'], axis=1)
y = df['AAPL_Px']

# y_pred = regressor.predict(X)

In [72]:
# Define the feature names in the same order as they were used during model training
feature_names = ['AAPL_Px_Shift-1', 'SPESG-1', 'AAPL_SMAVG15(M)', 'AAPL_Volume(M)', 'Year', 'Quarter', 'Month', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']

# Create a new DataFrame with columns in the desired order
df_reordered = df[['Date', 'AAPL_Px', 'Day', 'SPESG'] + feature_names]

# Drop rows with NaN values
df_reordered = df_reordered.dropna()

# Sort the DataFrame by 'Date' column
df_reordered.sort_values(by='Date', ascending=True, inplace=True)

# Separate features (X) and target variable (y)
X_reordered = df_reordered.drop(['Date', 'AAPL_Px', 'Day', 'SPESG'], axis=1)
y_reordered = df_reordered['AAPL_Px']

# Make predictions using the reordered features
y_pred_reordered = regressor.predict(X_reordered)


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
