In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl as px
from sklearn.model_selection import train_test_split

## Data Pre-processing

In [3]:
AAPL = pd.read_excel("./data/AAPL_EQUITY.xlsx")
SPESG = pd.read_excel(('./data/SPESG_Index_5Y.xlsx')).dropna()

# Take SPESG and AAPL and combine them into one data frame: df['Date', 'SPESG', 'AAPL_Px', 'AAPL_Volume', 'AAPL_SMAVG(15)']
df = pd.merge(AAPL, SPESG, on='Date', how='inner')
new_columns = {'Last Px_x' : 'AAPL_Px', 'Volume': 'AAPL_Volume(M)', 'SMAVG(15)' : 'AAPL_SMAVG15(M)', 'Last Px_y': 'SPESG'}
df.rename(columns=new_columns, inplace=True)

# Strip M and convert these columns from object into float64
df['AAPL_Volume(M)'] = df['AAPL_Volume(M)'].str.strip('M').astype('float64')
df['AAPL_SMAVG15(M)'] = df['AAPL_SMAVG15(M)'].str.strip('M').astype('float64')
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Quarter'] = df['Date'].dt.quarter
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.strftime('%A')

## Calculate returns

In [4]:
ret_period = [1, 5, 10, 30, 60, 120, 250]

for p in ret_period:
    df[f'AAPL_Px_ret{p}'] = ((df['AAPL_Px'] - df['AAPL_Px'].shift(-p)) / df['AAPL_Px'].shift(-p)).round(4)

df['AAPL_ret_f1'] = df['AAPL_Px_ret1'].shift(1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1274 entries, 0 to 1273
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             1274 non-null   datetime64[ns]
 1   AAPL_Px          1274 non-null   float64       
 2   AAPL_Volume(M)   1274 non-null   float64       
 3   AAPL_SMAVG15(M)  1260 non-null   float64       
 4   SPESG            1274 non-null   float64       
 5   Year             1274 non-null   int32         
 6   Quarter          1274 non-null   int32         
 7   Month            1274 non-null   int32         
 8   Day              1274 non-null   object        
 9   AAPL_Px_ret1     1273 non-null   float64       
 10  AAPL_Px_ret5     1269 non-null   float64       
 11  AAPL_Px_ret10    1264 non-null   float64       
 12  AAPL_Px_ret30    1244 non-null   float64       
 13  AAPL_Px_ret60    1214 non-null   float64       
 14  AAPL_Px_ret120   1154 non-null   float64

In [5]:
weekday_dummies = pd.get_dummies(df['Day']).astype(int)
df = pd.concat([df, weekday_dummies], axis=1)
df = df.dropna()
df.sort_values(by='Date', ascending=True, inplace=True)

X = df.drop(['Date', 'Day', 'Year', 'AAPL_ret_f1'], axis=1)
y = df['AAPL_ret_f1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
X_train.head()

Unnamed: 0,AAPL_Px,AAPL_Volume(M),AAPL_SMAVG15(M),SPESG,Quarter,Month,AAPL_Px_ret1,AAPL_Px_ret5,AAPL_Px_ret10,AAPL_Px_ret30,AAPL_Px_ret60,AAPL_Px_ret120,AAPL_Px_ret250,Friday,Monday,Thursday,Tuesday,Wednesday
627,148.85,56.095,65.534,393.81,4,10,-0.0031,-0.0027,0.0563,-0.0012,0.0101,0.1432,0.2908,0,0,0,0,1
476,145.38,88.57,99.805,359.14,2,6,-0.0386,0.0111,0.0585,-0.1264,-0.1078,-0.1899,0.1471,1,0,0,0,0
981,81.28,104.491,123.868,264.9,2,6,0.0055,0.022,0.0383,0.2115,0.2215,0.211,0.7553,0,0,0,0,1
856,119.05,169.41,100.92,309.03,4,11,0.0211,0.0146,-0.0018,0.0003,-0.0151,0.3496,0.8354,0,1,0,0,0
607,161.94,69.464,76.87,409.75,4,11,0.0033,0.0551,0.0948,0.1492,0.0666,0.2863,0.389,0,0,0,0,1


# Modeling
## Experimenting phase
### Linear Regression

In [6]:
# from sklearn import linear_model
# from sklearn.metrics import mean_squared_error, r2_score

# regressor = linear_model.LinearRegression()
# regressor.fit(X_train, y_train)

# # Training set
# y_pred_train = regressor.predict(X_train)
# r2_train = r2_score(y_train, y_pred_train).round(4)

# # Test set
# y_pred_test = regressor.predict(X_test)
# r2_test = r2_score(y_test, y_pred_test).round(4)

# print(f"r2_train: {r2_train}")
# print(f"r2_test: {r2_test}")

# def adjusted_r2_score(r_squared, n, k):
#     return (1 - ((1 - r_squared) * (n - 1) / (n - k - 1))).round(4)

# n_samples = X_train.shape[0]
# n_features = X_train.shape[1]

# adj_r2_train = adjusted_r2_score(r2_train, n_samples, n_features)
# adj_r2 = adjusted_r2_score(r2_test, n_samples, n_features)
# print("Adjusted R-squared on Training Data:", adj_r2_train)
# print("Adjusted R-squared on Test Data:", adj_r2)

### Bayesian Regression

In [7]:
# from sklearn.linear_model import BayesianRidge
# from sklearn.metrics import mean_squared_error, r2_score

# # Create Bayesian Ridge regression object
# regressor = BayesianRidge()

# # Fit the model
# regressor.fit(X_train, y_train)

# # Predict on training set
# y_pred_train = regressor.predict(X_train)
# r2_train = r2_score(y_train, y_pred_train)

# # Predict on test set
# y_pred_test = regressor.predict(X_test)
# r2_test = r2_score(y_test, y_pred_test)

# print(f"r2_train: {r2_train}")
# print(f"r2_test: {r2_test}")

# # Function to calculate adjusted R^2
# def adjusted_r2_score(r_squared, n, k):
#     return 1 - ((1 - r_squared) * (n - 1) / (n - k - 1))

# n_samples = X_train.shape[0]
# n_features = X_train.shape[1]

# adj_r2_train = adjusted_r2_score(r2_train, n_samples, n_features)
# adj_r2_test = adjusted_r2_score(r2_test, n_samples, n_features)
# print("Adjusted R-squared on Training Data:", adj_r2_train)
# print("Adjusted R-squared on Test Data:", adj_r2_test)


### Random Forest

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

X = df.drop(['Date', 'Day', 'Year', 'AAPL_ret_f1'], axis=1)
y = df['AAPL_ret_f1']


# Create Random Forest Regressor object
regressor = RandomForestRegressor()

cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='r2')

# Print cross-validation scores
print("Cross-Validation Scores:", cv_scores)

# Calculate mean and standard deviation of cross-validation scores
print("Mean R-squared:", cv_scores.mean())
print("Standard Deviation of R-squared:", cv_scores.std())

Cross-Validation Scores: [-0.0507099  -0.02806235 -0.30720464 -0.05822736 -0.14245818]
Mean R-squared: -0.11733248582526827
Standard Deviation of R-squared: 0.10255133019771644
