In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.api import STLForecast

In [47]:
### Securities Data

# Date Manipulation 

df = pd.read_csv('data/all_securities.csv', header = 0)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

start_date = '2022-01-01'
end_date_training = '2022-08-31'
start_date_testing= '2022-09-01'
end_date_testing = '2022-09-30'

training_data = df.loc[start_date:end_date_training]
testing_data = df.loc[start_date_testing:end_date_testing]

# Train & Fitt 

X_train=training_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close']]
X_test=testing_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close']]
y_train=training_data['DELL_Close']
y_test=testing_data['DELL_Close']

model = sm.OLS(y_train, X_train)
results = model.fit()

print(results.summary())

y_pred_test = results.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error on Test Data:", mse_test)

                                 OLS Regression Results                                
Dep. Variable:             DELL_Close   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                          1.201e+04
Date:                Thu, 25 Apr 2024   Prob (F-statistic):                   6.61e-192
Time:                        21:26:28   Log-Likelihood:                         -439.79
No. Observations:                 167   AIC:                                      885.6
Df Residuals:                     164   BIC:                                      894.9
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

# Pos Only - Day


In [48]:
import pandas as pd

df_ = pd.read_csv('data/all_securities.csv')
df_['Date'] = pd.to_datetime(df_['Date']).dt.date
df_.set_index('Date', inplace=True)

new_data = pd.read_csv('data/daily_data_v2.csv')
new_data['stock_date'] = pd.to_datetime(new_data['stock_date']).dt.date
new_data.set_index('stock_date', inplace=True)

df = df_.join(new_data[['percent_pos']], how='left')

start_date = pd.to_datetime('2022-01-01').date()
end_date_training = pd.to_datetime('2022-08-31').date()
start_date_testing = pd.to_datetime('2022-09-01').date()
end_date_testing = pd.to_datetime('2022-09-30').date()

training_data = df.loc[start_date:end_date_training]
testing_data = df.loc[start_date_testing:end_date_testing]

X_train = training_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos']].dropna()
X_test = testing_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos']].dropna()
y_train = training_data['DELL_Close'].loc[X_train.index] 
y_test = testing_data['DELL_Close'].loc[X_test.index]    

model = sm.OLS(y_train, X_train)
results = model.fit()

print(results.summary())

y_pred_test = results.predict(X_test)

mse_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error on Test Data:", mse_test)

                                 OLS Regression Results                                
Dep. Variable:             DELL_Close   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                              8954.
Date:                Thu, 25 Apr 2024   Prob (F-statistic):                   7.79e-190
Time:                        21:26:28   Log-Likelihood:                         -439.78
No. Observations:                 167   AIC:                                      887.6
Df Residuals:                     163   BIC:                                      900.0
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

  return self._engine.is_monotonic_increasing


# Pos and Neutral - Day

- notes oddly we see quite the jump in MSE

In [49]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

df_ = pd.read_csv('data/all_securities.csv')
df_['Date'] = pd.to_datetime(df_['Date']).dt.date
df_.set_index('Date', inplace=True)

new_data = pd.read_csv('data/daily_data_v2.csv')
new_data['stock_date'] = pd.to_datetime(new_data['stock_date']).dt.date
new_data.set_index('stock_date', inplace=True)

df = df_.join(new_data[['percent_pos', 'percent_neu']], how='left')

start_date = pd.to_datetime('2022-01-01').date()
end_date_training = pd.to_datetime('2022-08-31').date()
start_date_testing = pd.to_datetime('2022-09-01').date()
end_date_testing = pd.to_datetime('2022-09-30').date()

training_data = df.loc[start_date:end_date_training]
testing_data = df.loc[start_date_testing:end_date_testing]

X_train = training_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos', 'percent_neu']].dropna()
X_test = testing_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos', 'percent_neu']].dropna()
y_train = training_data['DELL_Close'].loc[X_train.index] 
y_test = testing_data['DELL_Close'].loc[X_test.index]    

model = sm.OLS(y_train, sm.add_constant(X_train)) 
results = model.fit()

print(results.summary())

y_pred_test = results.predict(sm.add_constant(X_test)) 

mse_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error on Test Data:", mse_test)


  return self._engine.is_monotonic_increasing


                            OLS Regression Results                            
Dep. Variable:             DELL_Close   R-squared:                       0.657
Model:                            OLS   Adj. R-squared:                  0.647
Method:                 Least Squares   F-statistic:                     61.75
Date:                Thu, 25 Apr 2024   Prob (F-statistic):           1.10e-35
Time:                        21:26:28   Log-Likelihood:                -435.14
No. Observations:                 167   AIC:                             882.3
Df Residuals:                     161   BIC:                             901.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          20.2176      7.151      2.827      

### Pos Neutral Weekly

In [72]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

df_ = pd.read_csv('data/all_securities.csv')
df_['Date'] = pd.to_datetime(df_['Date'])
df_['week'] = df_['Date'].dt.isocalendar().week  
df_.set_index('week', inplace=True)

new_data = pd.read_csv('data/weekly_data_v2.csv')
new_data.set_index('week', inplace=True)

df = df_.join(new_data[['percent_pos', 'percent_neu']], how='left')

start_week = pd.to_datetime('2022-01-10 16:00').isocalendar()[1]
end_week_training = pd.to_datetime('2022-08-31 16:00').isocalendar()[1]
start_week_testing = pd.to_datetime('2022-09-01 16:00').isocalendar()[1]
end_week_testing = pd.to_datetime('2022-09-09 16:00').isocalendar()[1]

training_data = df.loc[start_week:end_week_training]
testing_data = df.loc[start_week_testing:end_week_testing]

X_train = training_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos', 'percent_neu']].dropna()
y_train = training_data['DELL_Close'].dropna()
y_train = y_train.reindex(X_train.index)  

X_test = testing_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos', 'percent_neu']].dropna()
y_test = testing_data['DELL_Close'].dropna()
y_test = y_test.reindex(X_test.index)  

model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()

print(results.summary())

y_pred_test = results.predict(sm.add_constant(X_test))
mse_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error on Test Data:", mse_test)


                            OLS Regression Results                            
Dep. Variable:             DELL_Close   R-squared:                       0.670
Model:                            OLS   Adj. R-squared:                  0.660
Method:                 Least Squares   F-statistic:                     64.20
Date:                Thu, 25 Apr 2024   Prob (F-statistic):           2.65e-36
Time:                        21:51:56   Log-Likelihood:                -424.53
No. Observations:                 164   AIC:                             861.1
Df Residuals:                     158   BIC:                             879.7
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          23.3659      7.089      3.296      