In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.api import STLForecast

In [9]:
### Securities Data

# Date Manipulation 

df = pd.read_csv('data/all_securities.csv', header = 0)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

start_date = '2022-01-01'
end_date_training = '2022-08-31'
start_date_testing= '2022-09-01'
end_date_testing = '2022-09-30'

training_data = df.loc[start_date:end_date_training]
testing_data = df.loc[start_date_testing:end_date_testing]

# Train & Fitt 

X_train=training_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close']]
X_test=testing_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close']]
y_train=training_data['DELL_Close']
y_test=testing_data['DELL_Close']

model = sm.OLS(y_train, X_train)
results = model.fit()

print(results.summary())

y_pred_test = results.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error on Test Data:", mse_test)

                                 OLS Regression Results                                
Dep. Variable:             DELL_Close   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                          1.201e+04
Date:                Mon, 22 Apr 2024   Prob (F-statistic):                   6.61e-192
Time:                        21:31:03   Log-Likelihood:                         -439.79
No. Observations:                 167   AIC:                                      885.6
Df Residuals:                     164   BIC:                                      894.9
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

In [31]:
import pandas as pd

# Load the all securities data and convert the date
df_ = pd.read_csv('data/all_securities.csv')
df_['Date'] = pd.to_datetime(df_['Date']).dt.date
df_.set_index('Date', inplace=True)

# Load the daily data and set the index
new_data = pd.read_csv('data/daily_data_v2.csv')
new_data['stock_date'] = pd.to_datetime(new_data['stock_date']).dt.date
new_data.set_index('stock_date', inplace=True)

# Join the new feature 'percent_pos' onto the original dataframe
df = df_.join(new_data[['percent_pos']], how='left')

# Define the training and testing period as date objects
start_date = pd.to_datetime('2022-01-01').date()
end_date_training = pd.to_datetime('2022-08-31').date()
start_date_testing = pd.to_datetime('2022-09-01').date()
end_date_testing = pd.to_datetime('2022-09-30').date()

# Split the data into training and testing sets using date filtering properly
training_data = df.loc[start_date:end_date_training]
testing_data = df.loc[start_date_testing:end_date_testing]

# Prepare the independent variables for regression
X_train = training_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos']].dropna()
X_test = testing_data[['VTSAX_Close', 'XLK_Close', 'SNP_Close', 'percent_pos']].dropna()
y_train = training_data['DELL_Close'].loc[X_train.index]  # Align with X_train to avoid NaN issues
y_test = testing_data['DELL_Close'].loc[X_test.index]     # Align with X_test to avoid NaN issues


# Fit the OLS regression model
model = sm.OLS(y_train, X_train)
results = model.fit()

# Print the summary of the model
print(results.summary())

# Predict using the model
y_pred_test = results.predict(X_test)

# Calculate and print the mean squared error on the test data
mse_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error on Test Data:", mse_test)

                                 OLS Regression Results                                
Dep. Variable:             DELL_Close   R-squared (uncentered):                   0.995
Model:                            OLS   Adj. R-squared (uncentered):              0.995
Method:                 Least Squares   F-statistic:                              8954.
Date:                Mon, 22 Apr 2024   Prob (F-statistic):                   7.79e-190
Time:                        21:58:42   Log-Likelihood:                         -439.78
No. Observations:                 167   AIC:                                      887.6
Df Residuals:                     163   BIC:                                      900.0
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

  return self._engine.is_monotonic_increasing
