In [13]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np


# Share of the chronologically ordered rows used for fitting; the remainder is held out.
TRAIN_FRACTION = 0.7
# A float in (0, 1) asks PCA to keep enough components to explain this share of variance.
PCA_EXPLAINED_VARIANCE = 0.95

data = pd.read_csv('combined_dataset.csv')

# Chronological split: the earliest rows train the model, the latest rows test it.
# read_csv is called without parse_dates, so 'Local Date' is not converted to
# datetimes on load. If it holds date strings, sorting the raw values is
# lexicographic (e.g. "10/01/2020" sorts before "9/01/2020"), which would mix
# future rows into the training window. The sort key parses the values only for
# ordering -- the column itself is left unchanged -- and raises on values that
# cannot be parsed instead of silently mis-ordering them.
# kind='stable' keeps rows that share a date in file order, so the split
# boundary is reproducible.
data_sorted = data.sort_values(by='Local Date', key=pd.to_datetime, kind='stable')
split_index = int(len(data_sorted) * TRAIN_FRACTION)
train_data = data_sorted.iloc[:split_index]
test_data = data_sorted.iloc[split_index:]


# Target is NASDAQ_Volatility; features are every other numeric column.
Y_train = train_data['NASDAQ_Volatility']
X_train = train_data.select_dtypes(include=[np.number]).drop(columns=['NASDAQ_Volatility'])

Y_test = test_data['NASDAQ_Volatility']
X_test = test_data.select_dtypes(include=[np.number]).drop(columns=['NASDAQ_Volatility'])

# Standardizing the features.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no statistics from the held-out period feed into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Applying PCA for dimensionality reduction (likewise fitted on training rows only).
pca = PCA(n_components=PCA_EXPLAINED_VARIANCE)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Ordinary least squares on the principal components.
model = LinearRegression()
model.fit(X_train_pca, Y_train)

Y_pred = model.predict(X_test_pca)


# Held-out evaluation: MSE of the predictions, the number of components PCA
# actually kept, and the regressor's score on the test set.
mse = mean_squared_error(Y_test, Y_pred)
n_components = pca.n_components_
r_squared = model.score(X_test_pca, Y_test)

print("Mean Squared Error:", mse)
print("Number of PCA Components:", n_components)
print("R_squared:", r_squared)


Mean Squared Error: 1.4009436075889294e-07
Number of PCA Components: 79
R_squared: 0.5307014416166755
