In [4]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np


data = pd.read_csv('combined_dataset.csv')

Y = data['NASDAQ_Volatility']
X = data.select_dtypes(include=[np.number]).drop(columns=['NASDAQ_Volatility'])

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Applying PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)


mse = mean_squared_error(Y_test, Y_pred)
n_components = pca.n_components_
r_squared = model.score(X_test, Y_test)

print("Mean Squared Error:", mse)
print("Number of PCA Components:", n_components)
print("R_squared:", r_squared)


Mean Squared Error: 1.8987710661478735e-07
Number of PCA Components: 80
R_squared: 0.5664678606131284
