In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [25]:
df = pd.read_csv('combined_dataset.csv')



df['Local Date'] = pd.to_datetime(df['Local Date'])
df = df.sort_values(by='Local Date')

# Assuming the dataset spans a considerable time range, we can pick a cut-off date
# For example, if we want to use the last 20% of dates for testing, we can do the following:
cut_off_index = int(len(df) * 0.8)
cut_off_date = df.iloc[cut_off_index]['Local Date']

# Splitting the data based on the cut-off date
train_df = df[df['Local Date'] < cut_off_date]
test_df = df[df['Local Date'] >= cut_off_date]

# Preparing the feature (X) and target (y) variables
X_train = train_df.drop(columns=['NASDAQ_Volatility', 'Local Date']).values
y_train = train_df['NASDAQ_Volatility'].values
X_test = test_df.drop(columns=['NASDAQ_Volatility', 'Local Date']).values
y_test = test_df['NASDAQ_Volatility'].values

In [28]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
}

# Initialize the grid search model
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), 
                           param_grid=param_grid, 
                           cv=3, 
                           n_jobs=-1, 
                           verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
best_params

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END .....................max_depth=10, n_estimators=100; total time= 2.2min
[CV] END .....................max_depth=10, n_estimators=100; total time= 2.2min
[CV] END .....................max_depth=10, n_estimators=100; total time= 2.2min
[CV] END .....................max_depth=10, n_estimators=200; total time= 4.3min
[CV] END .....................max_depth=10, n_estimators=200; total time= 4.3min
[CV] END .....................max_depth=10, n_estimators=200; total time= 4.3min
[CV] END .....................max_depth=20, n_estimators=100; total time= 3.3min
[CV] END .....................max_depth=20, n_estimators=100; total time= 3.3min
[CV] END .....................max_depth=10, n_estimators=300; total time= 6.2min
[CV] END .....................max_depth=10, n_estimators=300; total time= 6.4min
[CV] END .....................max_depth=20, n_estimators=100; total time= 3.1min
[CV] END .....................max_depth=10, n_est

{'max_depth': 10, 'n_estimators': 200}

In [29]:
y_pred = grid_search.predict(X_test)

In [30]:
r_squared = r2_score(y_test, y_pred)
print(f'R-squared on Test Set: {r_squared}')


R-squared on Test Set: 0.4105191253857984
