In [None]:
# Import necessary libraries
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Set paths for feature data and labels
features_path = '../Data/Features/'  # Path to the extracted features directory
features_files = os.listdir(features_path)
labels_filename = 'labels.csv'  # Adjust this if your labels file is different

# Choose the features file deterministically. os.listdir() returns entries in
# arbitrary order and the labels file lives in the same directory, so taking
# element [0] directly could load the labels (or a non-CSV entry such as a
# hidden file) as the feature matrix.
feature_csv_files = sorted(
    name for name in features_files
    if name.lower().endswith('.csv') and name != labels_filename
)
if not feature_csv_files:
    raise FileNotFoundError(
        f"No feature CSV file (other than {labels_filename}) found in {features_path}"
    )

# Load the features and labels data (adjust the file names accordingly)
feature_data = pd.read_csv(os.path.join(features_path, feature_csv_files[0]))  # Adjust if necessary
labels_data = pd.read_csv(os.path.join(features_path, labels_filename))

# Split the data into features (X) and labels (y)
# NOTE(review): rows of X and y are matched purely by position -- confirm both
# CSV files list the samples in the same order.
X = feature_data
y = labels_data['timing']  # Adjust column name for the timing or target label

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = xgb.XGBRegressor()

# Hyperparameter tuning using GridSearchCV: 2 x 2 x 2 = 8 candidates, each
# scored by 5-fold cross-validation on the training split only.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
}
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate the model on the held-out test split
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")

# Save the best model to the Models directory. Create the directory first so
# a missing folder does not throw away the finished training run.
os.makedirs('../Models', exist_ok=True)
# NOTE(review): joblib writes a pickle, so despite the .json extension this
# file must be read back with joblib.load(). best_model.save_model(...) would
# emit real XGBoost JSON, but switch only together with the loading code.
joblib.dump(best_model, '../Models/xgboost_model.json')  # Saving the model in the Models directory

# Visualize predictions vs actual
plt.scatter(y_test, y_pred)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('True Values vs Predictions')
plt.show()
